No puede seleccionar más de 25 temas Los temas deben comenzar con una letra o número, pueden incluir guiones ('-') y pueden tener hasta 35 caracteres de largo.

103 líneas
3.4KB

  1. import pandas as pd
  2. from functools import reduce
  3. import sys, argparse, os
  4. import glob
  5. parser = argparse.ArgumentParser(description="This script is to get information from multiqc and sentieon, output the raw fastq, bam and variants calling (precision and recall) quality metrics")
  6. parser.add_argument('-pre', '--pre_alignment_path', type=str, help='pre-alignment directory')
  7. parser.add_argument('-post', '--post_alignment_path', type=str, help='post-alignment directory')
  8. parser.add_argument('-vcf', '--benchmark_path', type=str, help='benchmark directory')
  9. parser.add_argument('-men', '--mendelian_path', type=str, help='mendelian directory')
  10. args = parser.parse_args()
  11. # Rename input:
  12. pre_alignment_path = args.pre_alignment_path
  13. post_alignment_path = args.post_alignment_path
  14. benchmark_path = args.benchmark_path
  15. mendelian_path = args.mendelian_path
  16. ###pre-alignment
  17. pre_alignment_df = pd.concat(map(pd.read_table,glob.glob(os.path.join(pre_alignment_path,'*.txt'))))
  18. pre_alignment_df.to_csv('pre-alignment-all.txt',sep="\t",index=0)
  19. ###post-alignment
  20. post_alignment_df = pd.concat(map(pd.read_table,glob.glob(os.path.join(post_alignment_path,'*.txt'))))
  21. post_alignment_df.to_csv('post-alignment-all.txt',sep="\t",index=0)
  22. ###variant-calling qc
  23. ## detail
  24. variant_calling_df = pd.concat(map(pd.read_table,glob.glob(os.path.join(benchmark_path,'*.txt'))))
  25. variant_calling_df.to_csv('variant-calling-all.txt',sep="\t",index=0)
  26. ## mean + sd
  27. ####snv
  28. a = variant_calling_df["SNV precision"].mean().round(2)
  29. b = variant_calling_df["SNV precision"].std().round(2)
  30. c = variant_calling_df["SNV recall"].mean().round(2)
  31. d = variant_calling_df["SNV precision"].std().round(2)
  32. variant_calling_df["SNV F1 score"] = 2 * variant_calling_df["SNV precision"] * variant_calling_df["SNV recall"] / (variant_calling_df["SNV precision"] + variant_calling_df["SNV recall"])
  33. e = variant_calling_df["SNV F1 score"].mean().round(2)
  34. f = variant_calling_df["SNV F1 score"].std().round(2)
  35. #### indel
  36. a2 = variant_calling_df["INDEL precision"].mean().round(2)
  37. b2 = variant_calling_df["INDEL precision"].std().round(2)
  38. c2 = variant_calling_df["INDEL recall"].mean().round(2)
  39. d2 = variant_calling_df["INDEL precision"].std().round(2)
  40. variant_calling_df["INDEL F1 score"] = 2 * variant_calling_df["INDEL precision"] * variant_calling_df["INDEL recall"] / (variant_calling_df["INDEL precision"] + variant_calling_df["INDEL recall"])
  41. e2 = variant_calling_df["INDEL F1 score"].mean().round(2)
  42. f2 = variant_calling_df["INDEL F1 score"].std().round(2)
  43. data = {'precision':[str(a),str(a2)], 'precision_sd':[str(b),str(b2)], 'recall':[str(c),str(c2)], 'recall_sd':[str(d),str(d2)], 'F1-score':[str(e),str(e2)], 'F1-score_sd':[str(f),str(f2)] }
  44. df = pd.DataFrame(data, index =['SNV', 'INDEL'])
  45. df.to_csv('benchmark_summary.txt',sep="\t")
  46. ### Mendelian
  47. mendelian_df = pd.concat(map(pd.read_table,glob.glob(os.path.join(mendelian_path,'*.txt'))))
  48. mendelian_df.to_csv('mendelian-all.txt',sep="\t",index=0)
  49. ### snv
  50. snv = mendelian_df[mendelian_df['Family'].str.contains("SNV")]
  51. indel = mendelian_df[mendelian_df['Family'].str.contains("INDEL")]
  52. g = snv["Mendelian_Concordance_Rate"].mean().round(2)
  53. h = snv["Mendelian_Concordance_Rate"].std().round(2)
  54. g2 = indel["Mendelian_Concordance_Rate"].mean().round(2)
  55. h2 = indel["Mendelian_Concordance_Rate"].std().round(2)
  56. data = {'MCR':[str(g),str(g2)], 'MCR_sd':[str(h),str(h2)]}
  57. df = pd.DataFrame(data, index =['SNV', 'INDEL'])
  58. df.to_csv('mendelian_summary.txt',sep="\t")