You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

103 lines
3.4KB

  1. import pandas as pd
  2. from functools import reduce
  3. import sys, argparse, os
  4. import glob
  5. parser = argparse.ArgumentParser(description="This script is to get information from multiqc and sentieon, output the raw fastq, bam and variants calling (precision and recall) quality metrics")
  6. parser.add_argument('-pre', '--pre_alignment_path', type=str, help='pre-alignment directory')
  7. parser.add_argument('-post', '--post_alignment_path', type=str, help='post-alignment directory')
  8. parser.add_argument('-vcf', '--benchmark_path', type=str, help='benchmark directory')
  9. parser.add_argument('-men', '--mendelian_path', type=str, help='mendelian directory')
  10. args = parser.parse_args()
  11. # Rename input:
  12. pre_alignment_path = args.pre_alignment_path
  13. post_alignment_path = args.post_alignment_path
  14. benchmark_path = args.benchmark_path
  15. mendelian_path = args.mendelian_path
  16. ###pre-alignment
  17. pre_alignment_df = pd.concat(map(pd.read_table,glob.glob(os.path.join(pre_alignment_path,'*.txt'))))
  18. pre_alignment_df.to_csv('pre-alignment-all.txt',sep="\t",index=0)
  19. ###post-alignment
  20. post_alignment_df = pd.concat(map(pd.read_table,glob.glob(os.path.join(post_alignment_path,'*.txt'))))
  21. post_alignment_df.to_csv('post-alignment-all.txt',sep="\t",index=0)
  22. ###variant-calling qc
  23. ## detail
  24. variant_calling_df = pd.concat(map(pd.read_table,glob.glob(os.path.join(benchmark_path,'*.txt'))))
  25. variant_calling_df.to_csv('variant-calling-all.txt',sep="\t",index=0)
  26. ## mean + sd
  27. ####snv
  28. a = variant_calling_df["SNV precision"].mean().round(2)
  29. b = variant_calling_df["SNV precision"].std().round(2)
  30. c = variant_calling_df["SNV recall"].mean().round(2)
  31. d = variant_calling_df["SNV precision"].std().round(2)
  32. variant_calling_df["SNV F1 score"] = 2 * variant_calling_df["SNV precision"] * variant_calling_df["SNV recall"] / (variant_calling_df["SNV precision"] + variant_calling_df["SNV recall"])
  33. e = variant_calling_df["SNV F1 score"].mean().round(2)
  34. f = variant_calling_df["SNV F1 score"].std().round(2)
  35. #### indel
  36. a2 = variant_calling_df["INDEL precision"].mean().round(2)
  37. b2 = variant_calling_df["INDEL precision"].std().round(2)
  38. c2 = variant_calling_df["INDEL recall"].mean().round(2)
  39. d2 = variant_calling_df["INDEL precision"].std().round(2)
  40. variant_calling_df["INDEL F1 score"] = 2 * variant_calling_df["INDEL precision"] * variant_calling_df["INDEL recall"] / (variant_calling_df["INDEL precision"] + variant_calling_df["INDEL recall"])
  41. e2 = variant_calling_df["INDEL F1 score"].mean().round(2)
  42. f2 = variant_calling_df["INDEL F1 score"].std().round(2)
  43. data = {'precision':[str(a),str(a2)], 'precision_sd':[str(b),str(b2)], 'recall':[str(c),str(c2)], 'recall_sd':[str(d),str(d2)], 'F1-score':[str(e),str(e2)], 'F1-score_sd':[str(f),str(f2)] }
  44. df = pd.DataFrame(data, index =['SNV', 'INDEL'])
  45. df.to_csv('benchmark_summary.txt',sep="\t")
  46. ### Mendelian
  47. mendelian_df = pd.concat(map(pd.read_table,glob.glob(os.path.join(mendelian_path,'*.txt'))))
  48. mendelian_df.to_csv('mendelian-all.txt',sep="\t",index=0)
  49. ### snv
  50. snv = mendelian_df[mendelian_df['Family'].str.contains("SNV")]
  51. indel = mendelian_df[mendelian_df['Family'].str.contains("INDEL")]
  52. g = snv["Mendelian_Concordance_Rate"].mean().round(2)
  53. h = snv["Mendelian_Concordance_Rate"].std().round(2)
  54. g2 = indel["Mendelian_Concordance_Rate"].mean().round(2)
  55. h2 = indel["Mendelian_Concordance_Rate"].std().round(2)
  56. data = {'MCR':[str(g),str(g2)], 'MCR_sd':[str(h),str(h2)]}
  57. df = pd.DataFrame(data, index =['SNV', 'INDEL'])
  58. df.to_csv('mendelian_summary.txt',sep="\t")