Du kannst nicht mehr als 25 Themen auswählen Themen müssen entweder mit einem Buchstaben oder einer Ziffer beginnen. Sie können Bindestriche („-“) enthalten und bis zu 35 Zeichen lang sein.

103 Zeilen
3.4KB

  1. import pandas as pd
  2. from functools import reduce
  3. import sys, argparse, os
  4. import glob
  5. parser = argparse.ArgumentParser(description="This script is to get information from multiqc and sentieon, output the raw fastq, bam and variants calling (precision and recall) quality metrics")
  6. parser.add_argument('-pre', '--pre_alignment_path', type=str, help='pre-alignment directory')
  7. parser.add_argument('-post', '--post_alignment_path', type=str, help='post-alignment directory')
  8. parser.add_argument('-vcf', '--benchmark_path', type=str, help='benchmark directory')
  9. parser.add_argument('-men', '--mendelian_path', type=str, help='mendelian directory')
  10. args = parser.parse_args()
  11. # Rename input:
  12. pre_alignment_path = args.pre_alignment_path
  13. post_alignment_path = args.post_alignment_path
  14. benchmark_path = args.benchmark_path
  15. mendelian_path = args.mendelian_path
  16. ###pre-alignment
  17. pre_alignment_df = pd.concat(map(pd.read_table,glob.glob(os.path.join(pre_alignment_path,'*.txt'))))
  18. pre_alignment_df.to_csv('pre-alignment-all.txt',sep="\t",index=0)
  19. ###post-alignment
  20. post_alignment_df = pd.concat(map(pd.read_table,glob.glob(os.path.join(post_alignment_path,'*.txt'))))
  21. post_alignment_df.to_csv('post-alignment-all.txt',sep="\t",index=0)
  22. ###variant-calling qc
  23. ## detail
  24. variant_calling_df = pd.concat(map(pd.read_table,glob.glob(os.path.join(benchmark_path,'*.txt'))))
  25. variant_calling_df.to_csv('variant-calling-all.txt',sep="\t",index=0)
  26. ## mean + sd
  27. ####snv
  28. a = variant_calling_df["SNV precision"].mean().round(2)
  29. b = variant_calling_df["SNV precision"].std().round(2)
  30. c = variant_calling_df["SNV recall"].mean().round(2)
  31. d = variant_calling_df["SNV precision"].std().round(2)
  32. variant_calling_df["SNV F1 score"] = 2 * variant_calling_df["SNV precision"] * variant_calling_df["SNV recall"] / (variant_calling_df["SNV precision"] + variant_calling_df["SNV recall"])
  33. e = variant_calling_df["SNV F1 score"].mean().round(2)
  34. f = variant_calling_df["SNV F1 score"].std().round(2)
  35. #### indel
  36. a2 = variant_calling_df["INDEL precision"].mean().round(2)
  37. b2 = variant_calling_df["INDEL precision"].std().round(2)
  38. c2 = variant_calling_df["INDEL recall"].mean().round(2)
  39. d2 = variant_calling_df["INDEL precision"].std().round(2)
  40. variant_calling_df["INDEL F1 score"] = 2 * variant_calling_df["INDEL precision"] * variant_calling_df["INDEL recall"] / (variant_calling_df["INDEL precision"] + variant_calling_df["INDEL recall"])
  41. e2 = variant_calling_df["INDEL F1 score"].mean().round(2)
  42. f2 = variant_calling_df["INDEL F1 score"].std().round(2)
  43. data = {'precision':[str(a),str(a2)], 'precision_sd':[str(b),str(b2)], 'recall':[str(c),str(c2)], 'recall_sd':[str(d),str(d2)], 'F1-score':[str(e),str(e2)], 'F1-score_sd':[str(f),str(f2)] }
  44. df = pd.DataFrame(data, index =['SNV', 'INDEL'])
  45. df.to_csv('benchmark_summary.txt',sep="\t")
  46. ### Mendelian
  47. mendelian_df = pd.concat(map(pd.read_table,glob.glob(os.path.join(mendelian_path,'*.txt'))))
  48. mendelian_df.to_csv('mendelian-all.txt',sep="\t",index=0)
  49. ### snv
  50. snv = mendelian_df[mendelian_df['Family'].str.contains("SNV")]
  51. indel = mendelian_df[mendelian_df['Family'].str.contains("INDEL")]
  52. g = snv["Mendelian_Concordance_Rate"].mean().round(2)
  53. h = snv["Mendelian_Concordance_Rate"].std().round(2)
  54. g2 = indel["Mendelian_Concordance_Rate"].mean().round(2)
  55. h2 = indel["Mendelian_Concordance_Rate"].std().round(2)
  56. data = {'MCR':[str(g),str(g2)], 'MCR_sd':[str(h),str(h2)]}
  57. df = pd.DataFrame(data, index =['SNV', 'INDEL'])
  58. df.to_csv('mendelian_summary.txt',sep="\t")