import json
import argparse
from functools import reduce

import pandas as pd
parser = argparse.ArgumentParser(
    description="Collect metrics from MultiQC and Sentieon outputs and write QC tables for "
                "raw fastq, BAM and variant-calling (precision/recall) quality.")
parser.add_argument('-quality', '--quality_yield', type=str, help='*.quality_yield.txt')
parser.add_argument('-depth', '--wgs_metrics', type=str, help='*deduped_WgsMetricsAlgo.txt')
parser.add_argument('-aln', '--aln_metrics', type=str, help='*_deduped_aln_metrics.txt')
parser.add_argument('-is', '--is_metrics', type=str, help='*_deduped_is_metrics.txt')
parser.add_argument('-fastqc', '--fastqc', type=str, help='multiqc_fastqc.txt')
parser.add_argument('-fastqscreen', '--fastqscreen', type=str, help='multiqc_fastq_screen.txt')
parser.add_argument('-hap', '--happy', type=str, help='multiqc_happy_data.json', required=True)
parser.add_argument('-project', '--project_name', type=str, help='project_name')
args = parser.parse_args()
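
# Example invocation (file names are illustrative, and "qc_summary.py" is a
# hypothetical stand-in for whatever name this script is saved under):
#   python qc_summary.py \
#       -quality sampleA.quality_yield.txt \
#       -depth sampleA_deduped_WgsMetricsAlgo.txt \
#       -aln sampleA_deduped_aln_metrics.txt \
#       -is sampleA_deduped_is_metrics.txt \
#       -fastqc multiqc_fastqc.txt \
#       -fastqscreen multiqc_fastq_screen.txt \
#       -hap multiqc_happy_data.json \
#       -project my_project
# Only -hap is required; without -quality, only the variant-calling table is written.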
if args.quality_yield:
    # Rename inputs
    quality_yield_file = args.quality_yield
    wgs_metrics_file = args.wgs_metrics
    aln_metrics_file = args.aln_metrics
    is_metrics_file = args.is_metrics
    fastqc_file = args.fastqc
    fastqscreen_file = args.fastqscreen
    project_name = args.project_name
    #############################################
    # fastqc
    fastqc = pd.read_table(fastqc_file)

    # fastq_screen: keep only the per-species percentage columns
    dat = pd.read_table(fastqscreen_file)
    fastqscreen = dat.loc[:, dat.columns.str.endswith('percentage')].copy()
    fastqscreen.insert(loc=0, column='Sample',
                       value=dat['Sample'].str.replace('_screen', '', regex=False))

    # pre-alignment table
    pre_alignment_dat = pd.merge(fastqc, fastqscreen, how="outer", on='Sample')
    pre_alignment_dat['FastQC_mqc-generalstats-fastqc-total_sequences'] = \
        pre_alignment_dat['FastQC_mqc-generalstats-fastqc-total_sequences'] / 1000000
    del pre_alignment_dat['FastQC_mqc-generalstats-fastqc-percent_fails']
    del pre_alignment_dat['FastQC_mqc-generalstats-fastqc-avg_sequence_length']
    del pre_alignment_dat['ERCC percentage']
    del pre_alignment_dat['Phix percentage']
    del pre_alignment_dat['Mouse percentage']
    pre_alignment_dat = pre_alignment_dat.round(2)
    pre_alignment_dat.columns = ['Sample', '%Dup', '%GC', 'Total Sequences (million)',
                                 '%Human', '%EColi', '%Adapter', '%Vector', '%rRNA',
                                 '%Virus', '%Yeast', '%Mitoch', '%No hits']
    pre_alignment_dat.to_csv('pre_alignment.txt', sep="\t", index=False)
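
    # Note: the 'FastQC_mqc-generalstats-fastqc-*' headers above follow MultiQC's
    # general-stats column naming; if your MultiQC version emits different
    # headers, the del/rename steps in this block will need adjusting.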
    ############################
    # post-alignment: alignment, insert-size, base-quality and coverage metrics
    dat = pd.read_table(aln_metrics_file, index_col=False)
    dat['PCT_ALIGNED_READS'] = dat["PF_READS_ALIGNED"] / dat["TOTAL_READS"]
    aln_metrics = dat[["Sample", "PCT_ALIGNED_READS", "PF_MISMATCH_RATE"]].copy()
    # Express rates as percentages on the numeric columns only (multiplying the
    # whole frame would also "multiply" the Sample strings)
    aln_metrics[["PCT_ALIGNED_READS", "PF_MISMATCH_RATE"]] *= 100
    aln_metrics['Sample'] = [x[-1] for x in aln_metrics['Sample'].str.split('/')]

    dat = pd.read_table(is_metrics_file, index_col=False)
    is_metrics = dat[['Sample', 'MEDIAN_INSERT_SIZE']].copy()
    is_metrics['Sample'] = [x[-1] for x in is_metrics['Sample'].str.split('/')]

    dat = pd.read_table(quality_yield_file, index_col=False)
    dat['%Q20'] = dat['Q20_BASES'] / dat['TOTAL_BASES']
    dat['%Q30'] = dat['Q30_BASES'] / dat['TOTAL_BASES']
    quality_yield = dat[['Sample', '%Q20', '%Q30']].copy()
    quality_yield[['%Q20', '%Q30']] *= 100
    quality_yield['Sample'] = [x[-1] for x in quality_yield['Sample'].str.split('/')]

    dat = pd.read_table(wgs_metrics_file, index_col=False)
    wgs_metrics = dat[['Sample', 'MEDIAN_COVERAGE', 'PCT_1X', 'PCT_5X', 'PCT_10X', 'PCT_30X']].copy()
    wgs_metrics[['PCT_1X', 'PCT_5X', 'PCT_10X', 'PCT_30X']] *= 100
    wgs_metrics['Sample'] = [x[-1] for x in wgs_metrics['Sample'].str.split('/')]

    data_frames = [aln_metrics, is_metrics, quality_yield, wgs_metrics]
    post_alignment_dat = reduce(lambda left, right: pd.merge(left, right, on='Sample', how='outer'),
                                data_frames)
    # MEDIAN_INSERT_SIZE is a median, not "Mendelian"
    post_alignment_dat.columns = ['Sample', '%Mapping', '%Mismatch Rate', 'Median Insert Size',
                                  '%Q20', '%Q30', 'Median Coverage',
                                  'PCT_1X', 'PCT_5X', 'PCT_10X', 'PCT_30X']
    post_alignment_dat = post_alignment_dat.round(2)
    post_alignment_dat.to_csv('post_alignment.txt', sep="\t", index=False)
#########################################
# variant calling: hap.py precision/recall benchmark. This runs whether or not
# --quality_yield was given, which is why --happy is the only required argument
# (previously the same block was duplicated in both branches of the if/else).
hap_file = args.happy
with open(hap_file) as hap_json:
    happy = json.load(hap_json)
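
# Expected shape of multiqc_happy_data.json, inferred from the indexing below
# rather than from hap.py documentation: a mapping of "<sample>_<SNP|INDEL>_ALL"
# keys to metric dicts, e.g.
#   {"NA12878_SNP_ALL": {"sample_id": "NA12878", "QUERY.TOTAL": 3951812,
#                        "METRIC.Precision": 0.9995, "METRIC.Recall": 0.9991}, ...}
# ("NA12878" and the values are hypothetical.)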
dat = pd.DataFrame.from_records(happy)
# Keep only the whole-genome ("ALL") stratification columns
dat = dat.loc[:, dat.columns.str.endswith('ALL')]
dat_transposed = dat.T
dat_transposed = dat_transposed.loc[:, ['sample_id', 'QUERY.TOTAL', 'METRIC.Precision', 'METRIC.Recall']]
# Split into INDEL and SNV rows, then line them up side by side per sample
indel = dat_transposed[['INDEL' in s for s in dat_transposed.index]].reset_index(drop=True)
snv = dat_transposed[['SNP' in s for s in dat_transposed.index]].reset_index(drop=True)
benchmark = pd.concat([snv, indel], axis=1)
# After the concat every column name appears twice (SNV copy first, INDEL copy
# second); selecting by name keeps both copies in that order
benchmark = benchmark[['sample_id', 'QUERY.TOTAL', 'METRIC.Precision', 'METRIC.Recall']]
benchmark.columns = ['Sample', 'sample_id', 'SNV number', 'INDEL number',
                     'SNV precision', 'INDEL precision', 'SNV recall', 'INDEL recall']
benchmark = benchmark[['Sample', 'SNV number', 'INDEL number',
                       'SNV precision', 'INDEL precision', 'SNV recall', 'INDEL recall']]
# Convert precision/recall fractions to percentages
metric_cols = ['SNV precision', 'INDEL precision', 'SNV recall', 'INDEL recall']
benchmark[metric_cols] = benchmark[metric_cols].astype(float) * 100
benchmark = benchmark.round(2)
benchmark.to_csv('variants.calling.qc.txt', sep="\t", index=False)
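
# Outputs (written to the current working directory):
#   pre_alignment.txt        raw fastq QC table              (only with --quality_yield)
#   post_alignment.txt       BAM-level QC table              (only with --quality_yield)
#   variants.calling.qc.txt  SNV/INDEL precision and recall  (always)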