You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

152 lines
9.2KB

  1. import json
  2. import pandas as pd
  3. from functools import reduce
  4. import sys, argparse, os
  5. parser = argparse.ArgumentParser(description="This script is to get information from multiqc and sentieon, output the raw fastq, bam and variants calling (precision and recall) quality metrics")
  6. parser.add_argument('-quality', '--quality_yield', type=str, help='*.quality_yield.txt')
  7. parser.add_argument('-depth', '--wgs_metrics', type=str, help='*deduped_WgsMetricsAlgo.txt')
  8. parser.add_argument('-aln', '--aln_metrics', type=str, help='*_deduped_aln_metrics.txt')
  9. parser.add_argument('-is', '--is_metrics', type=str, help='*_deduped_is_metrics.txt')
  10. parser.add_argument('-hs', '--hs_metrics', type=str, help='*_deduped_hs_metrics.txt')
  11. parser.add_argument('-fastqc', '--fastqc', type=str, help='multiqc_fastqc.txt')
  12. parser.add_argument('-fastqscreen', '--fastqscreen', type=str, help='multiqc_fastq_screen.txt')
  13. parser.add_argument('-hap', '--happy', type=str, help='multiqc_happy_data.json', required=True)
  14. parser.add_argument('-project', '--project_name', type=str, help='project_name')
  15. args = parser.parse_args()
  16. if args.quality_yield:
  17. # Rename input:
  18. quality_yield_file = args.quality_yield
  19. wgs_metrics_file = args.wgs_metrics
  20. aln_metrics_file = args.aln_metrics
  21. is_metrics_file = args.is_metrics
  22. hs_metrics_file = args.hs_metrics
  23. fastqc_file = args.fastqc
  24. fastqscreen_file = args.fastqscreen
  25. hap_file = args.happy
  26. project_name = args.project_name
  27. #############################################
  28. # fastqc
  29. fastqc = pd.read_table(fastqc_file)
  30. # fastqscreen
  31. dat = pd.read_table(fastqscreen_file)
  32. fastqscreen = dat.loc[:, dat.columns.str.endswith('percentage')]
  33. dat['Sample'] = [i.replace('_screen','') for i in dat['Sample']]
  34. fastqscreen.insert(loc=0, column='Sample', value=dat['Sample'])
  35. # pre-alignment
  36. pre_alignment_dat = pd.merge(fastqc,fastqscreen,how="outer",left_on=['Sample'],right_on=['Sample'])
  37. pre_alignment_dat['FastQC_mqc-generalstats-fastqc-total_sequences'] = pre_alignment_dat['FastQC_mqc-generalstats-fastqc-total_sequences']/1000000
  38. del pre_alignment_dat['FastQC_mqc-generalstats-fastqc-percent_fails']
  39. del pre_alignment_dat['FastQC_mqc-generalstats-fastqc-avg_sequence_length']
  40. del pre_alignment_dat['ERCC percentage']
  41. del pre_alignment_dat['Phix percentage']
  42. del pre_alignment_dat['Mouse percentage']
  43. pre_alignment_dat = pre_alignment_dat.round(2)
  44. pre_alignment_dat.columns = ['Sample','%Dup','%GC','Total Sequences (million)','%Human','%EColi','%Adapter','%Vector','%rRNA','%Virus','%Yeast','%Mitoch','%No hits']
  45. pre_alignment_dat.to_csv('pre_alignment.txt',sep="\t",index=0)
  46. ############################
  47. dat = pd.read_table(aln_metrics_file,index_col=False)
  48. dat['PCT_ALIGNED_READS'] = dat["PF_READS_ALIGNED"]/dat["TOTAL_READS"]
  49. aln_metrics = dat[["Sample", "PCT_ALIGNED_READS","PF_MISMATCH_RATE"]]
  50. aln_metrics = aln_metrics * 100
  51. aln_metrics['Sample'] = [x[-1] for x in aln_metrics['Sample'].str.split('/')]
  52. dat = pd.read_table(is_metrics_file,index_col=False)
  53. is_metrics = dat[['Sample', 'MEDIAN_INSERT_SIZE']]
  54. is_metrics['Sample'] = [x[-1] for x in is_metrics['Sample'].str.split('/')]
  55. dat = pd.read_table(quality_yield_file,index_col=False)
  56. dat['%Q20'] = dat['Q20_BASES']/dat['TOTAL_BASES']
  57. dat['%Q30'] = dat['Q30_BASES']/dat['TOTAL_BASES']
  58. quality_yield = dat[['Sample','%Q20','%Q30']]
  59. quality_yield = quality_yield * 100
  60. quality_yield['Sample'] = [x[-1] for x in quality_yield['Sample'].str.split('/')]
  61. dat = pd.read_table(wgs_metrics_file,index_col=False)
  62. wgs_metrics = dat[['Sample','MEDIAN_COVERAGE','PCT_1X', 'PCT_5X', 'PCT_10X','PCT_20X','PCT_30X']]
  63. wgs_metrics['PCT_1X'] = wgs_metrics['PCT_1X'] * 100
  64. wgs_metrics['PCT_5X'] = wgs_metrics['PCT_5X'] * 100
  65. wgs_metrics['PCT_10X'] = wgs_metrics['PCT_10X'] * 100
  66. wgs_metrics['PCT_20X'] = wgs_metrics['PCT_20X'] * 100
  67. wgs_metrics['PCT_30X'] = wgs_metrics['PCT_30X'] * 100
  68. wgs_metrics['Sample'] = [x[-1] for x in wgs_metrics['Sample'].str.split('/')]
  69. dat = pd.read_table(hs_metrics_file,index_col=False)
  70. hs_metrics = dat[['Sample','FOLD_80_BASE_PENALTY','PCT_USABLE_BASES_ON_TARGET']]
  71. data_frames = [aln_metrics, is_metrics, quality_yield, wgs_metrics, hs_metrics]
  72. post_alignment_dat = reduce(lambda left,right: pd.merge(left,right,on=['Sample'],how='outer'), data_frames)
  73. post_alignment_dat.columns = ['Sample', '%Mapping', '%Mismatch Rate', 'Mendelian Insert Size','%Q20', '%Q30', 'Median Coverage', 'PCT_1X', 'PCT_5X', 'PCT_10X','PCT_20X','PCT_30X','Fold-80','On target bases rate']
  74. post_alignment_dat = post_alignment_dat.round(2)
  75. post_alignment_dat.to_csv('post_alignment.txt',sep="\t",index=0)
  76. #########################################
  77. # variants calling
  78. with open(hap_file) as hap_json:
  79. happy = json.load(hap_json)
  80. dat =pd.DataFrame.from_records(happy)
  81. dat = dat.loc[:, dat.columns.str.endswith('ALL')]
  82. dat_transposed = dat.T
  83. dat_transposed = dat_transposed.loc[:,['sample_id','TRUTH.FN','QUERY.TOTAL','QUERY.FP','QUERY.UNK','METRIC.Precision','METRIC.Recall','METRIC.F1_Score']]
  84. dat_transposed['QUERY.TP'] = dat_transposed['QUERY.TOTAL'].astype(int) - dat_transposed['QUERY.UNK'].astype(int) - dat_transposed['QUERY.FP'].astype(int)
  85. dat_transposed['QUERY'] =dat_transposed['QUERY.TOTAL'].astype(int) - dat_transposed['QUERY.UNK'].astype(int)
  86. indel = dat_transposed[['INDEL' in s for s in dat_transposed.index]]
  87. snv = dat_transposed[['SNP' in s for s in dat_transposed.index]]
  88. indel.reset_index(drop=True, inplace=True)
  89. snv.reset_index(drop=True, inplace=True)
  90. benchmark = pd.concat([snv, indel], axis=1)
  91. benchmark = benchmark[["sample_id", 'QUERY.TOTAL', 'QUERY','QUERY.TP','QUERY.FP','TRUTH.FN','METRIC.Precision', 'METRIC.Recall','METRIC.F1_Score']]
  92. benchmark.columns = ['Sample','sample_id','SNV number','INDEL number','SNV query','INDEL query','SNV TP','INDEL TP','SNV FP','INDEL FP','SNV FN','INDEL FN','SNV precision','INDEL precision','SNV recall','INDEL recall','SNV F1','INDEL F1']
  93. benchmark = benchmark[['Sample','SNV number','INDEL number','SNV query','INDEL query','SNV TP','INDEL TP','SNV FP','INDEL FP','SNV FN','INDEL FN','SNV precision','INDEL precision','SNV recall','INDEL recall','SNV F1','INDEL F1']]
  94. benchmark['SNV precision'] = benchmark['SNV precision'].astype(float)
  95. benchmark['INDEL precision'] = benchmark['INDEL precision'].astype(float)
  96. benchmark['SNV recall'] = benchmark['SNV recall'].astype(float)
  97. benchmark['INDEL recall'] = benchmark['INDEL recall'].astype(float)
  98. benchmark['SNV F1'] = benchmark['SNV F1'].astype(float)
  99. benchmark['INDEL F1'] = benchmark['INDEL F1'].astype(float)
  100. benchmark['SNV precision'] = benchmark['SNV precision'] * 100
  101. benchmark['INDEL precision'] = benchmark['INDEL precision'] * 100
  102. benchmark['SNV recall'] = benchmark['SNV recall'] * 100
  103. benchmark['INDEL recall'] = benchmark['INDEL recall']* 100
  104. benchmark['SNV F1'] = benchmark['SNV F1'] * 100
  105. benchmark['INDEL F1'] = benchmark['INDEL F1'] * 100
  106. benchmark = benchmark.round(2)
  107. benchmark.to_csv('variants.calling.qc.txt',sep="\t",index=0)
  108. else:
  109. hap_file = args.happy
  110. with open(hap_file) as hap_json:
  111. happy = json.load(hap_json)
  112. dat =pd.DataFrame.from_records(happy)
  113. dat = dat.loc[:, dat.columns.str.endswith('ALL')]
  114. dat_transposed = dat.T
  115. dat_transposed = dat_transposed.loc[:,['sample_id','TRUTH.FN','QUERY.TOTAL','QUERY.FP','QUERY.UNK','METRIC.Precision','METRIC.Recall','METRIC.F1_Score']]
  116. dat_transposed['QUERY.TP'] = dat_transposed['QUERY.TOTAL'].astype(int) - dat_transposed['QUERY.UNK'].astype(int) - dat_transposed['QUERY.FP'].astype(int)
  117. dat_transposed['QUERY'] =dat_transposed['QUERY.TOTAL'].astype(int) - dat_transposed['QUERY.UNK'].astype(int)
  118. indel = dat_transposed[['INDEL' in s for s in dat_transposed.index]]
  119. snv = dat_transposed[['SNP' in s for s in dat_transposed.index]]
  120. indel.reset_index(drop=True, inplace=True)
  121. snv.reset_index(drop=True, inplace=True)
  122. benchmark = pd.concat([snv, indel], axis=1)
  123. benchmark = benchmark[["sample_id", 'QUERY.TOTAL', 'QUERY','QUERY.TP','QUERY.FP','TRUTH.FN','METRIC.Precision', 'METRIC.Recall','METRIC.F1_Score']]
  124. benchmark.columns = ['Sample','sample_id','SNV number','INDEL number','SNV query','INDEL query','SNV TP','INDEL TP','SNV FP','INDEL FP','SNV FN','INDEL FN','SNV precision','INDEL precision','SNV recall','INDEL recall','SNV F1','INDEL F1']
  125. benchmark = benchmark[['Sample','SNV number','INDEL number','SNV query','INDEL query','SNV TP','INDEL TP','SNV FP','INDEL FP','SNV FN','INDEL FN','SNV precision','INDEL precision','SNV recall','INDEL recall','SNV F1','INDEL F1']]
  126. benchmark['SNV precision'] = benchmark['SNV precision'].astype(float)
  127. benchmark['INDEL precision'] = benchmark['INDEL precision'].astype(float)
  128. benchmark['SNV recall'] = benchmark['SNV recall'].astype(float)
  129. benchmark['INDEL recall'] = benchmark['INDEL recall'].astype(float)
  130. benchmark['SNV F1'] = benchmark['SNV F1'].astype(float)
  131. benchmark['INDEL F1'] = benchmark['INDEL F1'].astype(float)
  132. benchmark['SNV precision'] = benchmark['SNV precision'] * 100
  133. benchmark['INDEL precision'] = benchmark['INDEL precision'] * 100
  134. benchmark['SNV recall'] = benchmark['SNV recall'] * 100
  135. benchmark['INDEL recall'] = benchmark['INDEL recall']* 100
  136. benchmark['SNV F1'] = benchmark['SNV F1'] * 100
  137. benchmark['INDEL F1'] = benchmark['INDEL F1'] * 100
  138. benchmark = benchmark.round(2)
  139. benchmark.to_csv('variants.calling.qc.txt',sep="\t",index=0)