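"""Collect QC metrics from MultiQC and Sentieon outputs.

Writes tab-separated tables into the working directory: pre_alignment.txt
and post_alignment.txt when the alignment-level inputs are supplied, and
variants.calling.qc.txt in every run.
"""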
import json
import pandas as pd
from functools import reduce
import argparse

parser = argparse.ArgumentParser(description="Collect QC metrics from MultiQC and Sentieon outputs and report raw-FASTQ, BAM and variant-calling (precision and recall) quality tables")
parser.add_argument('-quality', '--quality_yield', type=str, help='*.quality_yield.txt')
parser.add_argument('-depth', '--wgs_metrics', type=str, help='*deduped_WgsMetricsAlgo.txt')
parser.add_argument('-aln', '--aln_metrics', type=str, help='*_deduped_aln_metrics.txt')
parser.add_argument('-is', '--is_metrics', type=str, help='*_deduped_is_metrics.txt')
parser.add_argument('-hs', '--hs_metrics', type=str, help='*_deduped_hs_metrics.txt')
parser.add_argument('-fastqc', '--fastqc', type=str, help='multiqc_fastqc.txt')
parser.add_argument('-fastqscreen', '--fastqscreen', type=str, help='multiqc_fastq_screen.txt')
parser.add_argument('-hap', '--happy', type=str, help='multiqc_happy_data.json', required=True)
parser.add_argument('-project', '--project_name', type=str, help='project_name')
args = parser.parse_args()
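
# Example invocation (the script name and file names are illustrative,
# following the help strings above):
#   python qc_metrics.py \
#       -quality sample.quality_yield.txt \
#       -depth sample_deduped_WgsMetricsAlgo.txt \
#       -aln sample_deduped_aln_metrics.txt \
#       -is sample_deduped_is_metrics.txt \
#       -hs sample_deduped_hs_metrics.txt \
#       -fastqc multiqc_fastqc.txt \
#       -fastqscreen multiqc_fastq_screen.txt \
#       -hap multiqc_happy_data.json \
#       -project my_project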
if args.quality_yield:
    # Full QC mode: also build the FASTQ- and BAM-level tables.
    # Rename inputs:
    quality_yield_file = args.quality_yield
    wgs_metrics_file = args.wgs_metrics
    aln_metrics_file = args.aln_metrics
    is_metrics_file = args.is_metrics
    hs_metrics_file = args.hs_metrics
    fastqc_file = args.fastqc
    fastqscreen_file = args.fastqscreen
    project_name = args.project_name
    #############################################
    # FastQC
    fastqc = pd.read_table(fastqc_file)
    # FastQ Screen: keep the percentage columns, strip the '_screen' suffix
    dat = pd.read_table(fastqscreen_file)
    fastqscreen = dat.loc[:, dat.columns.str.endswith('percentage')].copy()
    dat['Sample'] = [i.replace('_screen', '') for i in dat['Sample']]
    fastqscreen.insert(loc=0, column='Sample', value=dat['Sample'])
    # Pre-alignment table
    pre_alignment_dat = pd.merge(fastqc, fastqscreen, how="outer", on='Sample')
    pre_alignment_dat['FastQC_mqc-generalstats-fastqc-total_sequences'] = pre_alignment_dat['FastQC_mqc-generalstats-fastqc-total_sequences'] / 1000000
    del pre_alignment_dat['FastQC_mqc-generalstats-fastqc-percent_fails']
    del pre_alignment_dat['FastQC_mqc-generalstats-fastqc-avg_sequence_length']
    del pre_alignment_dat['ERCC percentage']
    del pre_alignment_dat['Phix percentage']
    del pre_alignment_dat['Mouse percentage']
    pre_alignment_dat = pre_alignment_dat.round(2)
    pre_alignment_dat.columns = ['Sample', '%Dup', '%GC', 'Total Sequences (million)', '%Human', '%EColi', '%Adapter', '%Vector', '%rRNA', '%Virus', '%Yeast', '%Mitoch', '%No hits']
    pre_alignment_dat.to_csv('pre_alignment.txt', sep="\t", index=False)
    ############################
    # Alignment metrics
    dat = pd.read_table(aln_metrics_file, index_col=False)
    dat['PCT_ALIGNED_READS'] = dat["PF_READS_ALIGNED"] / dat["TOTAL_READS"]
    aln_metrics = dat[["Sample", "PCT_ALIGNED_READS", "PF_MISMATCH_RATE"]].copy()
    aln_metrics[["PCT_ALIGNED_READS", "PF_MISMATCH_RATE"]] *= 100
    aln_metrics['Sample'] = [x[-1] for x in aln_metrics['Sample'].str.split('/')]
    # Insert-size metrics
    dat = pd.read_table(is_metrics_file, index_col=False)
    is_metrics = dat[['Sample', 'MEDIAN_INSERT_SIZE']].copy()
    is_metrics['Sample'] = [x[-1] for x in is_metrics['Sample'].str.split('/')]
    # Base-quality yield
    dat = pd.read_table(quality_yield_file, index_col=False)
    dat['%Q20'] = dat['Q20_BASES'] / dat['TOTAL_BASES']
    dat['%Q30'] = dat['Q30_BASES'] / dat['TOTAL_BASES']
    quality_yield = dat[['Sample', '%Q20', '%Q30']].copy()
    quality_yield[['%Q20', '%Q30']] *= 100
    quality_yield['Sample'] = [x[-1] for x in quality_yield['Sample'].str.split('/')]
    # WGS coverage metrics
    dat = pd.read_table(wgs_metrics_file, index_col=False)
    wgs_metrics = dat[['Sample', 'MEDIAN_COVERAGE', 'PCT_1X', 'PCT_5X', 'PCT_10X', 'PCT_20X', 'PCT_30X']].copy()
    for col in ['PCT_1X', 'PCT_5X', 'PCT_10X', 'PCT_20X', 'PCT_30X']:
        wgs_metrics[col] = wgs_metrics[col] * 100
    wgs_metrics['Sample'] = [x[-1] for x in wgs_metrics['Sample'].str.split('/')]
    # Hybrid-selection (target) metrics, read from the hs_metrics input
    dat = pd.read_table(hs_metrics_file, index_col=False)
    hs_metrics = dat[['Sample', 'FOLD_80_BASE_PENALTY', 'PCT_USABLE_BASES_ON_TARGET']].copy()
    hs_metrics['Sample'] = [x[-1] for x in hs_metrics['Sample'].str.split('/')]
    # Merge all BAM-level tables on the sample name
    data_frames = [aln_metrics, is_metrics, quality_yield, wgs_metrics, hs_metrics]
    post_alignment_dat = reduce(lambda left, right: pd.merge(left, right, on='Sample', how='outer'), data_frames)
    post_alignment_dat.columns = ['Sample', '%Mapping', '%Mismatch Rate', 'Median Insert Size', '%Q20', '%Q30', 'Median Coverage', 'PCT_1X', 'PCT_5X', 'PCT_10X', 'PCT_20X', 'PCT_30X', 'Fold-80', 'On target bases rate']
    post_alignment_dat = post_alignment_dat.round(2)
    post_alignment_dat.to_csv('post_alignment.txt', sep="\t", index=False)

#########################################
# Variant calling benchmark (hap.py); runs whether or not the
# alignment-level inputs were supplied
hap_file = args.happy
with open(hap_file) as hap_json:
    happy = json.load(hap_json)
dat = pd.DataFrame.from_records(happy)
# Keep only the "*ALL" columns (PASS plus non-PASS variants)
dat = dat.loc[:, dat.columns.str.endswith('ALL')]
dat_transposed = dat.T
dat_transposed = dat_transposed.loc[:, ['sample_id', 'QUERY.TOTAL', 'METRIC.Precision', 'METRIC.Recall']]
indel = dat_transposed[['INDEL' in s for s in dat_transposed.index]].copy()
snv = dat_transposed[['SNP' in s for s in dat_transposed.index]].copy()
indel.reset_index(drop=True, inplace=True)
snv.reset_index(drop=True, inplace=True)
# Put the SNV and INDEL results side by side; selecting the duplicated
# labels keeps both copies of each column (SNV first, then INDEL)
benchmark = pd.concat([snv, indel], axis=1)
benchmark = benchmark[['sample_id', 'QUERY.TOTAL', 'METRIC.Precision', 'METRIC.Recall']]
benchmark.columns = ['Sample', 'sample_id', 'SNV number', 'INDEL number', 'SNV precision', 'INDEL precision', 'SNV recall', 'INDEL recall']
benchmark = benchmark[['Sample', 'SNV number', 'INDEL number', 'SNV precision', 'INDEL precision', 'SNV recall', 'INDEL recall']]
# Precision and recall are parsed as strings; convert to float percentages
for col in ['SNV precision', 'INDEL precision', 'SNV recall', 'INDEL recall']:
    benchmark[col] = benchmark[col].astype(float) * 100
benchmark = benchmark.round(2)
benchmark.to_csv('variants.calling.qc.txt', sep="\t", index=False)
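
# With only the required -hap input, the pre-/post-alignment tables are
# skipped and just variants.calling.qc.txt is written:
#   python qc_metrics.py -hap multiqc_happy_data.json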