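# Summarize MultiQC and Sentieon QC output into three tab-separated tables:
# pre_alignment.txt (raw FASTQ metrics), post_alignment.txt (BAM metrics) and
# variants.calling.qc.txt (variant-calling precision/recall benchmarks).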
import argparse
import json
from functools import reduce

import pandas as pd
parser = argparse.ArgumentParser(
    description="Collect QC metrics from MultiQC and Sentieon output and write "
                "raw-FASTQ, BAM and variant-calling (precision/recall) quality tables.")
parser.add_argument('-quality', '--quality_yield', type=str, help='*.quality_yield.txt', required=True)
parser.add_argument('-depth', '--wgs_metrics', type=str, help='*deduped_WgsMetricsAlgo.txt', required=True)
parser.add_argument('-aln', '--aln_metrics', type=str, help='*_deduped_aln_metrics.txt', required=True)
parser.add_argument('-is', '--is_metrics', type=str, help='*_deduped_is_metrics.txt', required=True)
parser.add_argument('-fastqc', '--fastqc', type=str, help='multiqc_fastqc.txt', required=True)
parser.add_argument('-fastqscreen', '--fastqscreen', type=str, help='multiqc_fastq_screen.txt', required=True)
parser.add_argument('-hap', '--happy', type=str, help='multiqc_happy_data.json', required=True)
parser.add_argument('-project', '--project_name', type=str, help='project_name', required=True)
args = parser.parse_args()
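# All inputs are tab-separated: the *_metrics files are Picard-style tables
# emitted by Sentieon, and the multiqc_* files are tables exported by MultiQC.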
# Rename inputs:
quality_yield_file = args.quality_yield
wgs_metrics_file = args.wgs_metrics
aln_metrics_file = args.aln_metrics
is_metrics_file = args.is_metrics
fastqc_file = args.fastqc
fastqscreen_file = args.fastqscreen
hap_file = args.happy
project_name = args.project_name
#############################################
# fastqc
fastqc = pd.read_table(fastqc_file)
#fastqc = dat.loc[:, dat.columns.str.startswith('FastQC')]
#fastqc.insert(loc=0, column='Sample', value=dat['Sample'])
#fastqc_stat = fastqc.dropna()
# qualimap
#qualimap = dat.loc[:, dat.columns.str.startswith('QualiMap')]
#qualimap.insert(loc=0, column='Sample', value=dat['Sample'])
#qualimap_stat = qualimap.dropna()
# fastqc
#dat = pd.read_table(fastqc_file)
#fastqc_module = dat.loc[:, "per_base_sequence_quality":"kmer_content"]
#fastqc_module.insert(loc=0, column='Sample', value=dat['Sample'])
#fastqc_all = pd.merge(fastqc_stat, fastqc_module, how='outer', left_on=['Sample'], right_on=['Sample'])
# fastqscreen
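# multiqc_fastq_screen.txt has one "<genome> percentage" column per screened
# genome; sample names carry a trailing "_screen" that is stripped so they
# match the FastQC sample names.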
dat = pd.read_table(fastqscreen_file)
fastqscreen = dat.loc[:, dat.columns.str.endswith('percentage')].copy()
dat['Sample'] = [i.replace('_screen', '') for i in dat['Sample']]
fastqscreen.insert(loc=0, column='Sample', value=dat['Sample'])
# pre-alignment
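# Merge FastQC general stats with the FastQ Screen percentages, convert total
# sequences to millions, and drop columns that are not reported (pass/fail
# counts, average read length, and the ERCC/PhiX/Mouse control screens).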
pre_alignment_dat = pd.merge(fastqc, fastqscreen, how="outer", on='Sample')
pre_alignment_dat['FastQC_mqc-generalstats-fastqc-total_sequences'] = pre_alignment_dat['FastQC_mqc-generalstats-fastqc-total_sequences'] / 1000000
del pre_alignment_dat['FastQC_mqc-generalstats-fastqc-percent_fails']
del pre_alignment_dat['FastQC_mqc-generalstats-fastqc-avg_sequence_length']
del pre_alignment_dat['ERCC percentage']
del pre_alignment_dat['Phix percentage']
del pre_alignment_dat['Mouse percentage']
pre_alignment_dat = pre_alignment_dat.round(2)
pre_alignment_dat.columns = ['Sample', '%Dup', '%GC', 'Total Sequences (million)', '%Human', '%EColi', '%Adapter', '%Vector', '%rRNA', '%Virus', '%Yeast', '%Mitoch', '%No hits']
pre_alignment_dat.to_csv('pre_alignment.txt', sep="\t", index=False)
############################
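# Post-alignment (BAM-level) metrics. The Sample field in the Sentieon tables
# is a file path, so only the basename is kept.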
dat = pd.read_table(aln_metrics_file, index_col=False)
dat['PCT_ALIGNED_READS'] = dat["PF_READS_ALIGNED"] / dat["TOTAL_READS"]
aln_metrics = dat[["Sample", "PCT_ALIGNED_READS", "PF_MISMATCH_RATE"]].copy()
# Scale only the numeric columns; multiplying the whole frame would also
# "multiply" the Sample strings.
aln_metrics[["PCT_ALIGNED_READS", "PF_MISMATCH_RATE"]] = aln_metrics[["PCT_ALIGNED_READS", "PF_MISMATCH_RATE"]] * 100
aln_metrics['Sample'] = [x[-1] for x in aln_metrics['Sample'].str.split('/')]
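# Median insert size from the insert-size metrics table.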
dat = pd.read_table(is_metrics_file, index_col=False)
is_metrics = dat[['Sample', 'MEDIAN_INSERT_SIZE']].copy()
is_metrics['Sample'] = [x[-1] for x in is_metrics['Sample'].str.split('/')]
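# %Q20/%Q30: fraction of bases at or above Q20/Q30, from the quality yield table.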
dat = pd.read_table(quality_yield_file, index_col=False)
dat['%Q20'] = dat['Q20_BASES'] / dat['TOTAL_BASES']
dat['%Q30'] = dat['Q30_BASES'] / dat['TOTAL_BASES']
quality_yield = dat[['Sample', '%Q20', '%Q30']].copy()
quality_yield[['%Q20', '%Q30']] = quality_yield[['%Q20', '%Q30']] * 100
quality_yield['Sample'] = [x[-1] for x in quality_yield['Sample'].str.split('/')]
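# Genome-wide coverage: mean/median depth plus the fraction of bases covered
# at >=1x/5x/10x/30x, rescaled from fractions to percentages.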
dat = pd.read_table(wgs_metrics_file, index_col=False)
wgs_metrics = dat[['Sample', 'MEAN_COVERAGE', 'MEDIAN_COVERAGE', 'PCT_1X', 'PCT_5X', 'PCT_10X', 'PCT_30X']].copy()
for col in ['PCT_1X', 'PCT_5X', 'PCT_10X', 'PCT_30X']:
    wgs_metrics[col] = wgs_metrics[col] * 100
wgs_metrics['Sample'] = [x[-1] for x in wgs_metrics['Sample'].str.split('/')]
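# Outer-join the four metric tables on Sample so a missing file for one sample
# does not drop its remaining metrics.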
data_frames = [aln_metrics, is_metrics, quality_yield, wgs_metrics]
post_alignment_dat = reduce(lambda left, right: pd.merge(left, right, on=['Sample'], how='outer'), data_frames)
post_alignment_dat.columns = ['Sample', '%Mapping', '%Mismatch Rate', 'Median Insert Size', '%Q20', '%Q30', 'Mean Coverage', 'Median Coverage', 'PCT_1X', 'PCT_5X', 'PCT_10X', 'PCT_30X']
post_alignment_dat = post_alignment_dat.round(2)
post_alignment_dat.to_csv('post_alignment.txt', sep="\t", index=False)
#########################################
# variants calling
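# multiqc_happy_data.json holds the hap.py records; each column is a
# sample/variant-type/filter combination. Keep only the *ALL columns (all
# calls, not just PASS-filtered ones), then transpose so each combination
# becomes a row.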
with open(hap_file) as hap_json:
    happy = json.load(hap_json)
dat = pd.DataFrame.from_records(happy)
dat = dat.loc[:, dat.columns.str.endswith('ALL')]
dat_transposed = dat.T
dat_transposed = dat_transposed.loc[:, ['sample_id', 'TRUTH.FN', 'QUERY.TOTAL', 'QUERY.FP', 'QUERY.UNK', 'METRIC.Precision', 'METRIC.Recall', 'METRIC.F1_Score']]
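# Derive counts hap.py does not report directly: QUERY (calls inside the
# confident regions) = TOTAL - UNK, and TP = TOTAL - UNK - FP.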
dat_transposed['QUERY.TP'] = dat_transposed['QUERY.TOTAL'].astype(int) - dat_transposed['QUERY.UNK'].astype(int) - dat_transposed['QUERY.FP'].astype(int)
dat_transposed['QUERY'] = dat_transposed['QUERY.TOTAL'].astype(int) - dat_transposed['QUERY.UNK'].astype(int)
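# Split into SNV and INDEL rows (the variant type is encoded in the row label),
# then put them side by side per sample.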
indel = dat_transposed[['INDEL' in s for s in dat_transposed.index]].copy()
snv = dat_transposed[['SNP' in s for s in dat_transposed.index]].copy()
indel.reset_index(drop=True, inplace=True)
snv.reset_index(drop=True, inplace=True)
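# After the concat every column label appears twice (SNV copy first, INDEL
# copy second); the positional rename below disambiguates them.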
benchmark = pd.concat([snv, indel], axis=1)
benchmark = benchmark[["sample_id", 'QUERY.TOTAL', 'QUERY', 'QUERY.TP', 'QUERY.FP', 'TRUTH.FN', 'METRIC.Precision', 'METRIC.Recall', 'METRIC.F1_Score']]
benchmark.columns = ['Sample', 'sample_id', 'SNV number', 'INDEL number', 'SNV query', 'INDEL query', 'SNV TP', 'INDEL TP', 'SNV FP', 'INDEL FP', 'SNV FN', 'INDEL FN', 'SNV precision', 'INDEL precision', 'SNV recall', 'INDEL recall', 'SNV F1', 'INDEL F1']
benchmark = benchmark[['Sample', 'SNV number', 'INDEL number', 'SNV query', 'INDEL query', 'SNV TP', 'INDEL TP', 'SNV FP', 'INDEL FP', 'SNV FN', 'INDEL FN', 'SNV precision', 'INDEL precision', 'SNV recall', 'INDEL recall', 'SNV F1', 'INDEL F1']]
metric_cols = ['SNV precision', 'INDEL precision', 'SNV recall', 'INDEL recall', 'SNV F1', 'INDEL F1']
benchmark[metric_cols] = benchmark[metric_cols].astype(float)
benchmark = benchmark.round(2)
benchmark.to_csv('variants.calling.qc.txt', sep="\t", index=False)
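# The block below is kept commented out: it would average precision/recall
# across replicates per family when each family has exactly four replicates.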
#all_rep = [x.split('_')[6] for x in benchmark['Sample']]
#if all_rep.count(all_rep[0]) == 4:
#    rep = list(set(all_rep))
#    columns = ['Family', 'Average Precision', 'Average Recall', 'Precision SD', 'Recall SD']
#    df_ = pd.DataFrame(columns=columns)
#    for i in rep:
#        string = "_" + i + "_"
#        sub_dat = benchmark[benchmark['Sample'].str.contains(string)]
#        mean = list(sub_dat.mean(axis=0, skipna=True))
#        sd = list(sub_dat.std(axis=0, skipna=True))
#        family_name = project_name + "." + i + ".SNV"
#        df_ = df_.append({'Family': family_name, 'Average Precision': mean[0], 'Average Recall': mean[2], 'Precision SD': sd[0], 'Recall SD': sd[2]}, ignore_index=True)
#        family_name = project_name + "." + i + ".INDEL"
#        df_ = df_.append({'Family': family_name, 'Average Precision': mean[1], 'Average Recall': mean[3], 'Precision SD': sd[1], 'Recall SD': sd[3]}, ignore_index=True)
#    df_ = df_.round(2)
#    df_.to_csv('precision.recall.txt', sep="\t", index=False)
#else:
#    pass