import json  # used only by the (currently disabled) hap.py benchmarking section below
import pandas as pd
from functools import reduce
import argparse

parser = argparse.ArgumentParser(description="Collect QC information from MultiQC and Sentieon outputs and write raw fastq, bam, and variant-calling (precision and recall) quality metrics tables")
parser.add_argument('-quality', '--quality_yield', type=str, help='*.quality_yield.txt', required=True)
parser.add_argument('-depth', '--wgs_metrics', type=str, help='*deduped_WgsMetricsAlgo.txt', required=True)
parser.add_argument('-aln', '--aln_metrics', type=str, help='*_deduped_aln_metrics.txt', required=True)
parser.add_argument('-is', '--is_metrics', type=str, help='*_deduped_is_metrics.txt', required=True)
parser.add_argument('-fastqc', '--fastqc', type=str, help='multiqc_fastqc.txt', required=True)
parser.add_argument('-fastqscreen', '--fastqscreen', type=str, help='multiqc_fastq_screen.txt', required=True)
#parser.add_argument('-hap', '--happy', type=str, help='multiqc_happy_data.json', required=True)
parser.add_argument('-project', '--project_name', type=str, help='project_name', required=True)
args = parser.parse_args()
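# Example invocation (a sketch; the script name, sample name, and project name
# below are illustrative, following the help-string patterns above):
#   python extract_qc_metrics.py \
#       -quality sampleA.quality_yield.txt \
#       -depth sampleA_deduped_WgsMetricsAlgo.txt \
#       -aln sampleA_deduped_aln_metrics.txt \
#       -is sampleA_deduped_is_metrics.txt \
#       -fastqc multiqc_fastqc.txt \
#       -fastqscreen multiqc_fastq_screen.txt \
#       -project my_project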
# Rename input:
quality_yield_file = args.quality_yield
wgs_metrics_file = args.wgs_metrics
aln_metrics_file = args.aln_metrics
is_metrics_file = args.is_metrics
fastqc_file = args.fastqc
fastqscreen_file = args.fastqscreen
#hap_file = args.happy
project_name = args.project_name
#############################################
# fastqc
fastqc = pd.read_table(fastqc_file)
# The commented-out blocks below are retained from an earlier version that
# pulled FastQC/QualiMap columns out of the MultiQC general-stats table:
#fastqc = dat.loc[:, dat.columns.str.startswith('FastQC')]
#fastqc.insert(loc=0, column='Sample', value=dat['Sample'])
#fastqc_stat = fastqc.dropna()
# qualimap
#qualimap = dat.loc[:, dat.columns.str.startswith('QualiMap')]
#qualimap.insert(loc=0, column='Sample', value=dat['Sample'])
#qualimap_stat = qualimap.dropna()
# fastqc
#dat = pd.read_table(fastqc_file)
#fastqc_module = dat.loc[:, "per_base_sequence_quality":"kmer_content"]
#fastqc_module.insert(loc=0, column='Sample', value=dat['Sample'])
#fastqc_all = pd.merge(fastqc_stat, fastqc_module, how='outer', left_on=['Sample'], right_on=['Sample'])
# fastqscreen
dat = pd.read_table(fastqscreen_file)
# Keep only the per-genome percentage columns; .copy() avoids a SettingWithCopyWarning on insert
fastqscreen = dat.loc[:, dat.columns.str.endswith('percentage')].copy()
dat['Sample'] = [i.replace('_screen', '') for i in dat['Sample']]
fastqscreen.insert(loc=0, column='Sample', value=dat['Sample'])
# pre-alignment: join FastQC and FastQ Screen metrics on Sample
pre_alignment_dat = pd.merge(fastqc, fastqscreen, how="outer", left_on=['Sample'], right_on=['Sample'])
# Report total sequences in millions
pre_alignment_dat['FastQC_mqc-generalstats-fastqc-total_sequences'] = pre_alignment_dat['FastQC_mqc-generalstats-fastqc-total_sequences'] / 1000000
del pre_alignment_dat['FastQC_mqc-generalstats-fastqc-percent_fails']
del pre_alignment_dat['FastQC_mqc-generalstats-fastqc-avg_sequence_length']
del pre_alignment_dat['ERCC percentage']
del pre_alignment_dat['Phix percentage']
del pre_alignment_dat['Mouse percentage']
pre_alignment_dat = pre_alignment_dat.round(2)
pre_alignment_dat.columns = ['Sample', '%Dup', '%GC', 'Total Sequences (million)', '%Human', '%EColi', '%Adapter', '%Vector', '%rRNA', '%Virus', '%Yeast', '%Mitoch', '%No hits']
pre_alignment_dat.to_csv('pre_alignment.txt', sep="\t", index=False)
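# pre_alignment.txt lands in the current working directory; a quick sanity
# check (illustrative, not part of the pipeline):
#   pd.read_table('pre_alignment.txt').head()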
############################
# alignment metrics
dat = pd.read_table(aln_metrics_file, index_col=False)
dat['PCT_ALIGNED_READS'] = dat["PF_READS_ALIGNED"] / dat["TOTAL_READS"]
aln_metrics = dat[["Sample", "PCT_ALIGNED_READS", "PF_MISMATCH_RATE"]].copy()
# Convert only the numeric columns to percentages (multiplying the whole frame
# would also replicate the Sample strings 100 times)
aln_metrics[["PCT_ALIGNED_READS", "PF_MISMATCH_RATE"]] = aln_metrics[["PCT_ALIGNED_READS", "PF_MISMATCH_RATE"]] * 100
aln_metrics['Sample'] = [x[-1] for x in aln_metrics['Sample'].str.split('/')]
# insert size
dat = pd.read_table(is_metrics_file, index_col=False)
is_metrics = dat[['Sample', 'MEDIAN_INSERT_SIZE']].copy()
is_metrics['Sample'] = [x[-1] for x in is_metrics['Sample'].str.split('/')]
# base quality
dat = pd.read_table(quality_yield_file, index_col=False)
dat['%Q20'] = dat['Q20_BASES'] / dat['TOTAL_BASES']
dat['%Q30'] = dat['Q30_BASES'] / dat['TOTAL_BASES']
quality_yield = dat[['Sample', '%Q20', '%Q30']].copy()
quality_yield[['%Q20', '%Q30']] = quality_yield[['%Q20', '%Q30']] * 100
quality_yield['Sample'] = [x[-1] for x in quality_yield['Sample'].str.split('/')]
# coverage
dat = pd.read_table(wgs_metrics_file, index_col=False)
wgs_metrics = dat[['Sample', 'MEDIAN_COVERAGE', 'PCT_1X', 'PCT_5X', 'PCT_10X', 'PCT_30X']].copy()
wgs_metrics[['PCT_1X', 'PCT_5X', 'PCT_10X', 'PCT_30X']] = wgs_metrics[['PCT_1X', 'PCT_5X', 'PCT_10X', 'PCT_30X']] * 100
wgs_metrics['Sample'] = [x[-1] for x in wgs_metrics['Sample'].str.split('/')]
# merge all post-alignment tables on Sample
data_frames = [aln_metrics, is_metrics, quality_yield, wgs_metrics]
post_alignment_dat = reduce(lambda left, right: pd.merge(left, right, on=['Sample'], how='outer'), data_frames)
post_alignment_dat.columns = ['Sample', '%Mapping', '%Mismatch Rate', 'Median Insert Size', '%Q20', '%Q30', 'Median Coverage', 'PCT_1X', 'PCT_5X', 'PCT_10X', 'PCT_30X']
post_alignment_dat = post_alignment_dat.round(2)
post_alignment_dat.to_csv('post_alignment.txt', sep="\t", index=False)
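# The hap.py benchmarking block below is disabled; re-enabling it also requires
# uncommenting the '-hap' argument and the hap_file assignment above.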
#########################################
# variants calling
#with open(hap_file) as hap_json:
#    happy = json.load(hap_json)
#dat = pd.DataFrame.from_records(happy)
#dat = dat.loc[:, dat.columns.str.endswith('ALL')]
#dat_transposed = dat.T
#dat_transposed = dat_transposed.loc[:, ['sample_id', 'QUERY.TOTAL', 'METRIC.Precision', 'METRIC.Recall']]
#indel = dat_transposed[['INDEL' in s for s in dat_transposed.index]]
#snv = dat_transposed[['SNP' in s for s in dat_transposed.index]]
#indel.reset_index(drop=True, inplace=True)
#snv.reset_index(drop=True, inplace=True)
#benchmark = pd.concat([snv, indel], axis=1)
#benchmark = benchmark[["sample_id", 'QUERY.TOTAL', 'METRIC.Precision', 'METRIC.Recall']]
#benchmark.columns = ['Sample', 'sample_id', 'SNV number', 'INDEL number', 'SNV precision', 'INDEL precision', 'SNV recall', 'INDEL recall']
#benchmark = benchmark[['Sample', 'SNV number', 'INDEL number', 'SNV precision', 'INDEL precision', 'SNV recall', 'INDEL recall']]
#benchmark['SNV precision'] = benchmark['SNV precision'].astype(float) * 100
#benchmark['INDEL precision'] = benchmark['INDEL precision'].astype(float) * 100
#benchmark['SNV recall'] = benchmark['SNV recall'].astype(float) * 100
#benchmark['INDEL recall'] = benchmark['INDEL recall'].astype(float) * 100
#benchmark = benchmark.round(2)
#benchmark.to_csv('variants.calling.qc.txt', sep="\t", index=False)
#all_rep = [x.split('_')[6] for x in benchmark['Sample']]
#if all_rep.count(all_rep[0]) == 4:
#    rep = list(set(all_rep))
#    columns = ['Family', 'Average Precision', 'Average Recall', 'Precision SD', 'Recall SD']
#    df_ = pd.DataFrame(columns=columns)
#    for i in rep:
#        string = "_" + i + "_"
#        # use the replicate-specific pattern, not a hardcoded '_1_'
#        sub_dat = benchmark[benchmark['Sample'].str.contains(string)]
#        # numeric_only skips the string columns; only the four astype(float) columns remain
#        mean = list(sub_dat.mean(axis=0, skipna=True, numeric_only=True))
#        sd = list(sub_dat.std(axis=0, skipna=True, numeric_only=True))
#        family_name = project_name + "." + i + ".SNV"
#        # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement
#        df_ = pd.concat([df_, pd.DataFrame([{'Family': family_name, 'Average Precision': mean[0], 'Average Recall': mean[2], 'Precision SD': sd[0], 'Recall SD': sd[2]}])], ignore_index=True)
#        family_name = project_name + "." + i + ".INDEL"
#        df_ = pd.concat([df_, pd.DataFrame([{'Family': family_name, 'Average Precision': mean[1], 'Average Recall': mean[3], 'Precision SD': sd[1], 'Recall SD': sd[3]}])], ignore_index=True)
#    df_ = df_.round(2)
#    df_.to_csv('precision.recall.txt', sep="\t", index=False)
#else:
#    pass