import json
import pandas as pd
from functools import reduce
import argparse

parser = argparse.ArgumentParser(description="Collect quality metrics from MultiQC and Sentieon outputs: raw FASTQ, BAM alignment, and variant-calling (precision and recall) metrics")
parser.add_argument('-quality', '--quality_yield', type=str, help='*.quality_yield.txt', required=True)
parser.add_argument('-depth', '--wgs_metrics', type=str, help='*deduped_WgsMetricsAlgo.txt', required=True)
parser.add_argument('-aln', '--aln_metrics', type=str, help='*_deduped_aln_metrics.txt', required=True)
parser.add_argument('-is', '--is_metrics', type=str, help='*_deduped_is_metrics.txt', required=True)
parser.add_argument('-fastqc', '--fastqc', type=str, help='multiqc_fastqc.txt', required=True)
parser.add_argument('-fastqscreen', '--fastqscreen', type=str, help='multiqc_fastq_screen.txt', required=True)
parser.add_argument('-hap', '--happy', type=str, help='multiqc_happy_data.json', required=True)
parser.add_argument('-project', '--project_name', type=str, help='project_name', required=True)
args = parser.parse_args()
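# Example invocation (the script and input file names here are illustrative,
# not fixed by the pipeline):
#   python qc_metrics.py \
#       -quality sample.quality_yield.txt -depth sample_deduped_WgsMetricsAlgo.txt \
#       -aln sample_deduped_aln_metrics.txt -is sample_deduped_is_metrics.txt \
#       -fastqc multiqc_fastqc.txt -fastqscreen multiqc_fastq_screen.txt \
#       -hap multiqc_happy_data.json -project my_project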
# Rename input:
quality_yield_file = args.quality_yield
wgs_metrics_file = args.wgs_metrics
aln_metrics_file = args.aln_metrics
is_metrics_file = args.is_metrics
fastqc_file = args.fastqc
fastqscreen_file = args.fastqscreen
hap_file = args.happy
project_name = args.project_name
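# The script writes three tab-separated tables into the working directory:
#   pre_alignment.txt       - FastQC + FastQ Screen summary
#   post_alignment.txt      - alignment, insert-size, base-quality and coverage metrics
#   variants.calling.qc.txt - hap.py SNV/INDEL benchmarking metrics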
#############################################
# fastqc
fastqc = pd.read_table(fastqc_file)
#fastqc = dat.loc[:, dat.columns.str.startswith('FastQC')]
#fastqc.insert(loc=0, column='Sample', value=dat['Sample'])
#fastqc_stat = fastqc.dropna()
# qualimap
#qualimap = dat.loc[:, dat.columns.str.startswith('QualiMap')]
#qualimap.insert(loc=0, column='Sample', value=dat['Sample'])
#qualimap_stat = qualimap.dropna()
# fastqc
#dat = pd.read_table(fastqc_file)
#fastqc_module = dat.loc[:, "per_base_sequence_quality":"kmer_content"]
#fastqc_module.insert(loc=0, column='Sample', value=dat['Sample'])
#fastqc_all = pd.merge(fastqc_stat, fastqc_module, how='outer', left_on=['Sample'], right_on=['Sample'])
# fastqscreen
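# multiqc_fastq_screen.txt carries one '<genome> percentage' column per
# screened genome; sample names end in '_screen', which is stripped so the
# table can be merged with the FastQC table on 'Sample'.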
dat = pd.read_table(fastqscreen_file)
fastqscreen = dat.loc[:, dat.columns.str.endswith('percentage')]
dat['Sample'] = [i.replace('_screen', '') for i in dat['Sample']]
fastqscreen.insert(loc=0, column='Sample', value=dat['Sample'])
# pre-alignment
pre_alignment_dat = pd.merge(fastqc, fastqscreen, how="outer", left_on=['Sample'], right_on=['Sample'])
pre_alignment_dat['FastQC_mqc-generalstats-fastqc-total_sequences'] = pre_alignment_dat['FastQC_mqc-generalstats-fastqc-total_sequences'] / 1000000
pre_alignment_dat = pre_alignment_dat.drop(columns=[
    'FastQC_mqc-generalstats-fastqc-percent_fails',
    'FastQC_mqc-generalstats-fastqc-avg_sequence_length',
    'ERCC percentage',
    'Phix percentage',
    'Mouse percentage',
])
pre_alignment_dat = pre_alignment_dat.round(2)
pre_alignment_dat.columns = ['Sample', '%Dup', '%GC', 'Total Sequences (million)', '%Human', '%EColi', '%Adapter', '%Vector', '%rRNA', '%Virus', '%Yeast', '%Mitoch', '%No hits']
pre_alignment_dat.to_csv('pre_alignment.txt', sep="\t", index=False)
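# Post-alignment metrics: the Sentieon metric tables report 'Sample' as a file
# path, so only the basename (text after the last '/') is kept, and fractions
# are scaled to percentages before the tables are merged.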
############################
dat = pd.read_table(aln_metrics_file, index_col=False)
dat['PCT_ALIGNED_READS'] = dat["PF_READS_ALIGNED"] / dat["TOTAL_READS"]
aln_metrics = dat[["Sample", "PCT_ALIGNED_READS", "PF_MISMATCH_RATE"]].copy()
# Scale only the numeric columns; multiplying the whole frame would also
# "multiply" (repeat) the string-typed 'Sample' column.
aln_metrics[["PCT_ALIGNED_READS", "PF_MISMATCH_RATE"]] = aln_metrics[["PCT_ALIGNED_READS", "PF_MISMATCH_RATE"]] * 100
aln_metrics['Sample'] = [x[-1] for x in aln_metrics['Sample'].str.split('/')]
dat = pd.read_table(is_metrics_file, index_col=False)
is_metrics = dat[['Sample', 'MEDIAN_INSERT_SIZE']].copy()
is_metrics['Sample'] = [x[-1] for x in is_metrics['Sample'].str.split('/')]
dat = pd.read_table(quality_yield_file, index_col=False)
dat['%Q20'] = dat['Q20_BASES'] / dat['TOTAL_BASES'] * 100
dat['%Q30'] = dat['Q30_BASES'] / dat['TOTAL_BASES'] * 100
quality_yield = dat[['Sample', '%Q20', '%Q30']].copy()
quality_yield['Sample'] = [x[-1] for x in quality_yield['Sample'].str.split('/')]
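# Picard/Sentieon-style WGS metrics: PCT_1X/5X/10X/30X are the fractions of the
# genome covered at each depth, converted to percentages here.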
dat = pd.read_table(wgs_metrics_file, index_col=False)
wgs_metrics = dat[['Sample', 'MEAN_COVERAGE', 'MEDIAN_COVERAGE', 'PCT_1X', 'PCT_5X', 'PCT_10X', 'PCT_30X']].copy()
wgs_metrics[['PCT_1X', 'PCT_5X', 'PCT_10X', 'PCT_30X']] = wgs_metrics[['PCT_1X', 'PCT_5X', 'PCT_10X', 'PCT_30X']] * 100
wgs_metrics['Sample'] = [x[-1] for x in wgs_metrics['Sample'].str.split('/')]
data_frames = [aln_metrics, is_metrics, quality_yield, wgs_metrics]
post_alignment_dat = reduce(lambda left, right: pd.merge(left, right, on=['Sample'], how='outer'), data_frames)
post_alignment_dat.columns = ['Sample', '%Mapping', '%Mismatch Rate', 'Median Insert Size', '%Q20', '%Q30', 'Mean Coverage', 'Median Coverage', 'PCT_1X', 'PCT_5X', 'PCT_10X', 'PCT_30X']
post_alignment_dat = post_alignment_dat.round(2)
post_alignment_dat.to_csv('post_alignment.txt', sep="\t", index=False)
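# hap.py benchmarking: multiqc_happy_data.json maps record names to metric
# dictionaries; only records whose name ends in 'ALL' (all variants, no subset
# filtering) are kept, then split into SNP and INDEL rows.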
#########################################
# variants calling
with open(hap_file) as hap_json:
    happy = json.load(hap_json)
dat = pd.DataFrame.from_records(happy)
dat = dat.loc[:, dat.columns.str.endswith('ALL')]
dat_transposed = dat.T
dat_transposed = dat_transposed.loc[:, ['sample_id', 'TRUTH.FN', 'QUERY.TOTAL', 'QUERY.FP', 'QUERY.UNK', 'METRIC.Precision', 'METRIC.Recall', 'METRIC.F1_Score']]
dat_transposed['QUERY.TP'] = dat_transposed['QUERY.TOTAL'].astype(int) - dat_transposed['QUERY.UNK'].astype(int) - dat_transposed['QUERY.FP'].astype(int)
dat_transposed['QUERY'] = dat_transposed['QUERY.TOTAL'].astype(int) - dat_transposed['QUERY.UNK'].astype(int)
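# snv and indel share identical column names, so after the side-by-side concat
# every label appears twice (SNP copy first, INDEL copy second). Selecting by
# name below therefore pulls both copies of each metric, which is what the
# 18-column rename relies on; the duplicate 'sample_id' is then dropped.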
indel = dat_transposed[dat_transposed.index.str.contains('INDEL')].reset_index(drop=True)
snv = dat_transposed[dat_transposed.index.str.contains('SNP')].reset_index(drop=True)
benchmark = pd.concat([snv, indel], axis=1)
benchmark = benchmark[['sample_id', 'QUERY.TOTAL', 'QUERY', 'QUERY.TP', 'QUERY.FP', 'TRUTH.FN', 'METRIC.Precision', 'METRIC.Recall', 'METRIC.F1_Score']]
benchmark.columns = ['Sample', 'sample_id', 'SNV number', 'INDEL number', 'SNV query', 'INDEL query', 'SNV TP', 'INDEL TP', 'SNV FP', 'INDEL FP', 'SNV FN', 'INDEL FN', 'SNV precision', 'INDEL precision', 'SNV recall', 'INDEL recall', 'SNV F1', 'INDEL F1']
benchmark = benchmark[['Sample', 'SNV number', 'INDEL number', 'SNV query', 'INDEL query', 'SNV TP', 'INDEL TP', 'SNV FP', 'INDEL FP', 'SNV FN', 'INDEL FN', 'SNV precision', 'INDEL precision', 'SNV recall', 'INDEL recall', 'SNV F1', 'INDEL F1']]
float_cols = ['SNV precision', 'INDEL precision', 'SNV recall', 'INDEL recall', 'SNV F1', 'INDEL F1']
benchmark[float_cols] = benchmark[float_cols].astype(float)
benchmark = benchmark.round(2)
benchmark.to_csv('variants.calling.qc.txt', sep="\t", index=False)
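# The commented-out block below summarised per-replicate precision/recall
# (mean and SD across samples sharing a replicate tag in the 'Sample' name);
# it is kept for reference and is not executed.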
#all_rep = [x.split('_')[6] for x in benchmark['Sample']]
#if all_rep.count(all_rep[0]) == 4:
#    rep = list(set(all_rep))
#    columns = ['Family', 'Average Precision', 'Average Recall', 'Precision SD', 'Recall SD']
#    df_ = pd.DataFrame(columns=columns)
#    for i in rep:
#        string = "_" + i + "_"
#        sub_dat = benchmark[benchmark['Sample'].str.contains(string)]
#        mean = list(sub_dat.mean(axis=0, skipna=True))
#        sd = list(sub_dat.std(axis=0, skipna=True))
#        family_name = project_name + "." + i + ".SNV"
#        df_ = df_.append({'Family': family_name, 'Average Precision': mean[0], 'Average Recall': mean[2], 'Precision SD': sd[0], 'Recall SD': sd[2]}, ignore_index=True)
#        family_name = project_name + "." + i + ".INDEL"
#        df_ = df_.append({'Family': family_name, 'Average Precision': mean[1], 'Average Recall': mean[3], 'Precision SD': sd[1], 'Recall SD': sd[3]}, ignore_index=True)
#    df_ = df_.round(2)
#    df_.to_csv('precision.recall.txt', sep="\t", index=False)
#else:
#    pass