您最多选择25个主题 主题必须以字母或数字开头,可以包含连字符 (-),并且长度不得超过35个字符

extract_tables.py 7.5KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137
  1. import json
  2. import pandas as pd
  3. from functools import reduce
  4. import sys, argparse, os
  5. parser = argparse.ArgumentParser(description="This script is to get information from multiqc and sentieon, output the raw fastq, bam and variants calling (precision and recall) quality metrics")
  6. parser.add_argument('-quality', '--quality_yield', type=str, help='*.quality_yield.txt')
  7. parser.add_argument('-depth', '--wgs_metrics', type=str, help='*deduped_WgsMetricsAlgo.txt')
  8. parser.add_argument('-aln', '--aln_metrics', type=str, help='*_deduped_aln_metrics.txt')
  9. parser.add_argument('-is', '--is_metrics', type=str, help='*_deduped_is_metrics.txt')
  10. parser.add_argument('-hs', '--hs_metrics', type=str, help='*_deduped_hs_metrics.txt')
  11. parser.add_argument('-fastqc', '--fastqc', type=str, help='multiqc_fastqc.txt')
  12. parser.add_argument('-fastqscreen', '--fastqscreen', type=str, help='multiqc_fastq_screen.txt')
  13. parser.add_argument('-hap', '--happy', type=str, help='multiqc_happy_data.json', required=True)
  14. parser.add_argument('-project', '--project_name', type=str, help='project_name')
  15. args = parser.parse_args()
  16. if args.quality_yield:
  17. # Rename input:
  18. quality_yield_file = args.quality_yield
  19. wgs_metrics_file = args.wgs_metrics
  20. aln_metrics_file = args.aln_metrics
  21. is_metrics_file = args.is_metrics
  22. fastqc_file = args.fastqc
  23. fastqscreen_file = args.fastqscreen
  24. hap_file = args.happy
  25. project_name = args.project_name
  26. #############################################
  27. # fastqc
  28. fastqc = pd.read_table(fastqc_file)
  29. # fastqscreen
  30. dat = pd.read_table(fastqscreen_file)
  31. fastqscreen = dat.loc[:, dat.columns.str.endswith('percentage')]
  32. dat['Sample'] = [i.replace('_screen','') for i in dat['Sample']]
  33. fastqscreen.insert(loc=0, column='Sample', value=dat['Sample'])
  34. # pre-alignment
  35. pre_alignment_dat = pd.merge(fastqc,fastqscreen,how="outer",left_on=['Sample'],right_on=['Sample'])
  36. pre_alignment_dat['FastQC_mqc-generalstats-fastqc-total_sequences'] = pre_alignment_dat['FastQC_mqc-generalstats-fastqc-total_sequences']/1000000
  37. del pre_alignment_dat['FastQC_mqc-generalstats-fastqc-percent_fails']
  38. del pre_alignment_dat['FastQC_mqc-generalstats-fastqc-avg_sequence_length']
  39. del pre_alignment_dat['ERCC percentage']
  40. del pre_alignment_dat['Phix percentage']
  41. del pre_alignment_dat['Mouse percentage']
  42. pre_alignment_dat = pre_alignment_dat.round(2)
  43. pre_alignment_dat.columns = ['Sample','%Dup','%GC','Total Sequences (million)','%Human','%EColi','%Adapter','%Vector','%rRNA','%Virus','%Yeast','%Mitoch','%No hits']
  44. pre_alignment_dat.to_csv('pre_alignment.txt',sep="\t",index=0)
  45. ############################
  46. dat = pd.read_table(aln_metrics_file,index_col=False)
  47. dat['PCT_ALIGNED_READS'] = dat["PF_READS_ALIGNED"]/dat["TOTAL_READS"]
  48. aln_metrics = dat[["Sample", "PCT_ALIGNED_READS","PF_MISMATCH_RATE"]]
  49. aln_metrics = aln_metrics * 100
  50. aln_metrics['Sample'] = [x[-1] for x in aln_metrics['Sample'].str.split('/')]
  51. dat = pd.read_table(is_metrics_file,index_col=False)
  52. is_metrics = dat[['Sample', 'MEDIAN_INSERT_SIZE']]
  53. is_metrics['Sample'] = [x[-1] for x in is_metrics['Sample'].str.split('/')]
  54. dat = pd.read_table(quality_yield_file,index_col=False)
  55. dat['%Q20'] = dat['Q20_BASES']/dat['TOTAL_BASES']
  56. dat['%Q30'] = dat['Q30_BASES']/dat['TOTAL_BASES']
  57. quality_yield = dat[['Sample','%Q20','%Q30']]
  58. quality_yield = quality_yield * 100
  59. quality_yield['Sample'] = [x[-1] for x in quality_yield['Sample'].str.split('/')]
  60. dat = pd.read_table(wgs_metrics_file,index_col=False)
  61. wgs_metrics = dat[['Sample','MEDIAN_COVERAGE','PCT_1X', 'PCT_5X', 'PCT_10X','PCT_20X','PCT_30X']]
  62. wgs_metrics['PCT_1X'] = wgs_metrics['PCT_1X'] * 100
  63. wgs_metrics['PCT_5X'] = wgs_metrics['PCT_5X'] * 100
  64. wgs_metrics['PCT_10X'] = wgs_metrics['PCT_10X'] * 100
  65. wgs_metrics['PCT_20X'] = wgs_metrics['PCT_20X'] * 100
  66. wgs_metrics['PCT_30X'] = wgs_metrics['PCT_30X'] * 100
  67. wgs_metrics['Sample'] = [x[-1] for x in wgs_metrics['Sample'].str.split('/')]
  68. hs_metrics = dat[['Sample','FOLD_80_BASE_PENALTY','PCT_USABLE_BASES_ON_TARGET']]
  69. data_frames = [aln_metrics, is_metrics, quality_yield, wgs_metrics, hs_metrics]
  70. post_alignment_dat = reduce(lambda left,right: pd.merge(left,right,on=['Sample'],how='outer'), data_frames)
  71. post_alignment_dat.columns = ['Sample', '%Mapping', '%Mismatch Rate', 'Mendelian Insert Size','%Q20', '%Q30', 'Median Coverage', 'PCT_1X', 'PCT_5X', 'PCT_10X','PCT_20X','PCT_30X','Fold-80','On target bases rate']
  72. post_alignment_dat = post_alignment_dat.round(2)
  73. post_alignment_dat.to_csv('post_alignment.txt',sep="\t",index=0)
  74. #########################################
  75. # variants calling
  76. with open(hap_file) as hap_json:
  77. happy = json.load(hap_json)
  78. dat =pd.DataFrame.from_records(happy)
  79. dat = dat.loc[:, dat.columns.str.endswith('ALL')]
  80. dat_transposed = dat.T
  81. dat_transposed = dat_transposed.loc[:,['sample_id','QUERY.TOTAL','METRIC.Precision','METRIC.Recall']]
  82. indel = dat_transposed[['INDEL' in s for s in dat_transposed.index]]
  83. snv = dat_transposed[['SNP' in s for s in dat_transposed.index]]
  84. indel.reset_index(drop=True, inplace=True)
  85. snv.reset_index(drop=True, inplace=True)
  86. benchmark = pd.concat([snv, indel], axis=1)
  87. benchmark = benchmark[["sample_id", 'QUERY.TOTAL', 'METRIC.Precision', 'METRIC.Recall']]
  88. benchmark.columns = ['Sample','sample_id','SNV number','INDEL number','SNV precision','INDEL precision','SNV recall','INDEL recall']
  89. benchmark = benchmark[['Sample','SNV number','INDEL number','SNV precision','INDEL precision','SNV recall','INDEL recall']]
  90. benchmark['SNV precision'] = benchmark['SNV precision'].astype(float)
  91. benchmark['INDEL precision'] = benchmark['INDEL precision'].astype(float)
  92. benchmark['SNV recall'] = benchmark['SNV recall'].astype(float)
  93. benchmark['INDEL recall'] = benchmark['INDEL recall'].astype(float)
  94. benchmark['SNV precision'] = benchmark['SNV precision'] * 100
  95. benchmark['INDEL precision'] = benchmark['INDEL precision'] * 100
  96. benchmark['SNV recall'] = benchmark['SNV recall'] * 100
  97. benchmark['INDEL recall'] = benchmark['INDEL recall']* 100
  98. benchmark = benchmark.round(2)
  99. benchmark.to_csv('variants.calling.qc.txt',sep="\t",index=0)
  100. else:
  101. hap_file = args.happy
  102. with open(hap_file) as hap_json:
  103. happy = json.load(hap_json)
  104. dat =pd.DataFrame.from_records(happy)
  105. dat = dat.loc[:, dat.columns.str.endswith('ALL')]
  106. dat_transposed = dat.T
  107. dat_transposed = dat_transposed.loc[:,['sample_id','QUERY.TOTAL','METRIC.Precision','METRIC.Recall']]
  108. indel = dat_transposed[['INDEL' in s for s in dat_transposed.index]]
  109. snv = dat_transposed[['SNP' in s for s in dat_transposed.index]]
  110. indel.reset_index(drop=True, inplace=True)
  111. snv.reset_index(drop=True, inplace=True)
  112. benchmark = pd.concat([snv, indel], axis=1)
  113. benchmark = benchmark[["sample_id", 'QUERY.TOTAL', 'METRIC.Precision', 'METRIC.Recall']]
  114. benchmark.columns = ['Sample','sample_id','SNV number','INDEL number','SNV precision','INDEL precision','SNV recall','INDEL recall']
  115. benchmark = benchmark[['Sample','SNV number','INDEL number','SNV precision','INDEL precision','SNV recall','INDEL recall']]
  116. benchmark['SNV precision'] = benchmark['SNV precision'].astype(float)
  117. benchmark['INDEL precision'] = benchmark['INDEL precision'].astype(float)
  118. benchmark['SNV recall'] = benchmark['SNV recall'].astype(float)
  119. benchmark['INDEL recall'] = benchmark['INDEL recall'].astype(float)
  120. benchmark['SNV precision'] = benchmark['SNV precision'] * 100
  121. benchmark['INDEL precision'] = benchmark['INDEL precision'] * 100
  122. benchmark['SNV recall'] = benchmark['SNV recall'] * 100
  123. benchmark['INDEL recall'] = benchmark['INDEL recall']* 100
  124. benchmark = benchmark.round(2)
  125. benchmark.to_csv('variants.calling.qc.txt',sep="\t",index=0)