import json
import argparse
from functools import reduce

import pandas as pd

parser = argparse.ArgumentParser(
    description="Collect quality metrics from MultiQC and Sentieon outputs: raw FASTQ, "
                "BAM alignment and variant-calling (precision/recall) QC metrics.")
parser.add_argument('-quality', '--quality_yield', type=str, help='*.quality_yield.txt')
parser.add_argument('-depth', '--wgs_metrics', type=str, help='*deduped_WgsMetricsAlgo.txt')
parser.add_argument('-aln', '--aln_metrics', type=str, help='*_deduped_aln_metrics.txt')
parser.add_argument('-is', '--is_metrics', type=str, help='*_deduped_is_metrics.txt')
parser.add_argument('-fastqc', '--fastqc', type=str, help='multiqc_fastqc.txt')
parser.add_argument('-fastqscreen', '--fastqscreen', type=str, help='multiqc_fastq_screen.txt')
parser.add_argument('-hap', '--happy', type=str, help='multiqc_happy_data.json', required=True)
parser.add_argument('-project', '--project_name', type=str, help='project_name')
args = parser.parse_args()
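
# Two run modes (inferred from the branching below): when --quality_yield is supplied,
# the pre- and post-alignment QC tables are produced in addition to the hap.py benchmark;
# otherwise only the variant-calling QC table is written from the required --happy input.
# Output files written by this script: pre_alignment.txt, post_alignment.txt,
# variants.calling.qc.txt.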
if args.quality_yield:
    # Rename input:
    quality_yield_file = args.quality_yield
    wgs_metrics_file = args.wgs_metrics
    aln_metrics_file = args.aln_metrics
    is_metrics_file = args.is_metrics
    fastqc_file = args.fastqc
    fastqscreen_file = args.fastqscreen
    hap_file = args.happy
    project_name = args.project_name
    #############################################
    # fastqc
    fastqc = pd.read_table(fastqc_file)
    # fastqscreen
    dat = pd.read_table(fastqscreen_file)
    fastqscreen = dat.loc[:, dat.columns.str.endswith('percentage')].copy()
    dat['Sample'] = [i.replace('_screen', '') for i in dat['Sample']]
    fastqscreen.insert(loc=0, column='Sample', value=dat['Sample'])
    # pre-alignment: merge the FastQC and FastQ Screen tables on sample name
    pre_alignment_dat = pd.merge(fastqc, fastqscreen, how="outer", on=['Sample'])
    pre_alignment_dat['FastQC_mqc-generalstats-fastqc-total_sequences'] = pre_alignment_dat['FastQC_mqc-generalstats-fastqc-total_sequences'] / 1000000
    del pre_alignment_dat['FastQC_mqc-generalstats-fastqc-percent_fails']
    del pre_alignment_dat['FastQC_mqc-generalstats-fastqc-avg_sequence_length']
    del pre_alignment_dat['ERCC percentage']
    del pre_alignment_dat['Phix percentage']
    del pre_alignment_dat['Mouse percentage']
    pre_alignment_dat = pre_alignment_dat.round(2)
    pre_alignment_dat.columns = ['Sample', '%Dup', '%GC', 'Total Sequences (million)', '%Human', '%EColi', '%Adapter', '%Vector', '%rRNA', '%Virus', '%Yeast', '%Mitoch', '%No hits']
    pre_alignment_dat.to_csv('pre_alignment.txt', sep="\t", index=False)
    ############################
    # post-alignment metrics (Sentieon/Picard-style outputs)
    dat = pd.read_table(aln_metrics_file, index_col=False)
    dat['PCT_ALIGNED_READS'] = dat["PF_READS_ALIGNED"] / dat["TOTAL_READS"]
    aln_metrics = dat[["Sample", "PCT_ALIGNED_READS", "PF_MISMATCH_RATE"]].copy()
    # Convert fractions to percentages (numeric columns only, not 'Sample')
    aln_metrics[["PCT_ALIGNED_READS", "PF_MISMATCH_RATE"]] = aln_metrics[["PCT_ALIGNED_READS", "PF_MISMATCH_RATE"]] * 100
    aln_metrics['Sample'] = [x[-1] for x in aln_metrics['Sample'].str.split('/')]
    dat = pd.read_table(is_metrics_file, index_col=False)
    is_metrics = dat[['Sample', 'MEDIAN_INSERT_SIZE']].copy()
    is_metrics['Sample'] = [x[-1] for x in is_metrics['Sample'].str.split('/')]
    dat = pd.read_table(quality_yield_file, index_col=False)
    dat['%Q20'] = dat['Q20_BASES'] / dat['TOTAL_BASES']
    dat['%Q30'] = dat['Q30_BASES'] / dat['TOTAL_BASES']
    quality_yield = dat[['Sample', '%Q20', '%Q30']].copy()
    quality_yield[['%Q20', '%Q30']] = quality_yield[['%Q20', '%Q30']] * 100
    quality_yield['Sample'] = [x[-1] for x in quality_yield['Sample'].str.split('/')]
    dat = pd.read_table(wgs_metrics_file, index_col=False)
    wgs_metrics = dat[['Sample', 'MEDIAN_COVERAGE', 'PCT_1X', 'PCT_5X', 'PCT_10X', 'PCT_30X']].copy()
    wgs_metrics[['PCT_1X', 'PCT_5X', 'PCT_10X', 'PCT_30X']] = wgs_metrics[['PCT_1X', 'PCT_5X', 'PCT_10X', 'PCT_30X']] * 100
    wgs_metrics['Sample'] = [x[-1] for x in wgs_metrics['Sample'].str.split('/')]
    # Merge the four per-sample metric tables on 'Sample'
    data_frames = [aln_metrics, is_metrics, quality_yield, wgs_metrics]
    post_alignment_dat = reduce(lambda left, right: pd.merge(left, right, on=['Sample'], how='outer'), data_frames)
    post_alignment_dat.columns = ['Sample', '%Mapping', '%Mismatch Rate', 'Median Insert Size', '%Q20', '%Q30', 'Median Coverage', 'PCT_1X', 'PCT_5X', 'PCT_10X', 'PCT_30X']
    post_alignment_dat = post_alignment_dat.round(2)
    post_alignment_dat.to_csv('post_alignment.txt', sep="\t", index=False)
    #########################################
    # variant calling benchmark from hap.py (MultiQC JSON)
    with open(hap_file) as hap_json:
        happy = json.load(hap_json)
    dat = pd.DataFrame.from_records(happy)
    # Keep only the aggregate 'ALL' result columns; SNP and INDEL rows are split out below
    dat = dat.loc[:, dat.columns.str.endswith('ALL')]
    dat_transposed = dat.T
    dat_transposed = dat_transposed.loc[:, ['sample_id', 'QUERY.TOTAL', 'METRIC.Precision', 'METRIC.Recall']]
    indel = dat_transposed[['INDEL' in s for s in dat_transposed.index]]
    snv = dat_transposed[['SNP' in s for s in dat_transposed.index]]
    indel = indel.reset_index(drop=True)
    snv = snv.reset_index(drop=True)
    # Concatenating the SNV and INDEL frames side by side duplicates each column name;
    # selecting by label below therefore returns each metric as an (SNV, INDEL) pair of columns.
    benchmark = pd.concat([snv, indel], axis=1)
    benchmark = benchmark[["sample_id", 'QUERY.TOTAL', 'METRIC.Precision', 'METRIC.Recall']]
    benchmark.columns = ['Sample', 'sample_id', 'SNV number', 'INDEL number', 'SNV precision', 'INDEL precision', 'SNV recall', 'INDEL recall']
    benchmark = benchmark[['Sample', 'SNV number', 'INDEL number', 'SNV precision', 'INDEL precision', 'SNV recall', 'INDEL recall']]
    # Convert precision/recall fractions to percentages
    pct_cols = ['SNV precision', 'INDEL precision', 'SNV recall', 'INDEL recall']
    benchmark[pct_cols] = benchmark[pct_cols].astype(float) * 100
    benchmark = benchmark.round(2)
    benchmark.to_csv('variants.calling.qc.txt', sep="\t", index=False)
else:
    # Only the hap.py results are processed: repeat the variant-calling QC block above
    # and write variants.calling.qc.txt only.
    hap_file = args.happy
    with open(hap_file) as hap_json:
        happy = json.load(hap_json)
    dat = pd.DataFrame.from_records(happy)
    dat = dat.loc[:, dat.columns.str.endswith('ALL')]
    dat_transposed = dat.T
    dat_transposed = dat_transposed.loc[:, ['sample_id', 'QUERY.TOTAL', 'METRIC.Precision', 'METRIC.Recall']]
    indel = dat_transposed[['INDEL' in s for s in dat_transposed.index]]
    snv = dat_transposed[['SNP' in s for s in dat_transposed.index]]
    indel = indel.reset_index(drop=True)
    snv = snv.reset_index(drop=True)
    benchmark = pd.concat([snv, indel], axis=1)
    benchmark = benchmark[["sample_id", 'QUERY.TOTAL', 'METRIC.Precision', 'METRIC.Recall']]
    benchmark.columns = ['Sample', 'sample_id', 'SNV number', 'INDEL number', 'SNV precision', 'INDEL precision', 'SNV recall', 'INDEL recall']
    benchmark = benchmark[['Sample', 'SNV number', 'INDEL number', 'SNV precision', 'INDEL precision', 'SNV recall', 'INDEL recall']]
    pct_cols = ['SNV precision', 'INDEL precision', 'SNV recall', 'INDEL recall']
    benchmark[pct_cols] = benchmark[pct_cols].astype(float) * 100
    benchmark = benchmark.round(2)
    benchmark.to_csv('variants.calling.qc.txt', sep="\t", index=False)
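
# Example invocation (illustrative only; the script file name and the sample input
# paths are placeholders, not part of this repository):
#   python qc_metrics.py \
#       --quality_yield sample.quality_yield.txt \
#       --wgs_metrics sample_deduped_WgsMetricsAlgo.txt \
#       --aln_metrics sample_deduped_aln_metrics.txt \
#       --is_metrics sample_deduped_is_metrics.txt \
#       --fastqc multiqc_fastqc.txt \
#       --fastqscreen multiqc_fastq_screen.txt \
#       --happy multiqc_happy_data.json \
#       --project_name my_project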