You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

102 lines
4.3KB

  1. import json
  2. import pandas as pd
  3. import sys, argparse, os
  4. import statistics
  5. parser = argparse.ArgumentParser(description="This script is to summary information for pre-alignment QC")
  6. parser.add_argument('-general', '--general_stat', type=str, help='multiqc_general_stats.txt', required=True)
  7. parser.add_argument('-is', '--is_metrics', type=str, help='_is_metrics.txt', required=True)
  8. parser.add_argument('-wgsmetrics', '--WgsMetricsAlgo', type=str, help='deduped_WgsMetricsAlgo', required=True)
  9. parser.add_argument('-qualityyield', '--QualityYield', type=str, help='deduped_QualityYield', required=True)
  10. parser.add_argument('-aln', '--aln_metrics', type=str, help='aln_metrics.txt', required=True)
  11. args = parser.parse_args()
  12. general_file = args.general_stat
  13. is_file = args.is_metrics
  14. wgsmetrics_file = args.wgsmetrics
  15. qualityyield_file = args.qualityyield
  16. aln_file = args.aln_metrics
  17. ##### Table
  18. ## general stat: % GC
  19. dat = pd.read_table(general_file)
  20. qualimap = dat.loc[:, dat.columns.str.startswith('QualiMap')]
  21. qualimap.insert(loc=0, column='Sample', value=dat['Sample'])
  22. qualimap_stat = qualimap.dropna()
  23. part1 = fastqc_stat.loc[:,['Sample', 'FastQC_mqc-generalstats-fastqc-percent_duplicates','FastQC_mqc-generalstats-fastqc-total_sequences']]
  24. ## is_metrics: median insert size
  25. ## deduped_WgsMetricsAlgo: 1x, 5x, 10x, 30x, median coverage
  26. with open(html_file) as file:
  27. origDict = json.load(file)
  28. newdict = {(k1, k2):v2 for k1,v1 in origDict.items() \
  29. for k2,v2 in origDict[k1].items()}
  30. df = pd.DataFrame([newdict[i] for i in sorted(newdict)],
  31. index=pd.MultiIndex.from_tuples([i for i in sorted(newdict.keys())]))
  32. gc = []
  33. at = []
  34. for i in part1['Sample']:
  35. sub_df = df.loc[i,:]
  36. gc.append(statistics.mean(sub_df['g']/sub_df['c']))
  37. at.append(statistics.mean(sub_df['a']/sub_df['t']))
  38. ## fastq_screen
  39. dat = pd.read_table(fastqscreen_file)
  40. fastqscreen = dat.loc[:, dat.columns.str.endswith('percentage')]
  41. del fastqscreen['ERCC percentage']
  42. del fastqscreen['Phix percentage']
  43. ### merge all information
  44. part1.insert(loc=3, column='G/C ratio', value=gc)
  45. part1.insert(loc=4, column='A/T ratio', value=at)
  46. part1.reset_index(drop=True, inplace=True)
  47. fastqscreen.reset_index(drop=True, inplace=True)
  48. df = pd.concat([part1, fastqscreen], axis=1)
  49. df = df.append(df.mean(axis=0),ignore_index=True)
  50. df = df.fillna('Batch average value')
  51. df.columns = ['Sample','Total sequences (million)','% Dup','G/C ratio','A/T ratio','% Human','% EColi','% Adapter' , '% Vector','% rRNA' , '% Virus','% Yeast' ,'% Mitoch' ,'% No hits']
  52. df.to_csv('per-alignment_table_summary.txt',sep='\t',index=False)
  53. ##### Picture
  54. ## cumulative genome coverage
  55. with open(json_file) as file:
  56. all_dat = json.load(file)
  57. genome_coverage_json = all_dat['report_plot_data']['qualimap_genome_fraction']['datasets'][0]
  58. dat =pd.DataFrame.from_records(genome_coverage_json)
  59. genome_coverage = pd.DataFrame(index=pd.DataFrame(dat.loc[0,'data'])[0])
  60. for i in range(dat.shape[0]):
  61. one_sample = pd.DataFrame(dat.loc[i,'data'])
  62. one_sample.index = one_sample[0]
  63. genome_coverage[dat.loc[i,'name']] = one_sample[1]
  64. genome_coverage = genome_coverage.transpose()
  65. genome_coverage['Sample'] = genome_coverage.index
  66. genome_coverage.to_csv('post-alignment_genome_coverage.txt',sep='\t',index=False)
  67. ## insert size histogram
  68. insert_size_json = all_dat['report_plot_data']['qualimap_insert_size']['datasets'][0]
  69. dat =pd.DataFrame.from_records(insert_size_json)
  70. insert_size = pd.DataFrame(index=pd.DataFrame(dat.loc[0,'data'])[0])
  71. for i in range(dat.shape[0]):
  72. one_sample = pd.DataFrame(dat.loc[i,'data'])
  73. one_sample.index = one_sample[0]
  74. insert_size[dat.loc[i,'name']] = one_sample[1]
  75. insert_size = insert_size.transpose()
  76. insert_size['Sample'] = insert_size.index
  77. insert_size.to_csv('post-alignment_insert_size.txt',sep='\t',index=False)
  78. ## GC content distribution
  79. gc_content_json = all_dat['report_plot_data']['qualimap_gc_content']['datasets'][0]
  80. dat =pd.DataFrame.from_records(gc_content_json)
  81. gc_content = pd.DataFrame(index=pd.DataFrame(dat.loc[0,'data'])[0])
  82. for i in range(dat.shape[0]):
  83. one_sample = pd.DataFrame(dat.loc[i,'data'])
  84. one_sample.index = one_sample[0]
  85. gc_content[dat.loc[i,'name']] = one_sample[1]
  86. gc_content = gc_content.transpose()
  87. gc_content['Sample'] = gc_content.index
  88. gc_content.to_csv('post-alignment_gc_content.txt',sep='\t',index=False)