You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

133 lines
5.2KB

  1. import json
  2. import pandas as pd
  3. import sys, argparse, os
  4. import statistics
  5. parser = argparse.ArgumentParser(description="This script is to summary information for pre-alignment QC")
  6. parser.add_argument('-general', '--general_stat', type=str, help='multiqc_general_stats.txt', required=True)
  7. parser.add_argument('-html', '--html', type=str, help='multiqc_report.html', required=True)
  8. parser.add_argument('-fastqscreen', '--fastqscreen', type=str, help='multiqc_fastq_screen.txt', required=True)
  9. parser.add_argument('-json', '--json', type=str, help='multiqc_happy_data.json', required=True)
  10. args = parser.parse_args()
  11. general_file = args.general_stat
  12. html_file = args.html
  13. fastqscreen_file = args.fastqscreen
  14. json_file = args.json
  15. ##### Table
  16. ## general stat: 1. Total sequences; 2. %Dup
  17. dat = pd.read_table(general_file)
  18. fastqc = dat.loc[:, dat.columns.str.startswith('FastQC')]
  19. fastqc.insert(loc=0, column='Sample', value=dat['Sample'])
  20. fastqc_stat = fastqc.dropna()
  21. part1 = fastqc_stat.loc[:,['Sample', 'FastQC_mqc-generalstats-fastqc-percent_duplicates','FastQC_mqc-generalstats-fastqc-total_sequences']]
  22. ## report html: 1. G/C ratio; 2. A/T ratio
  23. ## cat multiqc_report.html | grep 'fastqc_seq_content_data = ' | sed s'/fastqc_seq_content_data\ =\ //g' | sed 's/^[ \t]*//g' | sed s'/;//g' > fastqc_sequence_content.json
  24. with open(html_file) as file:
  25. origDict = json.load(file)
  26. newdict = {(k1, k2):v2 for k1,v1 in origDict.items() \
  27. for k2,v2 in origDict[k1].items()}
  28. df = pd.DataFrame([newdict[i] for i in sorted(newdict)],
  29. index=pd.MultiIndex.from_tuples([i for i in sorted(newdict.keys())]))
  30. gc = []
  31. at = []
  32. for i in part1['Sample']:
  33. sub_df = df.loc[i,:]
  34. gc.append(statistics.mean(sub_df['g']/sub_df['c']))
  35. at.append(statistics.mean(sub_df['a']/sub_df['t']))
  36. ## fastq_screen
  37. dat = pd.read_table(fastqscreen_file)
  38. fastqscreen = dat.loc[:, dat.columns.str.endswith('percentage')]
  39. del fastqscreen['ERCC percentage']
  40. del fastqscreen['Phix percentage']
  41. ### merge all information
  42. part1.insert(loc=3, column='G/C ratio', value=gc)
  43. part1.insert(loc=4, column='A/T ratio', value=at)
  44. part1.reset_index(drop=True, inplace=True)
  45. fastqscreen.reset_index(drop=True, inplace=True)
  46. df = pd.concat([part1, fastqscreen], axis=1)
  47. df = df.append(df.mean(axis=0),ignore_index=True)
  48. df = df.fillna('Batch average value')
  49. df.columns = ['Sample','Total sequences (million)','% Dup','G/C ratio','A/T ratio','% Human','% EColi','% Adapter' , '% Vector','% rRNA' , '% Virus','% Yeast' ,'% Mitoch' ,'% No hits']
  50. df.to_csv('per-alignment_table_summary.txt',sep='\t',index=False)
  51. ##### Picture
  52. ## mean quality scores
  53. with open(json_file) as file:
  54. all_dat = json.load(file)
  55. mean_quality_json = all_dat['report_plot_data']['fastqc_per_base_sequence_quality_plot']['datasets'][0]
  56. dat =pd.DataFrame.from_records(mean_quality_json)
  57. mean_quality = pd.DataFrame(index=pd.DataFrame(dat.loc[0,'data'])[0])
  58. for i in range(dat.shape[0]):
  59. one_sample = pd.DataFrame(dat.loc[i,'data'])
  60. one_sample.index = one_sample[0]
  61. mean_quality[dat.loc[i,'name']] = one_sample[1]
  62. mean_quality = mean_quality.transpose()
  63. mean_quality['Sample'] = mean_quality.index
  64. mean_quality.to_csv('pre-alignment_mean_quality.txt',sep='\t',index=False)
  65. ## per sequence GC content
  66. gc_content_json = all_dat['report_plot_data']['fastqc_per_sequence_gc_content_plot']['datasets'][0]
  67. dat =pd.DataFrame.from_records(gc_content_json)
  68. gc_content = pd.DataFrame(index=pd.DataFrame(dat.loc[0,'data'])[0])
  69. for i in range(dat.shape[0]):
  70. one_sample = pd.DataFrame(dat.loc[i,'data'])
  71. one_sample.index = one_sample[0]
  72. gc_content[dat.loc[i,'name']] = one_sample[1]
  73. gc_content = gc_content.transpose()
  74. gc_content['Sample'] = gc_content.index
  75. gc_content.to_csv('pre-alignment_gc_content.txt',sep='\t',index=False)
  76. # fastqc and qualimap
  77. dat = pd.read_table(fastqc_qualimap_file)
  78. fastqc = dat.loc[:, dat.columns.str.startswith('FastQC')]
  79. fastqc.insert(loc=0, column='Sample', value=dat['Sample'])
  80. fastqc_stat = fastqc.dropna()
  81. # qulimap
  82. qualimap = dat.loc[:, dat.columns.str.startswith('QualiMap')]
  83. qualimap.insert(loc=0, column='Sample', value=dat['Sample'])
  84. qualimap_stat = qualimap.dropna()
  85. # fastqc
  86. dat = pd.read_table(fastqc_file)
  87. fastqc_module = dat.loc[:, "per_base_sequence_quality":"kmer_content"]
  88. fastqc_module.insert(loc=0, column='Sample', value=dat['Sample'])
  89. fastqc_all = pd.merge(fastqc_stat,fastqc_module, how='outer', left_on=['Sample'], right_on = ['Sample'])
  90. # fastqscreen
  91. dat = pd.read_table(fastqscreen_file)
  92. fastqscreen = dat.loc[:, dat.columns.str.endswith('percentage')]
  93. dat['Sample'] = [i.replace('_screen','') for i in dat['Sample']]
  94. fastqscreen.insert(loc=0, column='Sample', value=dat['Sample'])
  95. # benchmark
  96. with open(hap_file) as hap_json:
  97. happy = json.load(hap_json)
  98. dat =pd.DataFrame.from_records(happy)
  99. dat = dat.loc[:, dat.columns.str.endswith('ALL')]
  100. dat_transposed = dat.T
  101. benchmark = dat_transposed.loc[:,['sample_id','METRIC.Precision','METRIC.Recall']]
  102. benchmark['sample_id'] = benchmark.index
  103. benchmark.columns = ['Sample','Precision','Recall']
  104. #output
  105. fastqc_all.to_csv('fastqc.final.result.txt',sep="\t",index=0)
  106. fastqscreen.to_csv('fastqscreen.final.result.txt',sep="\t",index=0)
  107. qualimap_stat.to_csv('qualimap.final.result.txt',sep="\t",index=0)
  108. benchmark.to_csv('benchmark.final.result.txt',sep="\t",index=0)