|
|
@@ -1,5 +1,6 @@ |
|
|
|
import json |
|
|
|
import pandas as pd |
|
|
|
from functools import reduce |
|
|
|
import sys, argparse, os |
|
|
|
|
|
|
|
# Command-line interface: takes MultiQC/Sentieon metric files plus a project
# name, and emits raw-fastq, bam and variant-calling (precision/recall) QC tables.
parser = argparse.ArgumentParser(description="This script is to get information from multiqc and sentieon, output the raw fastq, bam and variants calling (precision and recall) quality metrics")
|
|
@@ -14,6 +15,8 @@ parser.add_argument('-fastqc', '--fastqc', type=str, help='multiqc_fastqc.txt', |
|
|
|
# Required inputs, declared data-driven: (short flag, long flag, help text).
# All are plain string paths/names and all are mandatory.
for _short, _long, _help in (
    ('-fastqscreen', '--fastqscreen', 'multiqc_fastq_screen.txt'),
    ('-hap', '--happy', 'multiqc_happy_data.json'),
    ('-project', '--project_name', 'project_name'),
):
    parser.add_argument(_short, _long, type=str, help=_help, required=True)

args = parser.parse_args()
|
|
|
|
|
|
|
# Rename input: |
|
|
@@ -26,6 +29,8 @@ fastqc_file = args.fastqc |
|
|
|
# Bind the parsed CLI arguments to the module-level names used below
# (and by later sections of this script).
fastqscreen_file = args.fastqscreen
hap_file = args.happy

project_name = args.project_name

#############################################
# fastqc
# Raw-read QC table exported by MultiQC (multiqc_fastqc.txt), tab-separated.
fastqc = pd.read_table(fastqc_file)
|
|
@@ -67,45 +72,81 @@ pre_alignment_dat.to_csv('pre_alignment.txt',sep="\t",index=0) |
|
|
|
|
|
|
|
|
|
|
|
############################
# Post-alignment QC: read four Picard/Sentieon-style metric tables, scale
# fractions to percentages, merge them per sample, and write post_alignment.txt.
# (The diff-residue duplicate read_table calls were removed: each file is
# read exactly once, with index_col=False as in the final version.)

# Alignment metrics: fraction of PF reads aligned and the mismatch rate.
dat = pd.read_table(aln_metrics_file, index_col=False)
dat['PCT_ALIGNED_READS'] = dat["PF_READS_ALIGNED"] / dat["TOTAL_READS"]
aln_metrics = dat[["Sample", "PCT_ALIGNED_READS", "PF_MISMATCH_RATE"]].copy()
# Scale only the numeric columns: multiplying the whole frame by 100 (as the
# original did) string-repeats the 'Sample' column 100x, which corrupts any
# sample name that does not contain '/'.
aln_metrics[["PCT_ALIGNED_READS", "PF_MISMATCH_RATE"]] *= 100
# Sample values look like path-like strings; keep only the last '/' component.
aln_metrics['Sample'] = aln_metrics['Sample'].str.split('/').str[-1]

# Insert-size metrics.
dat = pd.read_table(is_metrics_file, index_col=False)
is_metrics = dat[['Sample', 'MEDIAN_INSERT_SIZE']].copy()
is_metrics['Sample'] = is_metrics['Sample'].str.split('/').str[-1]

# Base-quality yield: fraction of Q20/Q30 bases, expressed as percentages.
dat = pd.read_table(quality_yield_file, index_col=False)
dat['%Q20'] = dat['Q20_BASES'] / dat['TOTAL_BASES']
dat['%Q30'] = dat['Q30_BASES'] / dat['TOTAL_BASES']
quality_yield = dat[['Sample', '%Q20', '%Q30']].copy()
quality_yield[['%Q20', '%Q30']] *= 100
quality_yield['Sample'] = quality_yield['Sample'].str.split('/').str[-1]

# WGS coverage metrics: median coverage plus PCT_*X breadth as percentages.
dat = pd.read_table(wgs_metrics_file, index_col=False)
wgs_metrics = dat[['Sample', 'MEDIAN_COVERAGE', 'PCT_1X', 'PCT_5X', 'PCT_10X', 'PCT_30X']].copy()
for _col in ('PCT_1X', 'PCT_5X', 'PCT_10X', 'PCT_30X'):
    wgs_metrics[_col] = wgs_metrics[_col] * 100
wgs_metrics['Sample'] = wgs_metrics['Sample'].str.split('/').str[-1]

# Outer-merge all four tables on the sample name so a sample missing from
# one table still appears (with NaNs) in the combined output.
data_frames = [aln_metrics, is_metrics, quality_yield, wgs_metrics]
post_alignment_dat = reduce(lambda left, right: pd.merge(left, right, on=['Sample'], how='outer'), data_frames)
# NOTE(review): 'Mendelian Insert Size' is almost certainly a typo for
# 'Median Insert Size' (it holds MEDIAN_INSERT_SIZE) — kept verbatim because
# downstream consumers may match on this header; confirm before renaming.
post_alignment_dat.columns = ['Sample', '%Mapping', '%Mismatch Rate', 'Mendelian Insert Size', '%Q20', '%Q30', 'Median Coverage', 'PCT_1X', 'PCT_5X', 'PCT_10X', 'PCT_30X']
post_alignment_dat = post_alignment_dat.round(2)
post_alignment_dat.to_csv('post_alignment.txt', sep="\t", index=0)
|
|
|
|
|
|
|
|
|
|
|
# benchmark
#########################################
# variants calling
# Load the hap.py benchmarking results exported by MultiQC as JSON records.
with open(hap_file) as hap_json:
    happy = json.load(hap_json)

frame = pd.DataFrame.from_records(happy)
# Keep only the aggregate "*ALL" columns (all regions, per variant type),
# then flip so each row is one hap.py result set.
dat = frame.loc[:, frame.columns.str.endswith('ALL')]
dat_transposed = dat.T

# Simple per-row precision/recall summary for the benchmark output file.
benchmark = dat_transposed.loc[:, ['sample_id', 'METRIC.Precision', 'METRIC.Recall']]
benchmark.columns = ['Sample', 'Precision', 'Recall']

#output
# Write the final per-section result tables (tab-separated, no index).
for _frame, _out in (
    (fastqc_all, 'fastqc.final.result.txt'),
    (fastqscreen, 'fastqscreen.final.result.txt'),
    (qualimap_stat, 'qualimap.final.result.txt'),
    (benchmark, 'benchmark.final.result.txt'),
):
    _frame.to_csv(_out, sep="\t", index=0)
|
|
|
|
|
|
|
# Detailed variant-calling QC: one row per sample with SNV and INDEL query
# counts, precision and recall (as percentages), written to
# variants.calling.qc.txt.
dat_transposed = dat_transposed.loc[:, ['sample_id', 'QUERY.TOTAL', 'METRIC.Precision', 'METRIC.Recall']]

# hap.py row labels contain the variant type (e.g. "...INDEL...", "...SNP...");
# split the table into the two types.
indel = dat_transposed[['INDEL' in s for s in dat_transposed.index]].copy()
snv = dat_transposed[['SNP' in s for s in dat_transposed.index]].copy()
indel.reset_index(drop=True, inplace=True)
snv.reset_index(drop=True, inplace=True)

# Rename per-type columns BEFORE concatenating. The original concatenated
# first and then selected duplicate labels ("sample_id" picking up both
# copies), which depends on pandas' duplicate-label ordering; this explicit
# form produces the identical table unambiguously.
snv.columns = ['Sample', 'SNV number', 'SNV precision', 'SNV recall']
indel.columns = ['sample_id', 'INDEL number', 'INDEL precision', 'INDEL recall']
benchmark = pd.concat([snv, indel], axis=1)
# Drop the redundant INDEL copy of the sample id and fix the column order.
benchmark = benchmark[['Sample', 'SNV number', 'INDEL number', 'SNV precision', 'INDEL precision', 'SNV recall', 'INDEL recall']]

# The astype(float) in the original implies hap.py serializes these metrics
# as strings; convert and express as percentages in one pass.
for _col in ('SNV precision', 'INDEL precision', 'SNV recall', 'INDEL recall'):
    benchmark[_col] = benchmark[_col].astype(float) * 100

benchmark = benchmark.round(2)
benchmark.to_csv('variants.calling.qc.txt', sep="\t", index=0)
|
|
|
|
|
|
|
# Per-replicate ("family") aggregation: the 7th underscore-separated field of
# the sample name identifies the replicate; average precision/recall and their
# SDs are reported per family for SNV and INDEL separately.
# NOTE(review): assumes every sample name has at least 7 '_'-separated fields.
all_rep = [x.split('_')[6] for x in benchmark['Sample']]
rep = list(set(all_rep))
# 'Precison SD' (sic) kept verbatim: it is an output column header that
# downstream consumers may already match on.
columns = ['Family', 'Average Precision', 'Average Recall', 'Precison SD', 'Recall SD']
metric_cols = ['SNV precision', 'INDEL precision', 'SNV recall', 'INDEL recall']

rows = []
for i in rep:
    token = "_" + i + "_"
    # BUG FIX: the original built `string` but then filtered on the
    # hard-coded '_1_', so every family aggregated replicate 1's rows.
    sub_dat = benchmark[benchmark['Sample'].str.contains(token, regex=False)]
    # Restrict to the four float metric columns so mean/std are well defined
    # regardless of the dtypes of the count columns (pandas 2.x raises on
    # mixed-dtype frame-wide reductions).
    mean = sub_dat[metric_cols].mean(axis=0, skipna=True)
    sd = sub_dat[metric_cols].std(axis=0, skipna=True)
    rows.append({'Family': project_name + "." + i + ".SNV",
                 'Average Precision': mean['SNV precision'],
                 'Average Recall': mean['SNV recall'],
                 'Precison SD': sd['SNV precision'],
                 'Recall SD': sd['SNV recall']})
    rows.append({'Family': project_name + "." + i + ".INDEL",
                 'Average Precision': mean['INDEL precision'],
                 'Average Recall': mean['INDEL recall'],
                 'Precison SD': sd['INDEL precision'],
                 'Recall SD': sd['INDEL recall']})

# DataFrame.append was removed in pandas 2.0; build the frame in one shot.
df_ = pd.DataFrame(rows, columns=columns)
df_ = df_.round(2)
df_.to_csv('precision.recall.txt', sep="\t", index=0)
|
|
|
|
|
|
|
|
|
|
|
|