import json
import argparse
from functools import reduce

import pandas as pd

parser = argparse.ArgumentParser(description="Extract quality metrics from MultiQC and Sentieon outputs: raw FASTQ, BAM, and variant-calling (precision and recall) metrics.")
parser.add_argument('-quality', '--quality_yield', type=str, help='*.quality_yield.txt', required=True)
parser.add_argument('-depth', '--wgs_metrics', type=str, help='*deduped_WgsMetricsAlgo.txt', required=True)
parser.add_argument('-aln', '--aln_metrics', type=str, help='*_deduped_aln_metrics.txt', required=True)
parser.add_argument('-is', '--is_metrics', type=str, help='*_deduped_is_metrics.txt', required=True)
parser.add_argument('-fastqc', '--fastqc', type=str, help='multiqc_fastqc.txt', required=True)
parser.add_argument('-fastqscreen', '--fastqscreen', type=str, help='multiqc_fastq_screen.txt', required=True)
parser.add_argument('-hap', '--happy', type=str, help='multiqc_happy_data.json', required=True)
args = parser.parse_args()
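# Example invocation (file names are illustrative, not prescribed by the pipeline):
# python extract_multiqc.py -quality project.quality_yield.txt -depth project.wgs_metrics_data.txt \
#     -aln project.aln_metrics.txt -is project.is_metrics.txt \
#     -fastqc multiqc_fastqc.txt -fastqscreen multiqc_fastq_screen.txt -hap multiqc_happy_data.json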
# Rename input:
quality_yield_file = args.quality_yield
wgs_metrics_file = args.wgs_metrics
aln_metrics_file = args.aln_metrics
is_metrics_file = args.is_metrics
fastqc_file = args.fastqc
fastqscreen_file = args.fastqscreen
hap_file = args.happy
#############################################
# fastqc
fastqc = pd.read_table(fastqc_file)
# fastqscreen
dat = pd.read_table(fastqscreen_file)
dat['Sample'] = [i.replace('_screen', '') for i in dat['Sample']]
fastqscreen = dat.loc[:, dat.columns.str.endswith('percentage')].copy()
fastqscreen.insert(loc=0, column='Sample', value=dat['Sample'])
# pre-alignment: merge the FastQC and FastQ Screen tables on the shared Sample key
pre_alignment_dat = pd.merge(fastqc, fastqscreen, how="outer", on=['Sample'])
# report total sequences in millions
pre_alignment_dat['FastQC_mqc-generalstats-fastqc-total_sequences'] = pre_alignment_dat['FastQC_mqc-generalstats-fastqc-total_sequences'] / 1000000
pre_alignment_dat = pre_alignment_dat.drop(columns=[
    'FastQC_mqc-generalstats-fastqc-percent_fails',
    'FastQC_mqc-generalstats-fastqc-avg_sequence_length',
    'ERCC percentage',
    'Phix percentage',
    'Mouse percentage',
])
pre_alignment_dat = pre_alignment_dat.round(2)
pre_alignment_dat.columns = ['Sample', '%Dup', '%GC', 'Total Sequences (million)', '%Human', '%EColi', '%Adapter', '%Vector', '%rRNA', '%Virus', '%Yeast', '%Mitoch', '%No hits']
pre_alignment_dat.to_csv('pre_alignment.txt', sep="\t", index=False)
############################
# post-alignment
dat = pd.read_table(aln_metrics_file)
dat['PCT_ALIGNED_READS'] = dat["PF_READS_ALIGNED"] / dat["TOTAL_READS"]
aln_metrics = dat[["Sample", "PCT_ALIGNED_READS", "PF_MISMATCH_RATE"]].copy()
# convert fractions to percentages; leave the Sample column untouched
aln_metrics[["PCT_ALIGNED_READS", "PF_MISMATCH_RATE"]] *= 100
dat = pd.read_table(is_metrics_file)
is_metrics = dat[['Sample', 'MEDIAN_INSERT_SIZE']]
dat = pd.read_table(quality_yield_file)
dat['%Q20'] = dat['Q20_BASES'] / dat['TOTAL_BASES']
dat['%Q30'] = dat['Q30_BASES'] / dat['TOTAL_BASES']
quality_yield = dat[['Sample', '%Q20', '%Q30']].copy()
quality_yield[['%Q20', '%Q30']] *= 100
dat = pd.read_table(wgs_metrics_file)
wgs_metrics = dat[['Sample', 'MEDIAN_COVERAGE', 'PCT_1X', 'PCT_5X', 'PCT_10X', 'PCT_30X']].copy()
wgs_metrics[['PCT_1X', 'PCT_5X', 'PCT_10X', 'PCT_30X']] *= 100
data_frames = [aln_metrics, is_metrics, quality_yield, wgs_metrics]
post_alignment_dat = reduce(lambda left, right: pd.merge(left, right, on=['Sample'], how='outer'), data_frames)
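# reduce chains the outer merges pairwise, i.e.
# merge(merge(merge(aln_metrics, is_metrics), quality_yield), wgs_metrics),
# so every metric table ends up keyed on the shared Sample column.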
# benchmark
with open(hap_file) as hap_json:
    happy = json.load(hap_json)
dat = pd.DataFrame.from_records(happy)
dat = dat.loc[:, dat.columns.str.endswith('ALL')]
dat_transposed = dat.T
benchmark = dat_transposed.loc[:, ['sample_id', 'METRIC.Precision', 'METRIC.Recall']]
benchmark.columns = ['Sample', 'Precision', 'Recall']
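# Note: in MultiQC's hap.py table, column names ending in 'ALL' aggregate all
# calls for a sample, while 'PASS' columns are restricted to PASS-filtered
# calls; only the ALL columns are kept above.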
# output
post_alignment_dat.to_csv('post_alignment.txt', sep="\t", index=False)
fastqscreen.to_csv('fastqscreen.final.result.txt', sep="\t", index=False)
benchmark.to_csv('benchmark.final.result.txt', sep="\t", index=False)
task extract_tables {
    File quality_yield_summary
    File wgs_metrics_summary
    File aln_metrics_summary
    File is_metrics_summary
    File fastqc
    File fastqscreen
    File hap
    String docker
    String cluster_config
    String disk_size
    command <<<
        python /opt/extract_multiqc.py \
            -quality ${quality_yield_summary} \
            -depth ${wgs_metrics_summary} \
            -aln ${aln_metrics_summary} \
            -is ${is_metrics_summary} \
            -fastqc ${fastqc} \
            -fastqscreen ${fastqscreen} \
            -hap ${hap}
    >>>
    runtime {
        docker: docker
        cluster: cluster_config
        systemDisk: "cloud_ssd 40"
        dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
    }
    output {
        File pre_alignment = "pre_alignment.txt"
        File post_alignment = "post_alignment.txt"
        File fastqscreen_result = "fastqscreen.final.result.txt"
        File benchmark = "benchmark.final.result.txt"
    }
}
task mergeSentieon {
    Array[File] aln_metrics_header
    Array[File] aln_metrics_data
    Array[File] is_metrics_header
    Array[File] is_metrics_data
    Array[File] quality_yield_header
    Array[File] quality_yield_data
    Array[File] wgs_metrics_header
    Array[File] wgs_metrics_data
    String docker
    String cluster_config
    String disk_size
    command <<<
        set -o pipefail
        set -e
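        # Merge per-sample Sentieon outputs: keep a single header line (the
        # header is identical across samples), stack all data rows under it,
        # then paste on a Sample column derived from each data file's basename:
        #   Sample <TAB> metric header ...
        #   sample1 <TAB> metric row ...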
        echo 'Sample' > sample_column
        cat ${sep=" " aln_metrics_header} | sed -n '1,1p' | cat - ${sep=" " aln_metrics_data} > aln_metrics
        ls ${sep=" " aln_metrics_data} | cut -d '.' -f1 | cat sample_column - | paste - aln_metrics > aln_metrics.txt
        cat ${sep=" " is_metrics_header} | sed -n '1,1p' | cat - ${sep=" " is_metrics_data} > is_metrics
        ls ${sep=" " is_metrics_data} | cut -d '.' -f1 | cat sample_column - | paste - is_metrics > is_metrics.txt
        cat ${sep=" " quality_yield_header} | sed -n '1,1p' | cat - ${sep=" " quality_yield_data} > quality_yield
        ls ${sep=" " quality_yield_data} | cut -d '.' -f1 | cat sample_column - | paste - quality_yield > quality_yield_data.txt
        cat ${sep=" " wgs_metrics_header} | sed -n '1,1p' | cat - ${sep=" " wgs_metrics_data} > wgs_metrics
        ls ${sep=" " wgs_metrics_data} | cut -d '.' -f1 | cat sample_column - | paste - wgs_metrics > wgs_metrics_data.txt
    >>>
    runtime {
        docker: docker
        cluster: cluster_config
        systemDisk: "cloud_ssd 40"
        dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
    }
    output {
        File aln_metrics_merge = "aln_metrics.txt"
        File is_metrics_merge = "is_metrics.txt"
        File quality_yield_merge = "quality_yield_data.txt"
        File wgs_metrics_merge = "wgs_metrics_data.txt"
    }
}
task merge_family {
    Array[File] splited_vcf
    String project
    String docker
task merge_sentieon_metrics {
    Array[File] quality_yield_header
    Array[File] wgs_metrics_algo_header
    Array[File] aln_metrics_header
    Array[File] is_metrics_header
    Array[File] quality_yield_data
    Array[File] wgs_metrics_algo_data
    Array[File] aln_metrics_data
    Array[File] is_metrics_data
    String project
    String docker
    String cluster_config
    String disk_size
    command <<<
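        # Same header + data + Sample-column merge pattern as mergeSentieon,
        # but the outputs are prefixed with the project name.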
        set -o pipefail
        set -e
        echo 'Sample' > sample_column
        cat ${sep=" " quality_yield_header} | sed -n '1,1p' | cat - ${sep=" " quality_yield_data} > quality_yield_all
        ls ${sep=" " quality_yield_data} | cut -d '.' -f1 | cat sample_column - | paste - quality_yield_all > ${project}.quality_yield.txt
        cat ${sep=" " wgs_metrics_algo_header} | sed -n '1,1p' | cat - ${sep=" " wgs_metrics_algo_data} > wgs_metrics_all
        ls ${sep=" " wgs_metrics_algo_data} | cut -d '.' -f1 | cat sample_column - | paste - wgs_metrics_all > ${project}.wgs_metrics_data.txt
        cat ${sep=" " aln_metrics_header} | sed -n '1,1p' | cat - ${sep=" " aln_metrics_data} > aln_metrics_all
        ls ${sep=" " aln_metrics_data} | cut -d '.' -f1 | cat sample_column - | paste - aln_metrics_all > ${project}.aln_metrics.txt
        cat ${sep=" " is_metrics_header} | sed -n '1,1p' | cat - ${sep=" " is_metrics_data} > is_metrics_all
        ls ${sep=" " is_metrics_data} | cut -d '.' -f1 | cat sample_column - | paste - is_metrics_all > ${project}.is_metrics.txt
    >>>
    runtime {
        docker: docker
        cluster: cluster_config
        systemDisk: "cloud_ssd 40"
        dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
    }
    output {
        File quality_yield_summary = "${project}.quality_yield.txt"
        File wgs_metrics_summary = "${project}.wgs_metrics_data.txt"
        File aln_metrics_summary = "${project}.aln_metrics.txt"
        File is_metrics_summary = "${project}.is_metrics.txt"
    }
}
task sentieon {
    File quality_yield
    File wgs_metrics_algo
    File aln_metrics
    File is_metrics
    String sample
    String docker
    String cluster_config
    String disk_size
    command <<<
        set -o pipefail
        set -e
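        # The sed calls below assume Picard-style metrics files: line 2 holds
        # the column header and line 3 the (single) data row. aln_metrics takes
        # line 5 instead, since the alignment summary reports FIRST_OF_PAIR /
        # SECOND_OF_PAIR / PAIR category rows and line 5 is typically the
        # combined PAIR row for paired-end data.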
        cat ${quality_yield} | sed -n '2,2p' > quality_yield.header
        cat ${quality_yield} | sed -n '3,3p' > ${sample}.quality_yield
        cat ${wgs_metrics_algo} | sed -n '2,2p' > wgs_metrics_algo.header
        cat ${wgs_metrics_algo} | sed -n '3,3p' > ${sample}.wgs_metrics_algo
        cat ${aln_metrics} | sed -n '2,2p' > aln_metrics.header
        cat ${aln_metrics} | sed -n '5,5p' > ${sample}.aln_metrics
        cat ${is_metrics} | sed -n '2,2p' > is_metrics.header
        cat ${is_metrics} | sed -n '3,3p' > ${sample}.is_metrics
    >>>
    runtime {
        docker: docker
        cluster: cluster_config
        systemDisk: "cloud_ssd 40"
        dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
    }
    output {
        File quality_yield_header = "quality_yield.header"
        File quality_yield_data = "${sample}.quality_yield"
        File wgs_metrics_algo_header = "wgs_metrics_algo.header"
        File wgs_metrics_algo_data = "${sample}.wgs_metrics_algo"
        File aln_metrics_header = "aln_metrics.header"
        File aln_metrics_data = "${sample}.aln_metrics"
        File is_metrics_header = "is_metrics.header"
        File is_metrics_data = "${sample}.is_metrics"
    }
}
import "./tasks/mapping.wdl" as mapping | import "./tasks/mapping.wdl" as mapping | ||||
import "./tasks/Dedup.wdl" as Dedup | import "./tasks/Dedup.wdl" as Dedup | ||||
import "./tasks/deduped_Metrics.wdl" as deduped_Metrics | import "./tasks/deduped_Metrics.wdl" as deduped_Metrics | ||||
import "./tasks/sentieon.wdl" as sentieon | |||||
import "./tasks/Realigner.wdl" as Realigner | import "./tasks/Realigner.wdl" as Realigner | ||||
import "./tasks/BQSR.wdl" as BQSR | import "./tasks/BQSR.wdl" as BQSR | ||||
import "./tasks/Haplotyper_gVCF.wdl" as Haplotyper_gVCF | import "./tasks/Haplotyper_gVCF.wdl" as Haplotyper_gVCF | ||||
import "./tasks/split_gvcf_files.wdl" as split_gvcf_files | import "./tasks/split_gvcf_files.wdl" as split_gvcf_files | ||||
import "./tasks/benchmark.wdl" as benchmark | import "./tasks/benchmark.wdl" as benchmark | ||||
import "./tasks/multiqc.wdl" as multiqc | import "./tasks/multiqc.wdl" as multiqc | ||||
import "./tasks/merge_sentieon_metrics.wdl" as merge_sentieon_metrics | |||||
import "./tasks/mendelian.wdl" as mendelian | import "./tasks/mendelian.wdl" as mendelian | ||||
import "./tasks/merge_mendelian.wdl" as merge_mendelian | import "./tasks/merge_mendelian.wdl" as merge_mendelian | ||||
import "./tasks/quartet_mendelian.wdl" as quartet_mendelian | import "./tasks/quartet_mendelian.wdl" as quartet_mendelian | ||||
            disk_size=disk_size,
            cluster_config=BIGcluster_config
    }
    call sentieon.sentieon as sentieon {
        input:
            quality_yield=deduped_Metrics.deduped_QualityYield,
            wgs_metrics_algo=deduped_Metrics.deduped_wgsmetrics,
            aln_metrics=deduped_Metrics.dedeuped_aln_metrics,
            is_metrics=deduped_Metrics.deduped_is_metrics,
            sample=quartet[2],
            docker=SENTIEONdocker,
            cluster_config=SMALLcluster_config,
            disk_size=disk_size
    }
    call Realigner.Realigner as Realigner {
        input:
            SENTIEON_INSTALL_DIR=SENTIEON_INSTALL_DIR,
            disk_size=disk_size
    }
    call merge_sentieon_metrics.merge_sentieon_metrics as merge_sentieon_metrics {
        input:
            quality_yield_header=sentieon.quality_yield_header,
            wgs_metrics_algo_header=sentieon.wgs_metrics_algo_header,
            aln_metrics_header=sentieon.aln_metrics_header,
            is_metrics_header=sentieon.is_metrics_header,
            quality_yield_data=sentieon.quality_yield_data,
            wgs_metrics_algo_data=sentieon.wgs_metrics_algo_data,
            aln_metrics_data=sentieon.aln_metrics_data,
            is_metrics_data=sentieon.is_metrics_data,
            project=project,
            docker=MULTIQCdocker,
            cluster_config=SMALLcluster_config,
            disk_size=disk_size
    }
    call extract_tables.extract_tables as extract_tables {
        input:
            quality_yield_summary=merge_sentieon_metrics.quality_yield_summary,
            wgs_metrics_summary=merge_sentieon_metrics.wgs_metrics_summary,
            aln_metrics_summary=merge_sentieon_metrics.aln_metrics_summary,
            is_metrics_summary=merge_sentieon_metrics.is_metrics_summary,
            fastqc=multiqc.fastqc,
            fastqscreen=multiqc.fastqscreen,
            hap=multiqc.hap,
            docker=DIYdocker,
            cluster_config=SMALLcluster_config,
            disk_size=disk_size
    }
}
    Boolean sister_tag = read_boolean(split_gvcf_files.sister_tag)
    Boolean quartet_tag = read_boolean(split_gvcf_files.quartet_tag)