@@ -0,0 +1,113 @@
import json
import argparse
from functools import reduce

import pandas as pd
parser = argparse.ArgumentParser(description="Collect MultiQC and Sentieon outputs and write pre-alignment (raw fastq), post-alignment (BAM) and variant-calling (precision/recall) quality-metric tables")
parser.add_argument('-quality', '--quality_yield', type=str, help='*.quality_yield.txt', required=True)
parser.add_argument('-depth', '--wgs_metrics', type=str, help='*deduped_WgsMetricsAlgo.txt', required=True)
parser.add_argument('-aln', '--aln_metrics', type=str, help='*_deduped_aln_metrics.txt', required=True)
parser.add_argument('-is', '--is_metrics', type=str, help='*_deduped_is_metrics.txt', required=True)
parser.add_argument('-fastqc', '--fastqc', type=str, help='multiqc_fastqc.txt', required=True)
parser.add_argument('-fastqscreen', '--fastqscreen', type=str, help='multiqc_fastq_screen.txt', required=True)
parser.add_argument('-hap', '--happy', type=str, help='multiqc_happy_data.json', required=True)
args = parser.parse_args()
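# Example invocation (illustrative; the metric file names follow the patterns
# in the help strings above and the merge task outputs below):
#   python extract_multiqc.py \
#       -quality project.quality_yield.txt -depth project.wgs_metrics_data.txt \
#       -aln project.aln_metrics.txt -is project.is_metrics.txt \
#       -fastqc multiqc_fastqc.txt -fastqscreen multiqc_fastq_screen.txt \
#       -hap multiqc_happy_data.json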
# Unpack the parsed arguments:
quality_yield_file = args.quality_yield
wgs_metrics_file = args.wgs_metrics
aln_metrics_file = args.aln_metrics
is_metrics_file = args.is_metrics
fastqc_file = args.fastqc
fastqscreen_file = args.fastqscreen
hap_file = args.happy
#############################################
# fastqc: MultiQC general-stats table for FastQC
fastqc = pd.read_table(fastqc_file)
# fastqscreen: keep the per-genome percentage columns
dat = pd.read_table(fastqscreen_file)
fastqscreen = dat.loc[:, dat.columns.str.endswith('percentage')].copy()
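# fastq_screen sample names carry a '_screen' suffix; strip it so they match
# the FastQC sample names before the merge below.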
dat['Sample'] = [i.replace('_screen', '') for i in dat['Sample']]
fastqscreen.insert(loc=0, column='Sample', value=dat['Sample'])
# pre-alignment table: merge FastQC and fastq_screen stats on Sample
pre_alignment_dat = pd.merge(fastqc, fastqscreen, how="outer", on=['Sample'])
pre_alignment_dat['FastQC_mqc-generalstats-fastqc-total_sequences'] = pre_alignment_dat['FastQC_mqc-generalstats-fastqc-total_sequences'] / 1000000
del pre_alignment_dat['FastQC_mqc-generalstats-fastqc-percent_fails']
del pre_alignment_dat['FastQC_mqc-generalstats-fastqc-avg_sequence_length']
del pre_alignment_dat['ERCC percentage']
del pre_alignment_dat['Phix percentage']
del pre_alignment_dat['Mouse percentage']
pre_alignment_dat = pre_alignment_dat.round(2)
pre_alignment_dat.columns = ['Sample', '%Dup', '%GC', 'Total Sequences (million)', '%Human', '%EColi', '%Adapter', '%Vector', '%rRNA', '%Virus', '%Yeast', '%Mitoch', '%No hits']
pre_alignment_dat.to_csv('pre_alignment.txt', sep="\t", index=False)
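# If the fastq_screen config omits one of these genomes (e.g. no ERCC), the
# del statements above raise KeyError; a tolerant variant (an assumption, not
# in the original script) would be:
#   pre_alignment_dat = pre_alignment_dat.drop(
#       columns=['ERCC percentage', 'Phix percentage', 'Mouse percentage'],
#       errors='ignore')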
############################
# post-alignment metrics from the Sentieon summaries
dat = pd.read_table(aln_metrics_file)
dat['PCT_ALIGNED_READS'] = dat["PF_READS_ALIGNED"] / dat["TOTAL_READS"]
aln_metrics = dat[["Sample", "PCT_ALIGNED_READS", "PF_MISMATCH_RATE"]].copy()
# scale only the numeric columns (multiplying the whole frame would also
# repeat the Sample strings 100 times)
aln_metrics[["PCT_ALIGNED_READS", "PF_MISMATCH_RATE"]] = aln_metrics[["PCT_ALIGNED_READS", "PF_MISMATCH_RATE"]] * 100
dat = pd.read_table(is_metrics_file)
is_metrics = dat[['Sample', 'MEDIAN_INSERT_SIZE']]
dat = pd.read_table(quality_yield_file)
dat['%Q20'] = dat['Q20_BASES'] / dat['TOTAL_BASES']
dat['%Q30'] = dat['Q30_BASES'] / dat['TOTAL_BASES']
quality_yield = dat[['Sample', '%Q20', '%Q30']].copy()
quality_yield[['%Q20', '%Q30']] = quality_yield[['%Q20', '%Q30']] * 100
dat = pd.read_table(wgs_metrics_file)
wgs_metrics = dat[['Sample', 'MEDIAN_COVERAGE', 'PCT_1X', 'PCT_5X', 'PCT_10X', 'PCT_30X']].copy()
wgs_metrics[['PCT_1X', 'PCT_5X', 'PCT_10X', 'PCT_30X']] = wgs_metrics[['PCT_1X', 'PCT_5X', 'PCT_10X', 'PCT_30X']] * 100
data_frames = [aln_metrics, is_metrics, quality_yield, wgs_metrics]
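# successively outer-join the four metric tables on Sample (functools.reduce)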
post_alignment_dat = reduce(lambda left, right: pd.merge(left, right, on=['Sample'], how='outer'), data_frames)
# benchmark: precision/recall from the hap.py MultiQC JSON
with open(hap_file) as hap_json:
    happy = json.load(hap_json)
dat = pd.DataFrame.from_records(happy)
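# keep the hap.py entries whose names end in 'ALL' (all calls, as opposed to
# the PASS-only rows) and transpose so each entry becomes a row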
dat = dat.loc[:, dat.columns.str.endswith('ALL')]
dat_transposed = dat.T
benchmark = dat_transposed.loc[:, ['sample_id', 'METRIC.Precision', 'METRIC.Recall']]
benchmark.columns = ['Sample', 'Precision', 'Recall']
# output (pre_alignment.txt was written above; post_alignment.txt named to
# parallel it; the original also wrote fastqc_all and qualimap_stat, which
# were never defined)
post_alignment_dat.to_csv('post_alignment.txt', sep="\t", index=False)
fastqscreen.to_csv('fastqscreen.final.result.txt', sep="\t", index=False)
benchmark.to_csv('benchmark.final.result.txt', sep="\t", index=False)
@@ -1,18 +1,22 @@
task extract_tables {
    File quality_yield_summary
    File wgs_metrics_summary
    File aln_metrics_summary
    File is_metrics_summary
    File fastqc
    File fastqscreen
    File hap
    String docker
    String cluster_config
    String disk_size
    command <<<
        python /opt/extract_multiqc.py \
            -quality ${quality_yield_summary} \
            -depth ${wgs_metrics_summary} \
            -aln ${aln_metrics_summary} \
            -is ${is_metrics_summary} \
            -fastqc ${fastqc} \
            -fastqscreen ${fastqscreen} \
            -hap ${hap}
    >>>
    runtime {
@@ -1,43 +0,0 @@
task mergeSentieon {
    Array[File] aln_metrics_header
    Array[File] aln_metrics_data
    Array[File] is_metrics_header
    Array[File] is_metrics_data
    Array[File] quality_yield_header
    Array[File] quality_yield_data
    Array[File] wgs_metrics_header
    Array[File] wgs_metrics_data
    String docker
    String cluster_config
    String disk_size
    command <<<
        set -o pipefail
        set -e
        echo '''Sample''' > sample_column
        cat ${sep=" " aln_metrics_header} | sed -n '1,1p' | cat - ${sep=" " aln_metrics_data} > aln_metrics
        ls ${sep=" " aln_metrics_data} | cut -d '.' -f1 | cat sample_column - | paste - aln_metrics > aln_metrics.txt
        cat ${sep=" " is_metrics_header} | sed -n '1,1p' | cat - ${sep=" " is_metrics_data} > is_metrics.txt
        cat ${sep=" " quality_yield_header} | sed -n '1,1p' | cat - ${sep=" " quality_yield_data} > quality_yield_data.txt
        cat ${sep=" " wgs_metrics_header} | sed -n '1,1p' | cat - ${sep=" " wgs_metrics_data} > wgs_metrics_data.txt
    >>>
    runtime {
        docker: docker
        cluster: cluster_config
        systemDisk: "cloud_ssd 40"
        dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
    }
    output {
        File aln_metrics_merge = "aln_metrics.txt"
        File is_metrics_merge = "is_metrics.txt"
        File quality_yield_merge = "quality_yield_data.txt"
        File wgs_metrics_merge = "wgs_metrics_data.txt"
    }
}
@@ -1,4 +1,4 @@
task merge_family {
    Array[File] splited_vcf
    String project
    String docker
@@ -0,0 +1,48 @@
task merge_sentieon_metrics {
    Array[File] quality_yield_header
    Array[File] wgs_metrics_algo_header
    Array[File] aln_metrics_header
    Array[File] is_metrics_header
    Array[File] quality_yield_data
    Array[File] wgs_metrics_algo_data
    Array[File] aln_metrics_data
    Array[File] is_metrics_data
    String project
    String docker
    String cluster_config
    String disk_size
    command <<<
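        # For each metric type: take one header line from the per-sample header
        # files, stack all per-sample data rows beneath it, then paste on a
        # leading Sample column derived from the data file names (basename keeps
        # only the file name, since Cromwell interpolates absolute paths whose
        # directories would otherwise survive the cut at the first '.').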
        echo '''Sample''' > sample_column
        cat ${sep=" " quality_yield_header} | sed -n '1,1p' | cat - ${sep=" " quality_yield_data} > quality_yield_all
        ls ${sep=" " quality_yield_data} | xargs -n1 basename | cut -d '.' -f1 | cat sample_column - | paste - quality_yield_all > ${project}.quality_yield.txt
        cat ${sep=" " wgs_metrics_algo_header} | sed -n '1,1p' | cat - ${sep=" " wgs_metrics_algo_data} > wgs_metrics_all
        ls ${sep=" " wgs_metrics_algo_data} | xargs -n1 basename | cut -d '.' -f1 | cat sample_column - | paste - wgs_metrics_all > ${project}.wgs_metrics_data.txt
        cat ${sep=" " aln_metrics_header} | sed -n '1,1p' | cat - ${sep=" " aln_metrics_data} > aln_metrics_all
        ls ${sep=" " aln_metrics_data} | xargs -n1 basename | cut -d '.' -f1 | cat sample_column - | paste - aln_metrics_all > ${project}.aln_metrics.txt
        cat ${sep=" " is_metrics_header} | sed -n '1,1p' | cat - ${sep=" " is_metrics_data} > is_metrics_all
        ls ${sep=" " is_metrics_data} | xargs -n1 basename | cut -d '.' -f1 | cat sample_column - | paste - is_metrics_all > ${project}.is_metrics.txt
    >>>
    runtime {
        docker: docker
        cluster: cluster_config
        systemDisk: "cloud_ssd 40"
        dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
    }
    output {
        File quality_yield_summary = "${project}.quality_yield.txt"
        File wgs_metrics_summary = "${project}.wgs_metrics_data.txt"
        File aln_metrics_summary = "${project}.aln_metrics.txt"
        File is_metrics_summary = "${project}.is_metrics.txt"
    }
}
@@ -1,8 +1,10 @@
task sentieon {
    File quality_yield
    File wgs_metrics_algo
    File aln_metrics
    File is_metrics
    String sample
    String docker
    String cluster_config
    String disk_size
@@ -10,14 +12,19 @@ task sentieon {
    command <<<
        set -o pipefail
        set -e
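        # Sentieon writes Picard-style metrics files: line 2 carries the column
        # header and line 3 the summary row; for aln_metrics, line 5 is
        # presumably the PAIR row (after the FIRST_OF_PAIR and SECOND_OF_PAIR rows).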
        cat ${quality_yield} | sed -n '2,2p' > quality_yield.header
        cat ${quality_yield} | sed -n '3,3p' > ${sample}.quality_yield
        cat ${wgs_metrics_algo} | sed -n '2,2p' > wgs_metrics_algo.header
        cat ${wgs_metrics_algo} | sed -n '3,3p' > ${sample}.wgs_metrics_algo
        cat ${aln_metrics} | sed -n '2,2p' > aln_metrics.header
        cat ${aln_metrics} | sed -n '5,5p' > ${sample}.aln_metrics
        cat ${is_metrics} | sed -n '2,2p' > is_metrics.header
        cat ${is_metrics} | sed -n '3,3p' > ${sample}.is_metrics
    >>>
    runtime {
@@ -28,13 +35,13 @@ task sentieon {
    }
    output {
        File quality_yield_header = "quality_yield.header"
        File quality_yield_data = "${sample}.quality_yield"
        File wgs_metrics_algo_header = "wgs_metrics_algo.header"
        File wgs_metrics_algo_data = "${sample}.wgs_metrics_algo"
        File aln_metrics_header = "aln_metrics.header"
        File aln_metrics_data = "${sample}.aln_metrics"
        File is_metrics_header = "is_metrics.header"
        File is_metrics_data = "${sample}.is_metrics"
    }
}
@@ -1,6 +1,7 @@
import "./tasks/mapping.wdl" as mapping
import "./tasks/Dedup.wdl" as Dedup
import "./tasks/deduped_Metrics.wdl" as deduped_Metrics
import "./tasks/sentieon.wdl" as sentieon
import "./tasks/Realigner.wdl" as Realigner
import "./tasks/BQSR.wdl" as BQSR
import "./tasks/Haplotyper_gVCF.wdl" as Haplotyper_gVCF
@@ -8,6 +9,7 @@ import "./tasks/GVCFtyper.wdl" as GVCFtyper
import "./tasks/split_gvcf_files.wdl" as split_gvcf_files
import "./tasks/benchmark.wdl" as benchmark
import "./tasks/multiqc.wdl" as multiqc
import "./tasks/merge_sentieon_metrics.wdl" as merge_sentieon_metrics
import "./tasks/mendelian.wdl" as mendelian
import "./tasks/merge_mendelian.wdl" as merge_mendelian
import "./tasks/quartet_mendelian.wdl" as quartet_mendelian
@@ -112,6 +114,19 @@ workflow {{ project_name }} {
            disk_size=disk_size,
            cluster_config=BIGcluster_config
    }
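    # Metrics aggregation chain: sentieon splits each sample's metrics into
    # header/data rows, merge_sentieon_metrics stacks them project-wide, and
    # extract_tables turns the merged files into the final QC tables.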
    call sentieon.sentieon as sentieon {
        input:
            quality_yield=deduped_Metrics.deduped_QualityYield,
            wgs_metrics_algo=deduped_Metrics.deduped_wgsmetrics,
            aln_metrics=deduped_Metrics.dedeuped_aln_metrics,
            is_metrics=deduped_Metrics.deduped_is_metrics,
            sample=quartet[2],
            docker=SENTIEONdocker,
            cluster_config=SMALLcluster_config,
            disk_size=disk_size
    }
    call Realigner.Realigner as Realigner {
        input:
            SENTIEON_INSTALL_DIR=SENTIEON_INSTALL_DIR,
@@ -209,18 +224,35 @@ workflow {{ project_name }} {
            disk_size=disk_size
    }
    call merge_sentieon_metrics.merge_sentieon_metrics as merge_sentieon_metrics {
        input:
            quality_yield_header=sentieon.quality_yield_header,
            wgs_metrics_algo_header=sentieon.wgs_metrics_algo_header,
            aln_metrics_header=sentieon.aln_metrics_header,
            is_metrics_header=sentieon.is_metrics_header,
            quality_yield_data=sentieon.quality_yield_data,
            wgs_metrics_algo_data=sentieon.wgs_metrics_algo_data,
            aln_metrics_data=sentieon.aln_metrics_data,
            is_metrics_data=sentieon.is_metrics_data,
            project=project,
            docker=MULTIQCdocker,
            cluster_config=SMALLcluster_config,
            disk_size=disk_size
    }
    call extract_tables.extract_tables as extract_tables {
        input:
            quality_yield_summary=merge_sentieon_metrics.quality_yield_summary,
            wgs_metrics_summary=merge_sentieon_metrics.wgs_metrics_summary,
            aln_metrics_summary=merge_sentieon_metrics.aln_metrics_summary,
            is_metrics_summary=merge_sentieon_metrics.is_metrics_summary,
            fastqc=multiqc.fastqc,
            fastqscreen=multiqc.fastqscreen,
            hap=multiqc.hap,
            docker=MULTIQCdocker,
            cluster_config=SMALLcluster_config,
            disk_size=disk_size
    }
}
Boolean sister_tag = read_boolean(split_gvcf_files.sister_tag)
Boolean quartet_tag = read_boolean(split_gvcf_files.quartet_tag)