import json
import pandas as pd
import sys, argparse, os

# Split MultiQC's aggregated outputs into one tidy, per-tool result table:
#   fastqc.final.result.txt      - FastQC summary stats + per-module statuses
#   fastqscreen.final.result.txt - FastQ Screen contamination percentages
#   qualimap.final.result.txt    - Qualimap alignment stats
#   benchmark.final.result.txt   - hap.py precision/recall ("ALL" rows)
parser = argparse.ArgumentParser(description="This script is to get information from multiqc")
parser.add_argument('-fastqc_qualimap', '--fastqc_qualimap', type=str, help='multiqc_general_stats.txt', required=True)
parser.add_argument('-fastqc', '--fastqc', type=str, help='multiqc_fastqc.txt', required=True)
parser.add_argument('-fastqscreen', '--fastqscreen', type=str, help='multiqc_fastq_screen.txt', required=True)
parser.add_argument('-hap', '--happy', type=str, help='multiqc_happy_data.json', required=True)
args = parser.parse_args()

# Rename input:
fastqc_qualimap_file = args.fastqc_qualimap
fastqc_file = args.fastqc
fastqscreen_file = args.fastqscreen
hap_file = args.happy

# --- FastQC / Qualimap columns of multiqc_general_stats.txt ---
# .copy() gives insert() an independent frame instead of a view of `dat`,
# avoiding pandas SettingWithCopyWarning / copy-on-write surprises.
dat = pd.read_table(fastqc_qualimap_file)
fastqc = dat.loc[:, dat.columns.str.startswith('FastQC')].copy()
fastqc.insert(loc=0, column='Sample', value=dat['Sample'])
fastqc_stat = fastqc.dropna()

# --- Qualimap ---
qualimap = dat.loc[:, dat.columns.str.startswith('QualiMap')].copy()
qualimap.insert(loc=0, column='Sample', value=dat['Sample'])
qualimap_stat = qualimap.dropna()

# --- FastQC per-module status columns of multiqc_fastqc.txt ---
dat = pd.read_table(fastqc_file)
fastqc_module = dat.loc[:, "per_base_sequence_quality":"kmer_content"].copy()
fastqc_module.insert(loc=0, column='Sample', value=dat['Sample'])
fastqc_all = pd.merge(fastqc_stat, fastqc_module, how='outer', on='Sample')

# --- FastQ Screen percentage columns ---
dat = pd.read_table(fastqscreen_file)
fastqscreen = dat.loc[:, dat.columns.str.endswith('percentage')].copy()
# MultiQC suffixes FastQ Screen sample names with "_screen"; strip it so the
# names line up with the samples in the other tables.
dat['Sample'] = [i.replace('_screen', '') for i in dat['Sample']]
fastqscreen.insert(loc=0, column='Sample', value=dat['Sample'])

# --- hap.py benchmark metrics (keep only the "...ALL" entries) ---
with open(hap_file) as hap_json:
    happy = json.load(hap_json)
dat = pd.DataFrame.from_records(happy)
dat = dat.loc[:, dat.columns.str.endswith('ALL')]
dat_transposed = dat.T
benchmark = dat_transposed.loc[:, ['sample_id', 'METRIC.Precision', 'METRIC.Recall']]
benchmark.columns = ['Sample', 'Precision', 'Recall']

# --- output (index=False: Sample is already an explicit column) ---
fastqc_all.to_csv('fastqc.final.result.txt', sep="\t", index=False)
fastqscreen.to_csv('fastqscreen.final.result.txt', sep="\t", index=False)
qualimap_stat.to_csv('qualimap.final.result.txt', sep="\t", index=False)
benchmark.to_csv('benchmark.final.result.txt', sep="\t", index=False)
task Haplotyper_gVCF { | task Haplotyper_gVCF { | ||||
File ref_dir | File ref_dir | ||||
String SENTIEON_INSTALL_DIR | String SENTIEON_INSTALL_DIR | ||||
String fasta | String fasta |
# Split the aggregated MultiQC tables into one per-tool final result file by
# running the bundled extract_multiqc.py helper inside the docker image.
task extract_multiqc {
    File fastqc_qualimap    # multiqc_general_stats.txt (FastQC + Qualimap columns)
    File fastqc             # multiqc_fastqc.txt
    File fastqscreen        # multiqc_fastq_screen.txt
    File hap                # multiqc_happy_data.json
    String docker           # image that provides /opt/extract_multiqc.py
    String cluster_config
    String disk_size        # data-disk size for the cloud runtime

    command <<<
        python /opt/extract_multiqc.py -fastqc_qualimap ${fastqc_qualimap} -fastqc ${fastqc} -fastqscreen ${fastqscreen} -hap ${hap}
    >>>

    runtime {
        docker:docker
        cluster:cluster_config
        systemDisk:"cloud_ssd 40"
        dataDisk:"cloud_ssd " + disk_size + " /cromwell_root/"
    }

    # File names match what extract_multiqc.py writes into the working dir.
    output {
        File fastqc_result = "fastqc.final.result.txt"
        File fastqscreen_result = "fastqscreen.final.result.txt"
        File qualimap_result = "qualimap.final.result.txt"
        File hap_result = "benchmark.final.result.txt"
    }
}
# Check Mendelian consistency of the quartet calls with VBT-TrioAnalysis:
# one trio run per child (D5 = LCL5, D6 = LCL6), each against father LCL7
# and mother LCL8. This resolves the leftover ${project} -> ${family_name}
# rename residue: only the ${family_name}-based lines are kept, since the
# name is derived from the input VCF itself via basename().
task mendelian {
    File family_vcf     # merged family VCF containing all sample columns
    File ref_dir        # directory holding the reference fasta
    String family_name = basename(family_vcf, ".family.vcf")
    String fasta        # fasta file name inside ref_dir
    String docker
    String cluster_config
    String disk_size

    command <<<
        export LD_LIBRARY_PATH=/opt/htslib-1.9
        nt=$(nproc)

        # D5 trio: child LCL5, father LCL7, mother LCL8 (PED: fam id sample father mother sex pheno)
        echo -e "${family_name}\tLCL8\t0\t0\t2\t-9\n${family_name}\tLCL7\t0\t0\t1\t-9\n${family_name}\tLCL5\tLCL7\tLCL8\t2\t-9" > ${family_name}.D5.ped
        mkdir VBT_D5
        /opt/VBT-TrioAnalysis/vbt mendelian -ref ${ref_dir}/${fasta} -mother ${family_vcf} -father ${family_vcf} -child ${family_vcf} -pedigree ${family_name}.D5.ped -outDir VBT_D5 -out-prefix ${family_name}.D5 --output-violation-regions -thread-count $nt
        cat VBT_D5/${family_name}.D5_trio.vcf > ${family_name}.D5.vcf

        # D6 trio: child LCL6, same parents
        echo -e "${family_name}\tLCL8\t0\t0\t2\t-9\n${family_name}\tLCL7\t0\t0\t1\t-9\n${family_name}\tLCL6\tLCL7\tLCL8\t2\t-9" > ${family_name}.D6.ped
        mkdir VBT_D6
        /opt/VBT-TrioAnalysis/vbt mendelian -ref ${ref_dir}/${fasta} -mother ${family_vcf} -father ${family_vcf} -child ${family_vcf} -pedigree ${family_name}.D6.ped -outDir VBT_D6 -out-prefix ${family_name}.D6 --output-violation-regions -thread-count $nt
        cat VBT_D6/${family_name}.D6_trio.vcf > ${family_name}.D6.vcf
    >>>

    runtime {
        docker:docker
        cluster:cluster_config
        systemDisk:"cloud_ssd 40"
        dataDisk:"cloud_ssd " + disk_size + " /cromwell_root/"
    }

    output {
        File D5_ped = "${family_name}.D5.ped"
        File D6_ped = "${family_name}.D6.ped"
        Array[File] D5_mendelian = glob("VBT_D5/*")
        Array[File] D6_mendelian = glob("VBT_D6/*")
        File D5_trio_vcf = "${family_name}.D5.vcf"
        File D6_trio_vcf = "${family_name}.D6.vcf"
    }
}
# NOTE(review): this is a diff-hunk excerpt of a task whose header line and
# some unchanged context (notably the interior of the awk program — its
# braces are unbalanced here) are not visible. It also still contains BOTH
# sides of a ${project} -> ${family_name} rename; resolve the duplicated
# lines in favour of the ${family_name} variants when merging.
File D5_trio_vcf
File D6_trio_vcf
File family_vcf
String project
String family_name = basename(family_vcf,".family.vcf")
String docker
String cluster_config
String disk_size
command <<<
cat ${D5_trio_vcf} | grep -v '##' > ${project}.D5.txt
cat ${D6_trio_vcf} | grep -v '##' > ${project}.D6.txt
cat ${D5_trio_vcf} | grep -v '##' > ${family_name}.D5.txt
cat ${D6_trio_vcf} | grep -v '##' > ${family_name}.D6.txt
cat ${family_vcf} | grep -v '##' | awk '
BEGIN { OFS = "\t" }
NF > 2 && FNR > 1 {
}
}
{ print }
' > ${project}.consensus.txt
python /opt/merge_two_family_with_genotype.py -LCL5 ${project}.D5.txt -LCL6 ${project}.D6.txt -genotype ${project}.consensus.txt -family ${project}.mendelian
' > ${family_name}.consensus.txt
python /opt/merge_two_family_with_genotype.py -LCL5 ${family_name}.D5.txt -LCL6 ${family_name}.D6.txt -genotype ${family_name}.consensus.txt -family ${family_name}
>>>
runtime {
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
}
output {
File project_mendelian = "${project}.mendelian.txt"
File project_mendelian_summary = "${project}.mendelian.summary.txt"
File project_mendelian = "${family_name}.mendelian.txt"
File project_mendelian_summary = "${family_name}.mendelian.summary.txt"
}
}
done
multiqc /cromwell_root/tmp/
# NOTE(review): these copies previously read `cp src > dst`, which passed cp
# a single operand (a usage error) while the shell redirection truncated the
# destination to an empty file; `cp src dst` is the correct form.
cp multiqc_data/multiqc_general_stats.txt multiqc_general_stats.txt
cp multiqc_data/multiqc_fastqc.txt multiqc_fastqc.txt
cp multiqc_data/multiqc_fastq_screen.txt multiqc_fastq_screen.txt
cp multiqc_data/multiqc_happy_data.json multiqc_happy_data.json
>>>
output {
    File multiqc_html = "multiqc_report.html"
    Array[File] multiqc_txt = glob("multiqc_data/*")
    # Individual tables surfaced so extract_multiqc can consume them directly.
    File fastqc_qualimap = "multiqc_general_stats.txt"
    File fastqc = "multiqc_fastqc.txt"
    File fastqscreen = "multiqc_fastq_screen.txt"
    File hap = "multiqc_happy_data.json"
}
}
import "./tasks/fastqc.wdl" as fastqc | import "./tasks/fastqc.wdl" as fastqc | ||||
import "./tasks/fastqscreen.wdl" as fastqscreen | import "./tasks/fastqscreen.wdl" as fastqscreen | ||||
import "./tasks/qualimap.wdl" as qualimap | import "./tasks/qualimap.wdl" as qualimap | ||||
import "./tasks/extract_multiqc.wdl" as extract_multiqc | |||||
workflow {{ project_name }} { | workflow {{ project_name }} { | ||||
disk_size=disk_size | disk_size=disk_size | ||||
} | } | ||||
# Post-process the MultiQC aggregate outputs into per-tool result tables.
call extract_multiqc.extract_multiqc as extract_multiqc {
    input:
        fastqc_qualimap=multiqc.fastqc_qualimap,
        fastqc=multiqc.fastqc,
        fastqscreen=multiqc.fastqscreen,
        hap=multiqc.hap,
        docker=DIYdocker,
        cluster_config=SMALLcluster_config,
        disk_size=disk_size
}
Array[File] family_vcfs = split_gvcf_files.family_vcf | Array[File] family_vcfs = split_gvcf_files.family_vcf | ||||
scatter (idx in range(length(family_vcfs))) { | scatter (idx in range(length(family_vcfs))) { | ||||
family_vcf=family_vcfs[idx], | family_vcf=family_vcfs[idx], | ||||
ref_dir=ref_dir, | ref_dir=ref_dir, | ||||
fasta=fasta, | fasta=fasta, | ||||
project=project, | |||||
docker=MENDELIANdocker, | docker=MENDELIANdocker, | ||||
cluster_config=BIGcluster_config, | cluster_config=BIGcluster_config, | ||||
disk_size=disk_size | disk_size=disk_size | ||||
D5_trio_vcf=mendelian.D5_trio_vcf, | D5_trio_vcf=mendelian.D5_trio_vcf, | ||||
D6_trio_vcf=mendelian.D6_trio_vcf, | D6_trio_vcf=mendelian.D6_trio_vcf, | ||||
family_vcf=family_vcfs[idx], | family_vcf=family_vcfs[idx], | ||||
project=project, | |||||
docker=DIYdocker, | docker=DIYdocker, | ||||
cluster_config=SMALLcluster_config, | cluster_config=SMALLcluster_config, | ||||
disk_size=disk_size | disk_size=disk_size |