import json
import argparse
import pandas as pd

parser = argparse.ArgumentParser(description="Extract per-sample QC summary tables from MultiQC output files")
parser.add_argument('-fastqc_qualimap', '--fastqc_qualimap', type=str, help='multiqc_general_stats.txt', required=True)
parser.add_argument('-fastqc', '--fastqc', type=str, help='multiqc_fastqc.txt', required=True)
parser.add_argument('-fastqscreen', '--fastqscreen', type=str, help='multiqc_fastq_screen.txt', required=True)
parser.add_argument('-hap', '--happy', type=str, help='multiqc_happy_data.json', required=True)
args = parser.parse_args()
# Input file paths
fastqc_qualimap_file = args.fastqc_qualimap
fastqc_file = args.fastqc
fastqscreen_file = args.fastqscreen
hap_file = args.happy
# FastQC and Qualimap columns from the general stats table
dat = pd.read_table(fastqc_qualimap_file)
fastqc = dat.loc[:, dat.columns.str.startswith('FastQC')].copy()
fastqc.insert(loc=0, column='Sample', value=dat['Sample'])
fastqc_stat = fastqc.dropna()
# Qualimap
qualimap = dat.loc[:, dat.columns.str.startswith('QualiMap')].copy()
qualimap.insert(loc=0, column='Sample', value=dat['Sample'])
qualimap_stat = qualimap.dropna()
# FastQC per-module status columns
dat = pd.read_table(fastqc_file)
fastqc_module = dat.loc[:, "per_base_sequence_quality":"kmer_content"].copy()
fastqc_module.insert(loc=0, column='Sample', value=dat['Sample'])
fastqc_all = pd.merge(fastqc_stat, fastqc_module, how='outer', on='Sample')
# FastQ Screen percentage columns
dat = pd.read_table(fastqscreen_file)
fastqscreen = dat.loc[:, dat.columns.str.endswith('percentage')].copy()
dat['Sample'] = [i.replace('_screen', '') for i in dat['Sample']]
fastqscreen.insert(loc=0, column='Sample', value=dat['Sample'])
# Benchmarking results from hap.py (precision/recall over all variants)
with open(hap_file) as hap_json:
    happy = json.load(hap_json)
dat = pd.DataFrame.from_records(happy)
dat = dat.loc[:, dat.columns.str.endswith('ALL')]
dat_transposed = dat.T
benchmark = dat_transposed.loc[:, ['sample_id', 'METRIC.Precision', 'METRIC.Recall']]
benchmark.columns = ['Sample', 'Precision', 'Recall']
# Write the four summary tables
fastqc_all.to_csv('fastqc.final.result.txt', sep="\t", index=False)
fastqscreen.to_csv('fastqscreen.final.result.txt', sep="\t", index=False)
qualimap_stat.to_csv('qualimap.final.result.txt', sep="\t", index=False)
benchmark.to_csv('benchmark.final.result.txt', sep="\t", index=False)
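# Example invocation (illustrative sketch: the script file name is an assumption, and the
# inputs are the MultiQC data files named in the argument help above):
#   python extract_multiqc_stats.py \
#       -fastqc_qualimap multiqc_general_stats.txt \
#       -fastqc multiqc_fastqc.txt \
#       -fastqscreen multiqc_fastq_screen.txt \
#       -hap multiqc_happy_data.json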
{
"{{ project_name }}.benchmarking_dir": "oss://pgx-result/renluyao/manuscript_v3.0/reference_dataset_v202011/",
"{{ project_name }}.SENTIEON_INSTALL_DIR": "/opt/sentieon-genomics", | "{{ project_name }}.SENTIEON_INSTALL_DIR": "/opt/sentieon-genomics", | ||||
"{{ project_name }}.fasta": "GRCh38.d1.vd1.fa", | "{{ project_name }}.fasta": "GRCh38.d1.vd1.fa", | ||||
"{{ project_name }}.BENCHMARKdocker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-hap:latest", | "{{ project_name }}.BENCHMARKdocker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-hap:latest", | ||||
"{{ project_name }}.gvcf": {{ gvcf_list.split(";") | tojson }}, | "{{ project_name }}.gvcf": {{ gvcf_list.split(";") | tojson }}, | ||||
"{{ project_name }}.gvcf_idx": {{ gvcf_idx_list.split(";") | tojson }}, | "{{ project_name }}.gvcf_idx": {{ gvcf_idx_list.split(";") | tojson }}, | ||||
"{{ project_name }}.disk_size": "500", | "{{ project_name }}.disk_size": "500", | ||||
"{{ project_name }}.del_bed": "oss://pgx-result/renluyao/manuscript_v3.0/reference_dataset_v202011/Tier1.DEL", | |||||
"{{ project_name }}.project": "{{ project }}", | "{{ project_name }}.project": "{{ project }}", | ||||
"{{ project_name }}.SMALLcluster_config": "OnDemand bcs.ps.g.xlarge img-ubuntu-vpc", | "{{ project_name }}.SMALLcluster_config": "OnDemand bcs.ps.g.xlarge img-ubuntu-vpc", | ||||
"{{ project_name }}.BIGcluster_config": "OnDemand bcs.a2.7xlarge img-ubuntu-vpc", | "{{ project_name }}.BIGcluster_config": "OnDemand bcs.a2.7xlarge img-ubuntu-vpc", |
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg index -f vcf ${sample}.rtg.vcf.gz
# Benchmark each Quartet sample against its v20201103 reference call set within the callable regions
if [[ ${sample} =~ "LCL5" ]]; then
    /opt/hap.py/bin/hap.py ${benchmarking_dir}/LCL5.ref.v20201103.vcf.gz ${sample}.rtg.vcf.gz -f ${benchmarking_dir}/Quartet.callable.voted.collapse.bed --threads $nt -o ${sample} -r ${ref_dir}/${fasta}
elif [[ ${sample} =~ "LCL6" ]]; then
    /opt/hap.py/bin/hap.py ${benchmarking_dir}/LCL6.ref.v20201103.vcf.gz ${sample}.rtg.vcf.gz -f ${benchmarking_dir}/Quartet.callable.voted.collapse.bed --threads $nt -o ${sample} -r ${ref_dir}/${fasta}
elif [[ ${sample} =~ "LCL7" ]]; then
    /opt/hap.py/bin/hap.py ${benchmarking_dir}/LCL7.ref.v20201103.vcf.gz ${sample}.rtg.vcf.gz -f ${benchmarking_dir}/Quartet.callable.voted.collapse.bed --threads $nt -o ${sample} -r ${ref_dir}/${fasta}
elif [[ ${sample} =~ "LCL8" ]]; then
    /opt/hap.py/bin/hap.py ${benchmarking_dir}/LCL8.ref.v20201103.vcf.gz ${sample}.rtg.vcf.gz -f ${benchmarking_dir}/Quartet.callable.voted.collapse.bed --threads $nt -o ${sample} -r ${ref_dir}/${fasta}
else
    echo "only for Quartet samples"
fi
# Exclude variants falling inside the del_bed regions; the filtered VCF feeds the Mendelian analysis
task filtered {
    File raw_vcf
    File del_bed
    String family_name = basename(raw_vcf,".family.vcf")
    String docker
    String cluster_config
    String disk_size

    command <<<
        /opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg bgzip ${raw_vcf} -c > ${family_name}.rtg.vcf.gz
        /opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg index -f vcf ${family_name}.rtg.vcf.gz
        /opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg vcffilter -i ${family_name}.rtg.vcf.gz -o ${family_name}.noDEL.vcf.gz --exclude-bed=${del_bed}
        gunzip ${family_name}.noDEL.vcf.gz
    >>>

    runtime {
        docker:docker
        cluster:cluster_config
        systemDisk:"cloud_ssd 40"
        dataDisk:"cloud_ssd " + disk_size + " /cromwell_root/"
    }

    output {
        File noDEL_vcf = "${family_name}.noDEL.vcf"
    }
}
task mendelian {
    File family_vcf
    File ref_dir
    String family_name = basename(family_vcf,".noDEL.vcf")
    String fasta
    String docker
    String cluster_config
import "./tasks/split_gvcf_files.wdl" as split_gvcf_files | import "./tasks/split_gvcf_files.wdl" as split_gvcf_files | ||||
import "./tasks/GVCFtyper.wdl" as GVCFtyper | import "./tasks/GVCFtyper.wdl" as GVCFtyper | ||||
import "./tasks/benchmark.wdl" as benchmark | import "./tasks/benchmark.wdl" as benchmark | ||||
import "./tasks/filtered.wdl" as filtered | |||||
import "./tasks/mendelian.wdl" as mendelian | import "./tasks/mendelian.wdl" as mendelian | ||||
import "./tasks/merge_mendelian.wdl" as merge_mendelian | import "./tasks/merge_mendelian.wdl" as merge_mendelian | ||||
import "./tasks/quartet_mendelian.wdl" as quartet_mendelian | import "./tasks/quartet_mendelian.wdl" as quartet_mendelian | ||||
String SENTIEON_INSTALL_DIR
String SENTIEONdocker
String fasta
File ref_dir
File benchmarking_dir
File del_bed
String project

Array[File] family_vcfs = merge_family.family_vcf

scatter (idx in range(length(family_vcfs))) {
    call filtered.filtered as filtered {
        input:
            raw_vcf=family_vcfs[idx],
            del_bed=del_bed,
            docker=BENCHMARKdocker,
            cluster_config=BIGcluster_config,
            disk_size=disk_size
    }
    call mendelian.mendelian as mendelian {
        input:
            family_vcf=filtered.noDEL_vcf,
            ref_dir=ref_dir,
            fasta=fasta,
            docker=MENDELIANdocker,