пре 5 година · b531b59496
--- a/inputs
+++ b/inputs
@@ -0,0 +1,18 @@
 {
  "{{ project_name }}.benchmarking_dir": "oss://chinese-quartet/quartet-result-data/NCTR_benchmarking_20181215/",
  "{{ project_name }}.vcfstat.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-hap:latest",
  "{{ project_name }}.fasta": "GRCh38.d1.vd1.fa",
  "{{ project_name }}.benchmark.cluster_config": "OnDemand bcs.a2.3xlarge img-ubuntu-vpc",
  "{{ project_name }}.benchmark.disk_size": "150",
  "{{ project_name }}.vcfstat.disk_size": "100",
  "{{ project_name }}.benchmark.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-hap:latest",
  "{{ project_name }}.inputSamplesFile": "{{ inputSamplesFile }}",
  "{{ project_name }}.mergeNum.docker": "registry.cn-shanghai.aliyuncs.com/pgx-docker-registry/gatk:v2019.01",
  "{{ project_name }}.mergeNum.disk_size": "100",
  "{{ project_name }}.multiqc.cluster_config": "OnDemand bcs.a2.3xlarge img-ubuntu-vpc",
  "{{ project_name }}.multiqc.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/multiqc:v1.8",
  "{{ project_name }}.mergeNum.cluster_config": "OnDemand bcs.a2.large img-ubuntu-vpc",
  "{{ project_name }}.vcfstat.cluster_config": "OnDemand bcs.a2.3xlarge img-ubuntu-vpc",
  "{{ project_name }}.multiqc.disk_size": "100",
  "{{ project_name }}.ref_dir": "oss://chinese-quartet/quartet-storage-data/reference_data/"
 }
--- a/tasks/.DS_Store
+++ b/tasks/.DS_Store
--- a/tasks/benchmark.wdl
+++ b/tasks/benchmark.wdl
@@ -0,0 +1,64 @@
 # benchmark: compare one call-set VCF against the quartet reference call set
 # with hap.py.
 #
 # Inputs
 #   vcf              uncompressed query VCF; its basename without ".vcf" is
 #                    used as the prefix of every produced file
 #   benchmarking_dir directory holding <sample_mark>.vcf.gz (truth calls) and
 #                    <sample_mark>.bed.gz (regions passed to hap.py -f)
 #   ref_dir          directory holding the reference FASTA
 #   sample_mark      one of LCL5, LCL6, LCL7, LCL8; anything else fails the task
 #   fasta            file name of the reference FASTA inside ref_dir
 #   docker, cluster_config, disk_size
 #                    runtime settings forwarded to the backend
 #
 # Outputs
 #   rtg_vcf / rtg_vcf_index   bgzip-compressed, indexed query VCF restricted
 #                             to chr1-chr22 and chrX
 #   remaining files           written by hap.py under the sample prefix
 task benchmark {
 	File vcf
 	File benchmarking_dir
 	File ref_dir
 	String sample = basename(vcf,".vcf")
 	String sample_mark
 	String fasta
 	String docker
 	String cluster_config
 	String disk_size


 	command <<<
 		set -o pipefail
 		set -e

 		# Fail fast, before the reference copy, when the sample is not one of
 		# the four quartet members that have a truth set.
 		case "${sample_mark}" in
 			LCL5|LCL6|LCL7|LCL8)
 				;;
 			*)
 				echo "only for quartet samples" >&2
 				exit 1
 				;;
 		esac

 		nt=$(nproc)
 		mkdir -p /cromwell_root/tmp
 		cp -r ${ref_dir} /cromwell_root/tmp/

 		# Use the fasta input here as well so HGREF and the -r option below
 		# always point at the same reference file name.
 		export HGREF=/cromwell_root/tmp/reference_data/${fasta}

 		# Split on lines that START with '#', so a record that merely contains
 		# a '#' somewhere is not mistaken for a header line.
 		grep '^#' ${vcf} > header
 		grep -v '^#' ${vcf} > body
 		# Keep autosomes and chrX only; -w stops chr1 from matching chr10 etc.
 		grep -w '^chr1\|^chr2\|^chr3\|^chr4\|^chr5\|^chr6\|^chr7\|^chr8\|^chr9\|^chr10\|^chr11\|^chr12\|^chr13\|^chr14\|^chr15\|^chr16\|^chr17\|^chr18\|^chr19\|^chr20\|^chr21\|^chr22\|^chrX' body > body.filtered
 		cat header body.filtered > ${sample}.filtered.vcf

 		# The compressed file must carry the name that hap.py reads and that the
 		# output section declares (sample prefix + .rtg.vcf.gz).
 		/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg bgzip ${sample}.filtered.vcf -c > ${sample}.rtg.vcf.gz
 		/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg index -f vcf ${sample}.rtg.vcf.gz

 		# Truth VCF and region BED are selected by the validated sample mark.
 		/opt/hap.py/bin/hap.py ${benchmarking_dir}/${sample_mark}.vcf.gz ${sample}.rtg.vcf.gz -f ${benchmarking_dir}/${sample_mark}.bed.gz --threads $nt -o ${sample} -r ${ref_dir}/${fasta}
 	>>>

 	runtime {
 		docker:docker
 		cluster:cluster_config
 		systemDisk:"cloud_ssd 40"
 		dataDisk:"cloud_ssd " + disk_size + " /cromwell_root/"
 	}

 	output {
 		File rtg_vcf = "${sample}.rtg.vcf.gz"
 		File rtg_vcf_index = "${sample}.rtg.vcf.gz.tbi"
 		File gzip_vcf = "${sample}.vcf.gz"
 		File gzip_vcf_index = "${sample}.vcf.gz.tbi"
 		File roc_all_csv = "${sample}.roc.all.csv.gz"
 		File roc_indel = "${sample}.roc.Locations.INDEL.csv.gz"
 		File roc_indel_pass = "${sample}.roc.Locations.INDEL.PASS.csv.gz"
 		File roc_snp = "${sample}.roc.Locations.SNP.csv.gz"
 		File roc_snp_pass = "${sample}.roc.Locations.SNP.PASS.csv.gz"
 		File summary = "${sample}.summary.csv"
 		File extended = "${sample}.extended.csv"
 		File metrics = "${sample}.metrics.json.gz"
 	}
 }
--- a/tasks/mergeNum.wdl
+++ b/tasks/mergeNum.wdl
@@ -0,0 +1,26 @@
 # mergeNum: combine per-sample statistics files into one tab-separated table.
 #
 # Inputs
 #   vcfnumber  statistics files made of "name : value" lines (the workflow
 #              passes the onestats.txt files written by the vcfstat task)
 #   docker, cluster_config, disk_size
 #              runtime settings forwarded to the backend
 #
 # Output
 #   vcfstat    vcfstats.txt: one header line followed by one row per input
 #              file, each row holding the text after the first ':' of every
 #              input line, joined with tabs
 task mergeNum {
 	Array[File] vcfnumber
 	String docker
 	String cluster_config
 	String disk_size

 	command <<<
 		set -o pipefail
 		set -e

 		# Write the header first so the table is well-formed even when there
 		# are no input files, and so no sed line-insertion extension is needed.
 		printf 'File\tFailed Filters\tPassed Filters\tSNPs\tMNPs\tInsertions\tDeletions\tIndels\tSame as reference\tSNP Transitions/Transversions\tTotal Het/Hom ratio\tSNP Het/Hom ratio\tMNP Het/Hom ratio\tInsertion Het/Hom ratio\tDeletion Het/Hom ratio\tIndel Het/Hom ratio\tInsertion/Deletion ratio\tIndel/SNP+MNP ratio\n' > vcfstats.txt

 		for i in ${sep=" " vcfnumber}
 		do
 			# paste -s joins all lines with tabs and always ends the row with a
 			# newline, so rows cannot run together when an input file lacks a
 			# trailing newline.
 			cut -d':' -f2 "$i" | paste -s - >> vcfstats.txt
 		done
 	>>>

 	runtime {
 		docker:docker
 		cluster:cluster_config
 		systemDisk:"cloud_ssd 40"
 		dataDisk:"cloud_ssd " + disk_size + " /cromwell_root/"
 	}

 	output {
 		File vcfstat="vcfstats.txt"
 	}
 }
--- a/tasks/multiqc.wdl
+++ b/tasks/multiqc.wdl
@@ -0,0 +1,31 @@
 # multiqc: gather the given summary files into one directory and run MultiQC
 # over it.
 #
 # Inputs
 #   summary    files to aggregate (the workflow passes benchmark.summary)
 #   docker, cluster_config, disk_size
 #              runtime settings forwarded to the backend
 #
 # Outputs
 #   multiqc_html  multiqc_report.html from the working directory
 #   multiqc_txt   every file matched by multiqc_data/*
 task multiqc {
 	Array[File] summary
 	String docker
 	String cluster_config
 	String disk_size

 	command <<<
 		set -eo pipefail

 		# MultiQC scans this directory; the inputs are staged one level below.
 		scan_root=/cromwell_root/tmp
 		mkdir -p "$scan_root/benchmark"
 		cp ${sep=" " summary} "$scan_root/benchmark"

 		multiqc "$scan_root/"
 	>>>

 	runtime {
 		docker: docker
 		cluster: cluster_config
 		systemDisk: "cloud_ssd 40"
 		dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
 	}

 	output {
 		File multiqc_html = "multiqc_report.html"
 		Array[File] multiqc_txt = glob("multiqc_data/*")
 	}
 }
--- a/tasks/vcfstat.wdl
+++ b/tasks/vcfstat.wdl
@@ -0,0 +1,24 @@
 # vcfstat: run "rtg vcfstats" on one compressed VCF and capture its report.
 #
 # Inputs
 #   rtg_vcf        VCF handed to rtg vcfstats
 #   rtg_vcf_index  declared as an input but not referenced by the command
 #   docker, cluster_config, disk_size
 #                  runtime settings forwarded to the backend
 #
 # Output
 #   vcfnumber      onestats.txt, the standard output of rtg vcfstats
 task vcfstat {
 	File rtg_vcf
 	File rtg_vcf_index
 	String docker
 	String cluster_config
 	String disk_size

 	command <<<
 		set -eo pipefail

 		rtg_bin=/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg
 		"$rtg_bin" vcfstats ${rtg_vcf} > onestats.txt
 	>>>

 	runtime {
 		docker: docker
 		cluster: cluster_config
 		systemDisk: "cloud_ssd 40"
 		dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
 	}

 	output {
 		File vcfnumber = "onestats.txt"
 	}
 }
--- a/workflow.wdl
+++ b/workflow.wdl
@@ -0,0 +1,44 @@
 import "./tasks/benchmark.wdl" as benchmark
 import "./tasks/multiqc.wdl" as multiqc
 import "./tasks/vcfstat.wdl" as vcfstat
 import "./tasks/mergeNum.wdl" as mergeNum

 # Top-level workflow. The workflow name is a template placeholder that is
 # filled in before the file is handed to the engine.
 #
 # For every row of the sample sheet it runs benchmark and then vcfstat; after
 # the scatter it runs multiqc over all benchmark summaries and mergeNum over
 # all vcfstat reports.
 #
 # The docker, cluster_config and disk_size inputs of the four tasks are not set
 # in the call blocks below, so they have to be provided through the inputs file.
 workflow {{ project_name }} {

 	# Tab-separated sample sheet, one row per sample: column 0 is passed to
 	# benchmark as vcf, column 1 as sample_mark.
 	File inputSamplesFile
 	# NOTE(review): rows are typed Array[File] although column 1 feeds the String
 	# input sample_mark -- confirm the File-to-String coercion is intended.
 	Array[Array[File]] inputSamples = read_tsv(inputSamplesFile)
 	# Forwarded unchanged to every benchmark call.
 	File benchmarking_dir
 	File ref_dir
 	String fasta

 	scatter (sample in inputSamples) {

 		call benchmark.benchmark as benchmark {
 			input:
 			vcf=sample[0],
 			benchmarking_dir=benchmarking_dir,
 			ref_dir=ref_dir,
 			sample_mark=sample[1],
 			fasta=fasta
 		}

 		# Runs on the compressed VCF and index produced by benchmark for this row.
 		call vcfstat.vcfstat as vcfstat {
 			input:
 			rtg_vcf=benchmark.rtg_vcf,
 			rtg_vcf_index=benchmark.rtg_vcf_index
 		}	

 	}

 	# Outside the scatter, benchmark.summary is the array of all per-row summaries.
 	call multiqc.multiqc as multiqc {
 		input:
 		summary=benchmark.summary
 	}

 	# Likewise vcfstat.vcfnumber is the array of all per-row statistics files.
 	call mergeNum.mergeNum as mergeNum {
 		input:
 		vcfnumber=vcfstat.vcfnumber	
 	}

 }