Parcourir la source

整理信息

master
LUYAO REN il y a 5 ans
Parent
révision
9cd224dbd8
7 fichiers modifiés avec 133 ajouts et 22 suppressions
  1. +65
    -0
      codescripts/extract_multiqc.py
  2. +0
    -1
      tasks/Haplotyper_gVCF.wdl
  3. +29
    -0
      tasks/extract_multiqc.wdl
  4. +11
    -12
      tasks/mendelian.wdl
  5. +7
    -7
      tasks/merge_mendelian.wdl
  6. +9
    -0
      tasks/multiqc.wdl
  7. +12
    -2
      workflow.wdl

+ 65
- 0
codescripts/extract_multiqc.py Voir le fichier

@@ -0,0 +1,65 @@
"""Extract per-sample QC tables from MultiQC output.

Reads four MultiQC data files (general stats, the FastQC module summary,
FastQ Screen percentages and the hap.py benchmark JSON) and writes one
tab-separated result table per tool into the working directory.
"""
import json
import sys, argparse, os

import pandas as pd


def split_general_stats(general_stats):
    """Split multiqc_general_stats into FastQC and QualiMap tables.

    Returns (fastqc_stat, qualimap_stat). Rows with any missing value are
    dropped, so each table keeps only the samples that tool reported on.
    """
    # .copy() so insert() writes to an independent frame, not a slice view
    # of the input (avoids SettingWithCopy and mutating the caller's data).
    fastqc = general_stats.loc[:, general_stats.columns.str.startswith('FastQC')].copy()
    fastqc.insert(loc=0, column='Sample', value=general_stats['Sample'])

    qualimap = general_stats.loc[:, general_stats.columns.str.startswith('QualiMap')].copy()
    qualimap.insert(loc=0, column='Sample', value=general_stats['Sample'])

    return fastqc.dropna(), qualimap.dropna()


def merge_fastqc_modules(fastqc_stat, fastqc_table):
    """Join the FastQC stats with the per-module status columns.

    The label slice takes every column from per_base_sequence_quality
    through kmer_content, in the order they appear in multiqc_fastqc.txt.
    """
    modules = fastqc_table.loc[:, "per_base_sequence_quality":"kmer_content"].copy()
    modules.insert(loc=0, column='Sample', value=fastqc_table['Sample'])
    return pd.merge(fastqc_stat, modules, how='outer', on='Sample')


def extract_fastqscreen(screen_table):
    """Keep only the *percentage columns; strip '_screen' from sample names."""
    percentages = screen_table.loc[:, screen_table.columns.str.endswith('percentage')].copy()
    samples = [name.replace('_screen', '') for name in screen_table['Sample']]
    percentages.insert(loc=0, column='Sample', value=samples)
    return percentages


def extract_benchmark(happy):
    """Pull per-sample precision/recall out of multiqc_happy_data.json.

    Only the columns ending in 'ALL' are kept (hap.py reports one column
    per sample/filter combination — presumably ALL vs PASS; verify against
    the MultiQC hap.py module output).
    """
    dat = pd.DataFrame.from_records(happy)
    dat = dat.loc[:, dat.columns.str.endswith('ALL')]
    benchmark = dat.T.loc[:, ['sample_id', 'METRIC.Precision', 'METRIC.Recall']]
    benchmark.columns = ['Sample', 'Precision', 'Recall']
    return benchmark


def main():
    """Parse CLI arguments, extract each table and write the result files."""
    parser = argparse.ArgumentParser(description="This script is to get information from multiqc")
    parser.add_argument('-fastqc_qualimap', '--fastqc_qualimap', type=str, help='multiqc_general_stats.txt', required=True)
    parser.add_argument('-fastqc', '--fastqc', type=str, help='multiqc_fastqc.txt', required=True)
    parser.add_argument('-fastqscreen', '--fastqscreen', type=str, help='multiqc_fastq_screen.txt', required=True)
    parser.add_argument('-hap', '--happy', type=str, help='multiqc_happy_data.json', required=True)
    args = parser.parse_args()

    # fastqc and qualimap share multiqc_general_stats.txt
    general_stats = pd.read_table(args.fastqc_qualimap)
    fastqc_stat, qualimap_stat = split_general_stats(general_stats)

    fastqc_all = merge_fastqc_modules(fastqc_stat, pd.read_table(args.fastqc))

    fastqscreen = extract_fastqscreen(pd.read_table(args.fastqscreen))

    with open(args.happy) as hap_json:
        happy = json.load(hap_json)
    benchmark = extract_benchmark(happy)

    # index=False, not 0: same effect, but says what it means.
    fastqc_all.to_csv('fastqc.final.result.txt', sep="\t", index=False)
    fastqscreen.to_csv('fastqscreen.final.result.txt', sep="\t", index=False)
    qualimap_stat.to_csv('qualimap.final.result.txt', sep="\t", index=False)
    benchmark.to_csv('benchmark.final.result.txt', sep="\t", index=False)


if __name__ == '__main__':
    main()







+ 0
- 1
tasks/Haplotyper_gVCF.wdl Voir le fichier

@@ -1,6 +1,5 @@
task Haplotyper_gVCF {
File ref_dir
String SENTIEON_INSTALL_DIR
String fasta

+ 29
- 0
tasks/extract_multiqc.wdl Voir le fichier

@@ -0,0 +1,29 @@
# Split the MultiQC summary files into per-tool result tables by running
# extract_multiqc.py (the command expects the script at /opt inside the
# container image).
task extract_multiqc {

# The four MultiQC data files to dissect.
File fastqc_qualimap
File fastqc
File fastqscreen
File hap

# Execution settings: container image, cluster spec and data-disk size
# (cluster/systemDisk/dataDisk are backend-specific runtime keys —
# presumably an Alibaba Cloud batch-compute backend; confirm against the
# Cromwell backend configuration).
String docker
String cluster_config
String disk_size

command <<<
python /opt/extract_multiqc.py -fastqc_qualimap ${fastqc_qualimap} -fastqc ${fastqc} -fastqscreen ${fastqscreen} -hap ${hap}
>>>

runtime {
docker:docker
cluster:cluster_config
systemDisk:"cloud_ssd 40"
dataDisk:"cloud_ssd " + disk_size + " /cromwell_root/"
}

# One tab-separated table per QC tool, written by the python script
# into the task working directory.
output {
File fastqc_result = "fastqc.final.result.txt"
File fastqscreen_result = "fastqscreen.final.result.txt"
File qualimap_result = "qualimap.final.result.txt"
File hap_result = "benchmark.final.result.txt"
}
}

+ 11
- 12
tasks/mendelian.wdl Voir le fichier

@@ -1,8 +1,8 @@
task mendelian {
File family_vcf
File ref_dir
String family_name = basename(family_vcf,".family.vcf")
String fasta
String project
String docker
String cluster_config
String disk_size
@@ -11,19 +11,19 @@ task mendelian {
export LD_LIBRARY_PATH=/opt/htslib-1.9
nt=$(nproc)

echo -e "${project}\tLCL8\t0\t0\t2\t-9\n${project}\tLCL7\t0\t0\t1\t-9\n${project}\tLCL5\tLCL7\tLCL8\t2\t-9" > ${project}.D5.ped
echo -e "${family_name}\tLCL8\t0\t0\t2\t-9\n${family_name}\tLCL7\t0\t0\t1\t-9\n${family_name}\tLCL5\tLCL7\tLCL8\t2\t-9" > ${family_name}.D5.ped

mkdir VBT_D5
/opt/VBT-TrioAnalysis/vbt mendelian -ref ${ref_dir}/${fasta} -mother ${family_vcf} -father ${family_vcf} -child ${family_vcf} -pedigree ${project}.D5.ped -outDir VBT_D5 -out-prefix ${project}.D5 --output-violation-regions -thread-count $nt
/opt/VBT-TrioAnalysis/vbt mendelian -ref ${ref_dir}/${fasta} -mother ${family_vcf} -father ${family_vcf} -child ${family_vcf} -pedigree ${family_name}.D5.ped -outDir VBT_D5 -out-prefix ${family_name}.D5 --output-violation-regions -thread-count $nt

cat VBT_D5/${project}.D5_trio.vcf > ${project}.D5.vcf
cat VBT_D5/${family_name}.D5_trio.vcf > ${family_name}.D5.vcf

echo -e "${project}\tLCL8\t0\t0\t2\t-9\n${project}\tLCL7\t0\t0\t1\t-9\n${project}\tLCL6\tLCL7\tLCL8\t2\t-9" > ${project}.D6.ped
echo -e "${family_name}\tLCL8\t0\t0\t2\t-9\n${family_name}\tLCL7\t0\t0\t1\t-9\n${family_name}\tLCL6\tLCL7\tLCL8\t2\t-9" > ${family_name}.D6.ped

mkdir VBT_D6
/opt/VBT-TrioAnalysis/vbt mendelian -ref ${ref_dir}/${fasta} -mother ${family_vcf} -father ${family_vcf} -child ${family_vcf} -pedigree ${project}.D6.ped -outDir VBT_D6 -out-prefix ${project}.D6 --output-violation-regions -thread-count $nt
/opt/VBT-TrioAnalysis/vbt mendelian -ref ${ref_dir}/${fasta} -mother ${family_vcf} -father ${family_vcf} -child ${family_vcf} -pedigree ${family_name}.D6.ped -outDir VBT_D6 -out-prefix ${family_name}.D6 --output-violation-regions -thread-count $nt

cat VBT_D6/${project}.D6_trio.vcf > ${project}.D6.vcf
cat VBT_D6/${family_name}.D6_trio.vcf > ${family_name}.D6.vcf
>>>

runtime {
@@ -33,13 +33,12 @@ task mendelian {
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
}
output {
File D5_ped = "${project}.D5.ped"
File D6_ped = "${project}.D6.ped"
File D5_ped = "${family_name}.D5.ped"
File D6_ped = "${family_name}.D6.ped"
Array[File] D5_mendelian = glob("VBT_D5/*")
Array[File] D6_mendelian = glob("VBT_D6/*")
File D5_trio_vcf = "${project}.D5.vcf"
File D6_trio_vcf = "${project}.D6.vcf"
File family_all_vcf = "${project}.vcf"
File D5_trio_vcf = "${family_name}.D5.vcf"
File D6_trio_vcf = "${family_name}.D6.vcf"
}
}


+ 7
- 7
tasks/merge_mendelian.wdl Voir le fichier

@@ -2,14 +2,14 @@ task merge_mendelian {
File D5_trio_vcf
File D6_trio_vcf
File family_vcf
String project
String family_name = basename(family_vcf,".family.vcf")
String docker
String cluster_config
String disk_size
command <<<
cat ${D5_trio_vcf} | grep -v '##' > ${project}.D5.txt
cat ${D6_trio_vcf} | grep -v '##' > ${project}.D6.txt
cat ${D5_trio_vcf} | grep -v '##' > ${family_name}.D5.txt
cat ${D6_trio_vcf} | grep -v '##' > ${family_name}.D6.txt
cat ${family_vcf} | grep -v '##' | awk '
BEGIN { OFS = "\t" }
NF > 2 && FNR > 1 {
@@ -18,8 +18,8 @@ task merge_mendelian {
}
}
{ print }
' > ${project}.consensus.txt
python /opt/merge_two_family_with_genotype.py -LCL5 ${project}.D5.txt -LCL6 ${project}.D6.txt -genotype ${project}.consensus.txt -family ${project}.mendelian
' > ${family_name}.consensus.txt
python /opt/merge_two_family_with_genotype.py -LCL5 ${family_name}.D5.txt -LCL6 ${family_name}.D6.txt -genotype ${family_name}.consensus.txt -family ${family_name}
>>>

runtime {
@@ -29,7 +29,7 @@ task merge_mendelian {
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
}
output {
File project_mendelian = "${project}.mendelian.txt"
File project_mendelian_summary = "${project}.mendelian.summary.txt"
File project_mendelian = "${family_name}.mendelian.txt"
File project_mendelian_summary = "${family_name}.mendelian.summary.txt"
}
}

+ 9
- 0
tasks/multiqc.wdl Voir le fichier

@@ -30,6 +30,11 @@ task multiqc {
done

multiqc /cromwell_root/tmp/

cp multiqc_data/multiqc_general_stats.txt > multiqc_general_stats.txt
cp multiqc_data/multiqc_fastqc.txt > multiqc_fastqc.txt
cp multiqc_data/multiqc_fastq_screen.txt > multiqc_fastq_screen.txt
cp multiqc_data/multiqc_happy_data.json > multiqc_happy_data.json
>>>

@@ -43,5 +48,9 @@ task multiqc {
output {
File multiqc_html = "multiqc_report.html"
Array[File] multiqc_txt = glob("multiqc_data/*")
File fastqc_qualimap = "multiqc_general_stats.txt"
File fastqc = "multiqc_fastqc.txt"
File fastqscreen = "multiqc_fastq_screen.txt"
File hap = "multiqc_happy_data.json"
}
}

+ 12
- 2
workflow.wdl Voir le fichier

@@ -15,6 +15,7 @@ import "./tasks/quartet_mendelian.wdl" as quartet_mendelian
import "./tasks/fastqc.wdl" as fastqc
import "./tasks/fastqscreen.wdl" as fastqscreen
import "./tasks/qualimap.wdl" as qualimap
import "./tasks/extract_multiqc.wdl" as extract_multiqc

workflow {{ project_name }} {

@@ -224,6 +225,17 @@ workflow {{ project_name }} {
disk_size=disk_size
}

call extract_multiqc.extract_multiqc as extract_multiqc {
input:
fastqc_qualimap=multiqc.fastqc_qualimap,
fastqc=multiqc.fastqc,
fastqscreen=multiqc.fastqscreen,
hap=multiqc.hap,
docker=DIYdocker,
cluster_config=SMALLcluster_config,
disk_size=disk_size
}

Array[File] family_vcfs = split_gvcf_files.family_vcf

scatter (idx in range(length(family_vcfs))) {
@@ -232,7 +244,6 @@ workflow {{ project_name }} {
family_vcf=family_vcfs[idx],
ref_dir=ref_dir,
fasta=fasta,
project=project,
docker=MENDELIANdocker,
cluster_config=BIGcluster_config,
disk_size=disk_size
@@ -243,7 +254,6 @@ workflow {{ project_name }} {
D5_trio_vcf=mendelian.D5_trio_vcf,
D6_trio_vcf=mendelian.D6_trio_vcf,
family_vcf=family_vcfs[idx],
project=project,
docker=DIYdocker,
cluster_config=SMALLcluster_config,
disk_size=disk_size

Chargement…
Annuler
Enregistrer