Parcourir la source

整理信息

master
LUYAO REN il y a 5 ans
Parent
révision
9cd224dbd8
7 fichiers modifiés avec 133 ajouts et 22 suppressions
  1. +65
    -0
      codescripts/extract_multiqc.py
  2. +0
    -1
      tasks/Haplotyper_gVCF.wdl
  3. +29
    -0
      tasks/extract_multiqc.wdl
  4. +11
    -12
      tasks/mendelian.wdl
  5. +7
    -7
      tasks/merge_mendelian.wdl
  6. +9
    -0
      tasks/multiqc.wdl
  7. +12
    -2
      workflow.wdl

+ 65
- 0
codescripts/extract_multiqc.py Voir le fichier

@@ -0,0 +1,65 @@
"""Extract per-sample QC tables from MultiQC output.

Reads four MultiQC data files (general stats, the FastQC module summary,
FastQ Screen percentages and the hap.py benchmark JSON) and writes one
tab-separated result table per tool into the working directory.
"""
import json
import sys, argparse, os

import pandas as pd


def split_general_stats(general_stats):
    """Split multiqc_general_stats into FastQC and QualiMap tables.

    Returns (fastqc_stat, qualimap_stat). Rows with any missing value are
    dropped, so each table keeps only the samples that tool reported on.
    """
    # .copy() so insert() writes to an independent frame, not a slice view
    # of the input (avoids SettingWithCopy and mutating the caller's data).
    fastqc = general_stats.loc[:, general_stats.columns.str.startswith('FastQC')].copy()
    fastqc.insert(loc=0, column='Sample', value=general_stats['Sample'])

    qualimap = general_stats.loc[:, general_stats.columns.str.startswith('QualiMap')].copy()
    qualimap.insert(loc=0, column='Sample', value=general_stats['Sample'])

    return fastqc.dropna(), qualimap.dropna()


def merge_fastqc_modules(fastqc_stat, fastqc_table):
    """Join the FastQC stats with the per-module status columns.

    The label slice takes every column from per_base_sequence_quality
    through kmer_content, in the order they appear in multiqc_fastqc.txt.
    """
    modules = fastqc_table.loc[:, "per_base_sequence_quality":"kmer_content"].copy()
    modules.insert(loc=0, column='Sample', value=fastqc_table['Sample'])
    return pd.merge(fastqc_stat, modules, how='outer', on='Sample')


def extract_fastqscreen(screen_table):
    """Keep only the *percentage columns; strip '_screen' from sample names."""
    percentages = screen_table.loc[:, screen_table.columns.str.endswith('percentage')].copy()
    samples = [name.replace('_screen', '') for name in screen_table['Sample']]
    percentages.insert(loc=0, column='Sample', value=samples)
    return percentages


def extract_benchmark(happy):
    """Pull per-sample precision/recall out of multiqc_happy_data.json.

    Only the columns ending in 'ALL' are kept (hap.py reports one column
    per sample/filter combination — presumably ALL vs PASS; verify against
    the MultiQC hap.py module output).
    """
    dat = pd.DataFrame.from_records(happy)
    dat = dat.loc[:, dat.columns.str.endswith('ALL')]
    benchmark = dat.T.loc[:, ['sample_id', 'METRIC.Precision', 'METRIC.Recall']]
    benchmark.columns = ['Sample', 'Precision', 'Recall']
    return benchmark


def main():
    """Parse CLI arguments, extract each table and write the result files."""
    parser = argparse.ArgumentParser(description="This script is to get information from multiqc")
    parser.add_argument('-fastqc_qualimap', '--fastqc_qualimap', type=str, help='multiqc_general_stats.txt', required=True)
    parser.add_argument('-fastqc', '--fastqc', type=str, help='multiqc_fastqc.txt', required=True)
    parser.add_argument('-fastqscreen', '--fastqscreen', type=str, help='multiqc_fastq_screen.txt', required=True)
    parser.add_argument('-hap', '--happy', type=str, help='multiqc_happy_data.json', required=True)
    args = parser.parse_args()

    # fastqc and qualimap share multiqc_general_stats.txt
    general_stats = pd.read_table(args.fastqc_qualimap)
    fastqc_stat, qualimap_stat = split_general_stats(general_stats)

    fastqc_all = merge_fastqc_modules(fastqc_stat, pd.read_table(args.fastqc))

    fastqscreen = extract_fastqscreen(pd.read_table(args.fastqscreen))

    with open(args.happy) as hap_json:
        happy = json.load(hap_json)
    benchmark = extract_benchmark(happy)

    # index=False, not 0: same effect, but says what it means.
    fastqc_all.to_csv('fastqc.final.result.txt', sep="\t", index=False)
    fastqscreen.to_csv('fastqscreen.final.result.txt', sep="\t", index=False)
    qualimap_stat.to_csv('qualimap.final.result.txt', sep="\t", index=False)
    benchmark.to_csv('benchmark.final.result.txt', sep="\t", index=False)


if __name__ == '__main__':
    main()







+ 0
- 1
tasks/Haplotyper_gVCF.wdl Voir le fichier

@@ -1,6 +1,5 @@
task Haplotyper_gVCF {
File ref_dir
String SENTIEON_INSTALL_DIR
String fasta

+ 29
- 0
tasks/extract_multiqc.wdl Voir le fichier

@@ -0,0 +1,29 @@
# Split the MultiQC summary files into per-tool result tables by running
# extract_multiqc.py (the command expects the script at /opt inside the
# container image).
task extract_multiqc {

# The four MultiQC data files to dissect.
File fastqc_qualimap
File fastqc
File fastqscreen
File hap

# Execution settings: container image, cluster spec and data-disk size
# (cluster/systemDisk/dataDisk are backend-specific runtime keys —
# presumably an Alibaba Cloud batch-compute backend; confirm against the
# Cromwell backend configuration).
String docker
String cluster_config
String disk_size

command <<<
python /opt/extract_multiqc.py -fastqc_qualimap ${fastqc_qualimap} -fastqc ${fastqc} -fastqscreen ${fastqscreen} -hap ${hap}
>>>

runtime {
docker:docker
cluster:cluster_config
systemDisk:"cloud_ssd 40"
dataDisk:"cloud_ssd " + disk_size + " /cromwell_root/"
}

# One tab-separated table per QC tool, written by the python script
# into the task working directory.
output {
File fastqc_result = "fastqc.final.result.txt"
File fastqscreen_result = "fastqscreen.final.result.txt"
File qualimap_result = "qualimap.final.result.txt"
File hap_result = "benchmark.final.result.txt"
}
}

+ 11
- 12
tasks/mendelian.wdl Voir le fichier

@@ -1,8 +1,8 @@
task mendelian {
File family_vcf
File ref_dir
String family_name = basename(family_vcf,".family.vcf")
String fasta
String project
String docker
String cluster_config
String disk_size
@@ -11,19 +11,19 @@ task mendelian {
export LD_LIBRARY_PATH=/opt/htslib-1.9
nt=$(nproc)

echo -e "${project}\tLCL8\t0\t0\t2\t-9\n${project}\tLCL7\t0\t0\t1\t-9\n${project}\tLCL5\tLCL7\tLCL8\t2\t-9" > ${project}.D5.ped
echo -e "${family_name}\tLCL8\t0\t0\t2\t-9\n${family_name}\tLCL7\t0\t0\t1\t-9\n${family_name}\tLCL5\tLCL7\tLCL8\t2\t-9" > ${family_name}.D5.ped

mkdir VBT_D5
/opt/VBT-TrioAnalysis/vbt mendelian -ref ${ref_dir}/${fasta} -mother ${family_vcf} -father ${family_vcf} -child ${family_vcf} -pedigree ${project}.D5.ped -outDir VBT_D5 -out-prefix ${project}.D5 --output-violation-regions -thread-count $nt
/opt/VBT-TrioAnalysis/vbt mendelian -ref ${ref_dir}/${fasta} -mother ${family_vcf} -father ${family_vcf} -child ${family_vcf} -pedigree ${family_name}.D5.ped -outDir VBT_D5 -out-prefix ${family_name}.D5 --output-violation-regions -thread-count $nt

cat VBT_D5/${project}.D5_trio.vcf > ${project}.D5.vcf
cat VBT_D5/${family_name}.D5_trio.vcf > ${family_name}.D5.vcf

echo -e "${project}\tLCL8\t0\t0\t2\t-9\n${project}\tLCL7\t0\t0\t1\t-9\n${project}\tLCL6\tLCL7\tLCL8\t2\t-9" > ${project}.D6.ped
echo -e "${family_name}\tLCL8\t0\t0\t2\t-9\n${family_name}\tLCL7\t0\t0\t1\t-9\n${family_name}\tLCL6\tLCL7\tLCL8\t2\t-9" > ${family_name}.D6.ped

mkdir VBT_D6
/opt/VBT-TrioAnalysis/vbt mendelian -ref ${ref_dir}/${fasta} -mother ${family_vcf} -father ${family_vcf} -child ${family_vcf} -pedigree ${project}.D6.ped -outDir VBT_D6 -out-prefix ${project}.D6 --output-violation-regions -thread-count $nt
/opt/VBT-TrioAnalysis/vbt mendelian -ref ${ref_dir}/${fasta} -mother ${family_vcf} -father ${family_vcf} -child ${family_vcf} -pedigree ${family_name}.D6.ped -outDir VBT_D6 -out-prefix ${family_name}.D6 --output-violation-regions -thread-count $nt

cat VBT_D6/${project}.D6_trio.vcf > ${project}.D6.vcf
cat VBT_D6/${family_name}.D6_trio.vcf > ${family_name}.D6.vcf
>>>

runtime {
@@ -33,13 +33,12 @@ task mendelian {
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
}
output {
File D5_ped = "${project}.D5.ped"
File D6_ped = "${project}.D6.ped"
File D5_ped = "${family_name}.D5.ped"
File D6_ped = "${family_name}.D6.ped"
Array[File] D5_mendelian = glob("VBT_D5/*")
Array[File] D6_mendelian = glob("VBT_D6/*")
File D5_trio_vcf = "${project}.D5.vcf"
File D6_trio_vcf = "${project}.D6.vcf"
File family_all_vcf = "${project}.vcf"
File D5_trio_vcf = "${family_name}.D5.vcf"
File D6_trio_vcf = "${family_name}.D6.vcf"
}
}


+ 7
- 7
tasks/merge_mendelian.wdl Voir le fichier

@@ -2,14 +2,14 @@ task merge_mendelian {
File D5_trio_vcf
File D6_trio_vcf
File family_vcf
String project
String family_name = basename(family_vcf,".family.vcf")
String docker
String cluster_config
String disk_size
command <<<
cat ${D5_trio_vcf} | grep -v '##' > ${project}.D5.txt
cat ${D6_trio_vcf} | grep -v '##' > ${project}.D6.txt
cat ${D5_trio_vcf} | grep -v '##' > ${family_name}.D5.txt
cat ${D6_trio_vcf} | grep -v '##' > ${family_name}.D6.txt
cat ${family_vcf} | grep -v '##' | awk '
BEGIN { OFS = "\t" }
NF > 2 && FNR > 1 {
@@ -18,8 +18,8 @@ task merge_mendelian {
}
}
{ print }
' > ${project}.consensus.txt
python /opt/merge_two_family_with_genotype.py -LCL5 ${project}.D5.txt -LCL6 ${project}.D6.txt -genotype ${project}.consensus.txt -family ${project}.mendelian
' > ${family_name}.consensus.txt
python /opt/merge_two_family_with_genotype.py -LCL5 ${family_name}.D5.txt -LCL6 ${family_name}.D6.txt -genotype ${family_name}.consensus.txt -family ${family_name}
>>>

runtime {
@@ -29,7 +29,7 @@ task merge_mendelian {
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
}
output {
File project_mendelian = "${project}.mendelian.txt"
File project_mendelian_summary = "${project}.mendelian.summary.txt"
File project_mendelian = "${family_name}.mendelian.txt"
File project_mendelian_summary = "${family_name}.mendelian.summary.txt"
}
}

+ 9
- 0
tasks/multiqc.wdl Voir le fichier

@@ -30,6 +30,11 @@ task multiqc {
done

multiqc /cromwell_root/tmp/

cp multiqc_data/multiqc_general_stats.txt > multiqc_general_stats.txt
cp multiqc_data/multiqc_fastqc.txt > multiqc_fastqc.txt
cp multiqc_data/multiqc_fastq_screen.txt > multiqc_fastq_screen.txt
cp multiqc_data/multiqc_happy_data.json > multiqc_happy_data.json
>>>

@@ -43,5 +48,9 @@ task multiqc {
output {
File multiqc_html = "multiqc_report.html"
Array[File] multiqc_txt = glob("multiqc_data/*")
File fastqc_qualimap = "multiqc_general_stats.txt"
File fastqc = "multiqc_fastqc.txt"
File fastqscreen = "multiqc_fastq_screen.txt"
File hap = "multiqc_happy_data.json"
}
}

+ 12
- 2
workflow.wdl Voir le fichier

@@ -15,6 +15,7 @@ import "./tasks/quartet_mendelian.wdl" as quartet_mendelian
import "./tasks/fastqc.wdl" as fastqc
import "./tasks/fastqscreen.wdl" as fastqscreen
import "./tasks/qualimap.wdl" as qualimap
import "./tasks/extract_multiqc.wdl" as extract_multiqc

workflow {{ project_name }} {

@@ -224,6 +225,17 @@ workflow {{ project_name }} {
disk_size=disk_size
}

call extract_multiqc.extract_multiqc as extract_multiqc {
input:
fastqc_qualimap=multiqc.fastqc_qualimap,
fastqc=multiqc.fastqc,
fastqscreen=multiqc.fastqscreen,
hap=multiqc.hap,
docker=DIYdocker,
cluster_config=SMALLcluster_config,
disk_size=disk_size
}

Array[File] family_vcfs = split_gvcf_files.family_vcf

scatter (idx in range(length(family_vcfs))) {
@@ -232,7 +244,6 @@ workflow {{ project_name }} {
family_vcf=family_vcfs[idx],
ref_dir=ref_dir,
fasta=fasta,
project=project,
docker=MENDELIANdocker,
cluster_config=BIGcluster_config,
disk_size=disk_size
@@ -243,7 +254,6 @@ workflow {{ project_name }} {
D5_trio_vcf=mendelian.D5_trio_vcf,
D6_trio_vcf=mendelian.D6_trio_vcf,
family_vcf=family_vcfs[idx],
project=project,
docker=DIYdocker,
cluster_config=SMALLcluster_config,
disk_size=disk_size

Chargement…
Annuler
Enregistrer