Organize information

Branch: master
Author: LUYAO REN, 5 years ago
Parent commit: 9cd224dbd8

7 changed files, with 133 additions and 22 deletions:
  1. codescripts/extract_multiqc.py (+65 / -0)
  2. tasks/Haplotyper_gVCF.wdl (+0 / -1)
  3. tasks/extract_multiqc.wdl (+29 / -0)
  4. tasks/mendelian.wdl (+11 / -12)
  5. tasks/merge_mendelian.wdl (+7 / -7)
  6. tasks/multiqc.wdl (+9 / -0)
  7. workflow.wdl (+12 / -2)

codescripts/extract_multiqc.py (+65 / -0)

import json
import argparse

import pandas as pd

parser = argparse.ArgumentParser(description="Extract per-tool summary tables from MultiQC output")

parser.add_argument('-fastqc_qualimap', '--fastqc_qualimap', type=str, help='multiqc_general_stats.txt', required=True)
parser.add_argument('-fastqc', '--fastqc', type=str, help='multiqc_fastqc.txt', required=True)
parser.add_argument('-fastqscreen', '--fastqscreen', type=str, help='multiqc_fastq_screen.txt', required=True)
parser.add_argument('-hap', '--happy', type=str, help='multiqc_happy_data.json', required=True)

args = parser.parse_args()

# Bind parsed arguments to local names
fastqc_qualimap_file = args.fastqc_qualimap
fastqc_file = args.fastqc
fastqscreen_file = args.fastqscreen
hap_file = args.happy


# FastQC and QualiMap summary columns from multiqc_general_stats.txt
dat = pd.read_table(fastqc_qualimap_file)

# FastQC: select columns by prefix, re-attach sample names, drop incomplete rows
fastqc = dat.loc[:, dat.columns.str.startswith('FastQC')].copy()
fastqc.insert(loc=0, column='Sample', value=dat['Sample'])
fastqc_stat = fastqc.dropna()

# QualiMap: same pattern
qualimap = dat.loc[:, dat.columns.str.startswith('QualiMap')].copy()
qualimap.insert(loc=0, column='Sample', value=dat['Sample'])
qualimap_stat = qualimap.dropna()

# FastQC per-module pass/warn/fail statuses from multiqc_fastqc.txt
dat = pd.read_table(fastqc_file)

fastqc_module = dat.loc[:, "per_base_sequence_quality":"kmer_content"].copy()
fastqc_module.insert(loc=0, column='Sample', value=dat['Sample'])
fastqc_all = pd.merge(fastqc_stat, fastqc_module, how='outer', on='Sample')

# FastQ Screen: per-genome percentage columns from multiqc_fastq_screen.txt
dat = pd.read_table(fastqscreen_file)
dat['Sample'] = [i.replace('_screen', '') for i in dat['Sample']]
fastqscreen = dat.loc[:, dat.columns.str.endswith('percentage')].copy()
fastqscreen.insert(loc=0, column='Sample', value=dat['Sample'])

# Benchmark metrics (precision/recall over ALL variants) from multiqc_happy_data.json
with open(hap_file) as hap_json:
    happy = json.load(hap_json)
dat = pd.DataFrame.from_records(happy)
dat = dat.loc[:, dat.columns.str.endswith('ALL')]
dat_transposed = dat.T
benchmark = dat_transposed.loc[:, ['sample_id', 'METRIC.Precision', 'METRIC.Recall']]
benchmark.columns = ['Sample', 'Precision', 'Recall']

# Output: one tab-separated table per tool
fastqc_all.to_csv('fastqc.final.result.txt', sep="\t", index=False)
fastqscreen.to_csv('fastqscreen.final.result.txt', sep="\t", index=False)
qualimap_stat.to_csv('qualimap.final.result.txt', sep="\t", index=False)
benchmark.to_csv('benchmark.final.result.txt', sep="\t", index=False)
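
The select-by-prefix pattern used above can be sanity-checked on a toy frame; the column names below are illustrative stand-ins, not the real MultiQC headers:

import pandas as pd

# Toy stand-in for multiqc_general_stats.txt (headers invented for illustration)
dat = pd.DataFrame({
    'Sample': ['LCL5_1', 'LCL6_1'],
    'FastQC_percent_gc': [41.0, 42.5],
    'QualiMap_mean_coverage': [30.2, None],
})

fastqc = dat.loc[:, dat.columns.str.startswith('FastQC')].copy()
fastqc.insert(loc=0, column='Sample', value=dat['Sample'])
print(fastqc.dropna())  # both rows survive: the FastQC column has no NaN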







tasks/Haplotyper_gVCF.wdl (+0 / -1)

 task Haplotyper_gVCF {
   File ref_dir
   String SENTIEON_INSTALL_DIR
   String fasta

tasks/extract_multiqc.wdl (+29 / -0)

task extract_multiqc {

  File fastqc_qualimap
  File fastqc
  File fastqscreen
  File hap

  String docker
  String cluster_config
  String disk_size

  command <<<
    python /opt/extract_multiqc.py -fastqc_qualimap ${fastqc_qualimap} -fastqc ${fastqc} -fastqscreen ${fastqscreen} -hap ${hap}
  >>>

  runtime {
    docker: docker
    cluster: cluster_config
    systemDisk: "cloud_ssd 40"
    dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
  }

  output {
    File fastqc_result = "fastqc.final.result.txt"
    File fastqscreen_result = "fastqscreen.final.result.txt"
    File qualimap_result = "qualimap.final.result.txt"
    File hap_result = "benchmark.final.result.txt"
  }
}
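
A quick way to smoke-test this task's command outside Cromwell is to run the extractor against an existing multiqc_data directory. The paths below are hypothetical local stand-ins (inside the container the script sits at /opt/extract_multiqc.py); a minimal sketch:

import subprocess

# Invoke the extractor the same way the WDL command block does (local paths assumed)
subprocess.run(
    [
        "python", "extract_multiqc.py",
        "-fastqc_qualimap", "multiqc_data/multiqc_general_stats.txt",
        "-fastqc", "multiqc_data/multiqc_fastqc.txt",
        "-fastqscreen", "multiqc_data/multiqc_fastq_screen.txt",
        "-hap", "multiqc_data/multiqc_happy_data.json",
    ],
    check=True,  # raise if the script exits non-zero
)
# On success the four *.final.result.txt tables appear in the working directory.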

tasks/mendelian.wdl (+11 / -12)

 task mendelian {
   File family_vcf
   File ref_dir
+  String family_name = basename(family_vcf,".family.vcf")
   String fasta
-  String project
   String docker
   String cluster_config
   String disk_size

   export LD_LIBRARY_PATH=/opt/htslib-1.9
   nt=$(nproc)

-  echo -e "${project}\tLCL8\t0\t0\t2\t-9\n${project}\tLCL7\t0\t0\t1\t-9\n${project}\tLCL5\tLCL7\tLCL8\t2\t-9" > ${project}.D5.ped
+  echo -e "${family_name}\tLCL8\t0\t0\t2\t-9\n${family_name}\tLCL7\t0\t0\t1\t-9\n${family_name}\tLCL5\tLCL7\tLCL8\t2\t-9" > ${family_name}.D5.ped

   mkdir VBT_D5
-  /opt/VBT-TrioAnalysis/vbt mendelian -ref ${ref_dir}/${fasta} -mother ${family_vcf} -father ${family_vcf} -child ${family_vcf} -pedigree ${project}.D5.ped -outDir VBT_D5 -out-prefix ${project}.D5 --output-violation-regions -thread-count $nt
+  /opt/VBT-TrioAnalysis/vbt mendelian -ref ${ref_dir}/${fasta} -mother ${family_vcf} -father ${family_vcf} -child ${family_vcf} -pedigree ${family_name}.D5.ped -outDir VBT_D5 -out-prefix ${family_name}.D5 --output-violation-regions -thread-count $nt

-  cat VBT_D5/${project}.D5_trio.vcf > ${project}.D5.vcf
+  cat VBT_D5/${family_name}.D5_trio.vcf > ${family_name}.D5.vcf

-  echo -e "${project}\tLCL8\t0\t0\t2\t-9\n${project}\tLCL7\t0\t0\t1\t-9\n${project}\tLCL6\tLCL7\tLCL8\t2\t-9" > ${project}.D6.ped
+  echo -e "${family_name}\tLCL8\t0\t0\t2\t-9\n${family_name}\tLCL7\t0\t0\t1\t-9\n${family_name}\tLCL6\tLCL7\tLCL8\t2\t-9" > ${family_name}.D6.ped

   mkdir VBT_D6
-  /opt/VBT-TrioAnalysis/vbt mendelian -ref ${ref_dir}/${fasta} -mother ${family_vcf} -father ${family_vcf} -child ${family_vcf} -pedigree ${project}.D6.ped -outDir VBT_D6 -out-prefix ${project}.D6 --output-violation-regions -thread-count $nt
+  /opt/VBT-TrioAnalysis/vbt mendelian -ref ${ref_dir}/${fasta} -mother ${family_vcf} -father ${family_vcf} -child ${family_vcf} -pedigree ${family_name}.D6.ped -outDir VBT_D6 -out-prefix ${family_name}.D6 --output-violation-regions -thread-count $nt

-  cat VBT_D6/${project}.D6_trio.vcf > ${project}.D6.vcf
+  cat VBT_D6/${family_name}.D6_trio.vcf > ${family_name}.D6.vcf
 >>>

 runtime {
   dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
 }
 output {
-  File D5_ped = "${project}.D5.ped"
-  File D6_ped = "${project}.D6.ped"
+  File D5_ped = "${family_name}.D5.ped"
+  File D6_ped = "${family_name}.D6.ped"
   Array[File] D5_mendelian = glob("VBT_D5/*")
   Array[File] D6_mendelian = glob("VBT_D6/*")
-  File D5_trio_vcf = "${project}.D5.vcf"
-  File D6_trio_vcf = "${project}.D6.vcf"
-  File family_all_vcf = "${project}.vcf"
+  File D5_trio_vcf = "${family_name}.D5.vcf"
+  File D6_trio_vcf = "${family_name}.D6.vcf"
 }
 }
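
Each echo line above expands to a three-line, six-column PED file: family ID, individual ID, father ID, mother ID, sex (1 = male, 2 = female), phenotype (-9 = missing). A minimal Python sketch of the D5 pedigree it emits, using an illustrative family_name (the task derives the real one from the VCF file name):

family_name = "Quartet"  # illustrative value only

# Rows mirror the echo -e payload: LCL5 is the child of father LCL7 and mother LCL8
rows = [
    (family_name, "LCL8", "0", "0", "2", "-9"),
    (family_name, "LCL7", "0", "0", "1", "-9"),
    (family_name, "LCL5", "LCL7", "LCL8", "2", "-9"),
]
with open(family_name + ".D5.ped", "w") as ped:
    for row in rows:
        ped.write("\t".join(row) + "\n")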



tasks/merge_mendelian.wdl (+7 / -7)

   File D5_trio_vcf
   File D6_trio_vcf
   File family_vcf
-  String project
+  String family_name = basename(family_vcf,".family.vcf")
   String docker
   String cluster_config
   String disk_size
   command <<<
-    cat ${D5_trio_vcf} | grep -v '##' > ${project}.D5.txt
-    cat ${D6_trio_vcf} | grep -v '##' > ${project}.D6.txt
+    cat ${D5_trio_vcf} | grep -v '##' > ${family_name}.D5.txt
+    cat ${D6_trio_vcf} | grep -v '##' > ${family_name}.D6.txt
     cat ${family_vcf} | grep -v '##' | awk '
     BEGIN { OFS = "\t" }
     NF > 2 && FNR > 1 {
       # keep only the leading GT subfield of each sample column
       for ( i = 10; i <= NF; i++ ) {
         split($i, a, ":"); $i = a[1]
       }
     }
     { print }
-    ' > ${project}.consensus.txt
-    python /opt/merge_two_family_with_genotype.py -LCL5 ${project}.D5.txt -LCL6 ${project}.D6.txt -genotype ${project}.consensus.txt -family ${project}.mendelian
+    ' > ${family_name}.consensus.txt
+    python /opt/merge_two_family_with_genotype.py -LCL5 ${family_name}.D5.txt -LCL6 ${family_name}.D6.txt -genotype ${family_name}.consensus.txt -family ${family_name}
   >>>

   runtime {
     dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
   }
   output {
-    File project_mendelian = "${project}.mendelian.txt"
-    File project_mendelian_summary = "${project}.mendelian.summary.txt"
+    File project_mendelian = "${family_name}.mendelian.txt"
+    File project_mendelian_summary = "${family_name}.mendelian.summary.txt"
   }
 }
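
The inner loop of the awk filter above is rendered as the usual GT-extraction idiom, so treat its exact bounds as an assumption: it trims every sample column down to the genotype subfield before merge_two_family_with_genotype.py compares calls. The same idea in a few lines of Python, on a toy VCF body line:

# Toy VCF data line: 9 fixed columns (CHROM..FORMAT), then one column per sample
line = "chr1\t12345\t.\tA\tG\t50\tPASS\t.\tGT:DP:GQ\t0/1:30:99\t1/1:28:99"

fields = line.split("\t")
# Keep only the text before the first ':' in each sample column (the GT subfield)
fields[9:] = [f.split(":")[0] for f in fields[9:]]
print("\t".join(fields))  # ends with ...\tGT:DP:GQ\t0/1\t1/1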

tasks/multiqc.wdl (+9 / -0)

   done

   multiqc /cromwell_root/tmp/

+  cp multiqc_data/multiqc_general_stats.txt multiqc_general_stats.txt
+  cp multiqc_data/multiqc_fastqc.txt multiqc_fastqc.txt
+  cp multiqc_data/multiqc_fastq_screen.txt multiqc_fastq_screen.txt
+  cp multiqc_data/multiqc_happy_data.json multiqc_happy_data.json
 >>>

 output {
   File multiqc_html = "multiqc_report.html"
   Array[File] multiqc_txt = glob("multiqc_data/*")
+  File fastqc_qualimap = "multiqc_general_stats.txt"
+  File fastqc = "multiqc_fastqc.txt"
+  File fastqscreen = "multiqc_fastq_screen.txt"
+  File hap = "multiqc_happy_data.json"
 }
 }

workflow.wdl (+12 / -2)

import "./tasks/fastqc.wdl" as fastqc import "./tasks/fastqc.wdl" as fastqc
import "./tasks/fastqscreen.wdl" as fastqscreen import "./tasks/fastqscreen.wdl" as fastqscreen
import "./tasks/qualimap.wdl" as qualimap import "./tasks/qualimap.wdl" as qualimap
import "./tasks/extract_multiqc.wdl" as extract_multiqc


workflow {{ project_name }} { workflow {{ project_name }} {


disk_size=disk_size disk_size=disk_size
} }


call extract_multiqc.extract_multiqc as extract_multiqc {
input:
fastqc_qualimap=multiqc.fastqc_qualimap,
fastqc=multiqc.fastqc,
fastqscreen=multiqc.fastqscreen,
hap=multiqc.hap,
docker=DIYdocker,
cluster_config=SMALLcluster_config,
disk_size=disk_size
}

Array[File] family_vcfs = split_gvcf_files.family_vcf Array[File] family_vcfs = split_gvcf_files.family_vcf


scatter (idx in range(length(family_vcfs))) { scatter (idx in range(length(family_vcfs))) {
family_vcf=family_vcfs[idx], family_vcf=family_vcfs[idx],
ref_dir=ref_dir, ref_dir=ref_dir,
fasta=fasta, fasta=fasta,
project=project,
docker=MENDELIANdocker, docker=MENDELIANdocker,
cluster_config=BIGcluster_config, cluster_config=BIGcluster_config,
disk_size=disk_size disk_size=disk_size
D5_trio_vcf=mendelian.D5_trio_vcf, D5_trio_vcf=mendelian.D5_trio_vcf,
D6_trio_vcf=mendelian.D6_trio_vcf, D6_trio_vcf=mendelian.D6_trio_vcf,
family_vcf=family_vcfs[idx], family_vcf=family_vcfs[idx],
project=project,
docker=DIYdocker, docker=DIYdocker,
cluster_config=SMALLcluster_config, cluster_config=SMALLcluster_config,
disk_size=disk_size disk_size=disk_size
