瀏覽代碼

del bed

master
LUYAO REN 4 年之前
父節點
當前提交
ff1b236d59
共有 6 個檔案被更改,包括 45 行新增74 行删除
  1. +0
    -65
      codescripts/extract_multiqc.py
  2. +2
    -1
      inputs
  3. +4
    -4
      tasks/benchmark.wdl
  4. +27
    -0
      tasks/filtered.wdl
  5. +1
    -1
      tasks/mendelian.wdl
  6. +11
    -3
      workflow.wdl

+ 0
- 65
codescripts/extract_multiqc.py 查看文件

@@ -1,65 +0,0 @@
import json
import pandas as pd
import sys, argparse, os

def _build_parser():
    """Create the command-line parser for the four MultiQC input files."""
    parser = argparse.ArgumentParser(description="This script is to get information from multiqc")

    parser.add_argument('-fastqc_qualimap', '--fastqc_qualimap', type=str, help='multiqc_general_stats.txt', required=True)
    parser.add_argument('-fastqc', '--fastqc', type=str, help='multiqc_fastqc.txt', required=True)
    parser.add_argument('-fastqscreen', '--fastqscreen', type=str, help='multiqc_fastq_screen.txt', required=True)
    parser.add_argument('-hap', '--happy', type=str, help='multiqc_happy_data.json', required=True)
    return parser


def _select_with_sample(table, column_selector, sample=None):
    """Return a copy of the selected columns with a leading 'Sample' column.

    Args:
        table: DataFrame that contains a 'Sample' column.
        column_selector: anything accepted by ``table.loc[:, ...]`` (a boolean
            mask over the columns or a label slice).
        sample: optional replacement values for the 'Sample' column; when
            omitted, ``table['Sample']`` is used unchanged.

    Raises:
        ValueError: if the selection already contains a 'Sample' column
            (raised by ``DataFrame.insert``).
    """
    # Copy first: inserting into a .loc slice of another frame is what
    # triggers pandas' chained-assignment / SettingWithCopy warnings.
    subset = table.loc[:, column_selector].copy()
    subset.insert(loc=0, column='Sample', value=table['Sample'] if sample is None else sample)
    return subset


def _load_benchmark(hap_file):
    """Read the hap.py JSON and return Sample / Precision / Recall rows.

    Only the records whose top-level key ends with 'ALL' are kept.
    """
    with open(hap_file, encoding="utf-8") as hap_json:
        happy = json.load(hap_json)
    records = pd.DataFrame.from_records(happy)
    records = records.loc[:, records.columns.str.endswith('ALL')]
    # Top-level keys become columns above, so transpose to get one row per key.
    benchmark = records.T.loc[:, ['sample_id', 'METRIC.Precision', 'METRIC.Recall']]
    benchmark.columns = ['Sample', 'Precision', 'Recall']
    return benchmark


def main(argv=None):
    """Extract FastQC, QualiMap, FastQ Screen and hap.py tables from MultiQC output.

    Args:
        argv: list of command-line arguments; ``None`` means ``sys.argv[1:]``.

    Writes four tab-separated files into the current working directory:
    ``fastqc.final.result.txt``, ``fastqscreen.final.result.txt``,
    ``qualimap.final.result.txt`` and ``benchmark.final.result.txt``.
    """
    args = _build_parser().parse_args(argv)

    # General stats: FastQC and QualiMap columns share one table, and rows
    # with any missing value in a tool's columns are dropped for that tool.
    general = pd.read_table(args.fastqc_qualimap)
    fastqc_stat = _select_with_sample(general, general.columns.str.startswith('FastQC')).dropna()
    qualimap_stat = _select_with_sample(general, general.columns.str.startswith('QualiMap')).dropna()

    # FastQC per-module columns, merged onto the FastQC general statistics.
    modules = pd.read_table(args.fastqc)
    fastqc_module = _select_with_sample(modules, slice("per_base_sequence_quality", "kmer_content"))
    fastqc_all = pd.merge(fastqc_stat, fastqc_module, how='outer', left_on=['Sample'], right_on=['Sample'])

    # FastQ Screen: keep the percentage columns and strip the '_screen'
    # suffix from the sample names without mutating the table that was read.
    screen = pd.read_table(args.fastqscreen)
    clean_names = screen['Sample'].str.replace('_screen', '', regex=False)
    fastqscreen = _select_with_sample(screen, screen.columns.str.endswith('percentage'), sample=clean_names)

    # Benchmark (hap.py) precision and recall.
    benchmark = _load_benchmark(args.happy)

    # Output
    fastqc_all.to_csv('fastqc.final.result.txt', sep="\t", index=False)
    fastqscreen.to_csv('fastqscreen.final.result.txt', sep="\t", index=False)
    qualimap_stat.to_csv('qualimap.final.result.txt', sep="\t", index=False)
    benchmark.to_csv('benchmark.final.result.txt', sep="\t", index=False)


if __name__ == "__main__":
    main()







+ 2
- 1
inputs 查看文件

@@ -1,11 +1,12 @@
{
"{{ project_name }}.benchmarking_dir": "oss://pgx-result/renluyao/manuscript_v3.0/reference_dataset_v4.0/",
"{{ project_name }}.benchmarking_dir": "oss://pgx-result/renluyao/manuscript_v3.0/reference_dataset_v202011/",
"{{ project_name }}.SENTIEON_INSTALL_DIR": "/opt/sentieon-genomics",
"{{ project_name }}.fasta": "GRCh38.d1.vd1.fa",
"{{ project_name }}.BENCHMARKdocker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-hap:latest",
"{{ project_name }}.gvcf": {{ gvcf_list.split(";") | tojson }},
"{{ project_name }}.gvcf_idx": {{ gvcf_idx_list.split(";") | tojson }},
"{{ project_name }}.disk_size": "500",
"{{ project_name }}.del_bed": "oss://pgx-result/renluyao/manuscript_v3.0/reference_dataset_v202011/Tier1.DEL",
"{{ project_name }}.project": "{{ project }}",
"{{ project_name }}.SMALLcluster_config": "OnDemand bcs.ps.g.xlarge img-ubuntu-vpc",
"{{ project_name }}.BIGcluster_config": "OnDemand bcs.a2.7xlarge img-ubuntu-vpc",

+ 4
- 4
tasks/benchmark.wdl 查看文件

@@ -34,13 +34,13 @@ task benchmark {
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg index -f vcf ${sample}.rtg.vcf.gz

if [[ ${sample} =~ "LCL5" ]];then
/opt/hap.py/bin/hap.py ${benchmarking_dir}/LCL5.voted.mendelian.vcf.gz ${sample}.rtg.vcf.gz -f ${benchmarking_dir}/LCL5.highconfidence.bed --threads $nt -o ${sample} -r ${ref_dir}/${fasta}
/opt/hap.py/bin/hap.py ${benchmarking_dir}/LCL5.ref.v20201103.vcf.gz ${sample}.rtg.vcf.gz -f ${benchmarking_dir}/Quartet.callable.voted.collapse.bed --threads $nt -o ${sample} -r ${ref_dir}/${fasta}
elif [[ ${sample} =~ "LCL6" ]]; then
/opt/hap.py/bin/hap.py ${benchmarking_dir}/LCL6.voted.mendelian.vcf.gz ${sample}.rtg.vcf.gz -f ${benchmarking_dir}/LCL6.highconfidence.bed --threads $nt -o ${sample} -r ${ref_dir}/${fasta}
/opt/hap.py/bin/hap.py ${benchmarking_dir}/LCL6.ref.v20201103.vcf.gz ${sample}.rtg.vcf.gz -f ${benchmarking_dir}/Quartet.callable.voted.collapse.bed --threads $nt -o ${sample} -r ${ref_dir}/${fasta}
elif [[ ${sample} =~ "LCL7" ]]; then
/opt/hap.py/bin/hap.py ${benchmarking_dir}/LCL7.voted.mendelian.vcf.gz ${sample}.rtg.vcf.gz -f ${benchmarking_dir}/LCL7.highconfidence.bed --threads $nt -o ${sample} -r ${ref_dir}/${fasta}
/opt/hap.py/bin/hap.py ${benchmarking_dir}/LCL7.ref.v20201103.vcf.gz ${sample}.rtg.vcf.gz -f ${benchmarking_dir}/Quartet.callable.voted.collapse.bed --threads $nt -o ${sample} -r ${ref_dir}/${fasta}
elif [[ ${sample} =~ "LCL8" ]]; then
/opt/hap.py/bin/hap.py ${benchmarking_dir}/LCL8.voted.mendelian.vcf.gz ${sample}.rtg.vcf.gz -f ${benchmarking_dir}/LCL8.highconfidence.bed --threads $nt -o ${sample} -r ${ref_dir}/${fasta}
/opt/hap.py/bin/hap.py ${benchmarking_dir}/LCL8.ref.v20201103.vcf.gz ${sample}.rtg.vcf.gz -f ${benchmarking_dir}/Quartet.callable.voted.collapse.bed --threads $nt -o ${sample} -r ${ref_dir}/${fasta}
else
echo "only for quartet samples"
fi

+ 27
- 0
tasks/filtered.wdl 查看文件

@@ -0,0 +1,27 @@
# Task: compress and index a family VCF, run `rtg vcffilter` with a BED
# exclusion file, and emit the filtered result as an uncompressed VCF.
# NOTE(review): judging by the "del_bed" / "noDEL" names this is presumably
# meant to drop variants overlapping known deletion regions -- confirm.
task filtered {
# Input VCF; its file name is expected to end in ".family.vcf".
File raw_vcf
# BED file handed to `rtg vcffilter --exclude-bed`.
File del_bed
# File name of raw_vcf with the ".family.vcf" suffix removed; used as the
# prefix of every intermediate and output file.
String family_name = basename(raw_vcf,".family.vcf")
# Container image to run in; the command uses a hard-coded rtg-tools 3.10.1
# path under /opt/rtg-tools, so the image must provide it.
String docker
# Backend cluster/instance specification, passed through to the runtime block.
String cluster_config
# Data disk size, concatenated into the dataDisk runtime string.
String disk_size

# Steps: bgzip the raw VCF, index it, filter it with --exclude-bed, then
# gunzip the filtered file so that a plain ".noDEL.vcf" is left behind.
command <<<
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg bgzip ${raw_vcf} -c > ${family_name}.rtg.vcf.gz
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg index -f vcf ${family_name}.rtg.vcf.gz

/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg vcffilter -i ${family_name}.rtg.vcf.gz -o ${family_name}.noDEL.vcf.gz --exclude-bed=${del_bed}

gunzip ${family_name}.noDEL.vcf.gz
>>>

# NOTE(review): cluster / systemDisk / dataDisk look like backend-specific
# runtime attributes (not core WDL) -- confirm against the execution backend.
runtime {
docker:docker
cluster:cluster_config
systemDisk:"cloud_ssd 40"
dataDisk:"cloud_ssd " + disk_size + " /cromwell_root/"
}
# The uncompressed filtered VCF produced by the final gunzip step.
output {
File noDEL_vcf="${family_name}.noDEL.vcf"
}
}

+ 1
- 1
tasks/mendelian.wdl 查看文件

@@ -1,7 +1,7 @@
task mendelian {
File family_vcf
File ref_dir
String family_name = basename(family_vcf,".family.vcf")
String family_name = basename(family_vcf,".noDEL.vcf")
String fasta
String docker
String cluster_config

+ 11
- 3
workflow.wdl 查看文件

@@ -1,6 +1,7 @@
import "./tasks/split_gvcf_files.wdl" as split_gvcf_files
import "./tasks/GVCFtyper.wdl" as GVCFtyper
import "./tasks/benchmark.wdl" as benchmark
import "./tasks/filtered.wdl" as filtered
import "./tasks/mendelian.wdl" as mendelian
import "./tasks/merge_mendelian.wdl" as merge_mendelian
import "./tasks/quartet_mendelian.wdl" as quartet_mendelian
@@ -19,11 +20,10 @@ workflow {{ project_name }} {
String SENTIEON_INSTALL_DIR
String SENTIEONdocker


String fasta
File ref_dir

File benchmarking_dir
File del_bed

String project

@@ -96,9 +96,17 @@ workflow {{ project_name }} {

Array[File] family_vcfs = merge_family.family_vcf
scatter (idx in range(length(family_vcfs))) {
call filtered.filtered as filtered {
input:
raw_vcf=family_vcfs[idx],
del_bed=del_bed,
docker=BENCHMARKdocker,
cluster_config=BIGcluster_config,
disk_size=disk_size
}
call mendelian.mendelian as mendelian {
input:
family_vcf=family_vcfs[idx],
family_vcf=filtered.noDEL_vcf,
ref_dir=ref_dir,
fasta=fasta,
docker=MENDELIANdocker,

Loading…
取消
儲存