Browse Source

sister and family info merge

master
LUYAO REN 5 years ago
parent
commit
1cea62fe3c
15 changed files with 382 additions and 415 deletions
  1. +139
    -0
      codescripts/reformVCF.py
  2. +17
    -6
      inputs
  3. +0
    -34
      tasks/ExtractVCFinfo.wdl
  4. +0
    -27
      tasks/KeepVar.wdl
  5. +0
    -69
      tasks/SepSnvIndel.wdl
  6. +0
    -71
      tasks/SepTrueFalse.wdl
  7. +10
    -19
      tasks/VCFrename.wdl
  8. +6
    -16
      tasks/mendelian.wdl
  9. +5
    -13
      tasks/merge.wdl
  10. +0
    -35
      tasks/mergeBed.wdl
  11. +34
    -0
      tasks/mergeSister.wdl
  12. +0
    -61
      tasks/mergeVCF.wdl
  13. +30
    -0
      tasks/reformVCF.wdl
  14. +6
    -17
      tasks/zipIndex.wdl
  15. +135
    -47
      workflow.wdl

+ 139
- 0
codescripts/reformVCF.py View File

@@ -0,0 +1,139 @@
# import modules
import sys, argparse, os
import fileinput
import re

# Command-line interface: the merged family VCF to split, and the family name
# that prefixes every output file and per-sample column name.
parser = argparse.ArgumentParser(description="This script is to split samples in VCF files and rewrite to the right style")

parser.add_argument('-vcf', '--familyVCF', type=str, help='VCF with sister and mendelian infomation', required=True)
parser.add_argument('-name', '--familyName', type=str, help='Family name of the VCF file', required=True)

args = parser.parse_args()

# Rename input:
inputFile = args.familyVCF
family_name = args.familyName

# Output files: one single-sample VCF per quartet member (LCL5-LCL8) plus one
# four-sample family VCF.  The handles are module-level because process()
# writes to them directly.
LCL5_name = family_name + '.LCL5.vcf'
LCL5file = open(LCL5_name, 'w')
LCL6_name = family_name + '.LCL6.vcf'
LCL6file = open(LCL6_name, 'w')
LCL7_name = family_name + '.LCL7.vcf'
LCL7file = open(LCL7_name, 'w')
LCL8_name = family_name + '.LCL8.vcf'
LCL8file = open(LCL8_name, 'w')
family_filename = family_name + '.vcf'
familyfile = open(family_filename, 'w')

# Meta-information header shared by every output VCF.
# The three INFO keys are written by process() as "key=0" / "key=1", so they
# are declared as single Integer values (a Flag must not carry a value), and
# the descriptions follow the values process() emits: equal sister genotypes
# give sister=1, and an input INFO of MD=1 gives trioLCLx=1.
# NOTE(review): the trio wording assumes VBT's MD=1 means "Mendelian
# consistent" - confirm against the VBT documentation.
vcfheader = '''##fileformat=VCFv4.2
##FILTER=<ID=PASS,Description="Voted by at least two replicates, six callers and two sequencing sites">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##INFO=<ID=sister,Number=1,Type=Integer,Description="1 for sister consistent, 0 for sister inconsistent">
##INFO=<ID=trioLCL5,Number=1,Type=Integer,Description="1 for trio consistent, 0 for trio inconsistent">
##INFO=<ID=trioLCL6,Number=1,Type=Integer,Description="1 for trio consistent, 0 for trio inconsistent">
##contig=<ID=chr1,length=248956422>
##contig=<ID=chr2,length=242193529>
##contig=<ID=chr3,length=198295559>
##contig=<ID=chr4,length=190214555>
##contig=<ID=chr5,length=181538259>
##contig=<ID=chr6,length=170805979>
##contig=<ID=chr7,length=159345973>
##contig=<ID=chr8,length=145138636>
##contig=<ID=chr9,length=138394717>
##contig=<ID=chr10,length=133797422>
##contig=<ID=chr11,length=135086622>
##contig=<ID=chr12,length=133275309>
##contig=<ID=chr13,length=114364328>
##contig=<ID=chr14,length=107043718>
##contig=<ID=chr15,length=101991189>
##contig=<ID=chr16,length=90338345>
##contig=<ID=chr17,length=83257441>
##contig=<ID=chr18,length=80373285>
##contig=<ID=chr19,length=58617616>
##contig=<ID=chr20,length=64444167>
##contig=<ID=chr21,length=46709983>
##contig=<ID=chr22,length=50818468>
##contig=<ID=chrX,length=156040895>
'''

# The nine fixed VCF columns that precede the sample columns (with trailing tab).
_FIXED_COLUMNS = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t'

# write VCF headers: meta lines followed by the column-name line
LCL5colname = _FIXED_COLUMNS + family_name + '_LCL5' + '\n'
LCL5file.write(vcfheader)
LCL5file.write(LCL5colname)

LCL6colname = _FIXED_COLUMNS + family_name + '_LCL6' + '\n'
LCL6file.write(vcfheader)
LCL6file.write(LCL6colname)

LCL7colname = _FIXED_COLUMNS + family_name + '_LCL7' + '\n'
LCL7file.write(vcfheader)
LCL7file.write(LCL7colname)

LCL8colname = _FIXED_COLUMNS + family_name + '_LCL8' + '\n'
LCL8file.write(vcfheader)
LCL8file.write(LCL8colname)

# Family VCF: sample columns in the order process() writes them
# (LCL5, LCL6, LCL7, LCL8).  The last name previously had a stray leading
# underscore ("_LCL8").
familycolname = _FIXED_COLUMNS + 'LCL5\t' + 'LCL6\t' + 'LCL7\t' + 'LCL8' + '\n'
familyfile.write(vcfheader)
familyfile.write(familycolname)

# reform VCF
def process(oneLine):
    """Reformat one merged family record and write it to every output VCF.

    The record is split on tabs and must have at least 16 fields, used as
    follows (0-based):

    * 0-6   fixed VCF columns, copied through unchanged
    * 7     INFO whose ``MD=1`` entry drives the ``trioLCL6`` flag
    * 8     FORMAT, copied through unchanged
    * 9/10  genotypes written as LCL8 / LCL7
    * 11    genotype written as LCL6
    * 12/13 second copy of the two parental genotypes (presumably from the
            other trio - confirm against the merge step that builds the input)
    * 14    genotype written as LCL5
    * 15    INFO whose ``MD=1`` entry drives the ``trioLCL5`` flag

    One line is written to each of the module-level handles ``LCL5file``,
    ``LCL6file``, ``LCL7file``, ``LCL8file`` and ``familyfile``.

    Raises:
        ValueError: if the record has fewer than 16 tab-separated fields.
    """
    strings = oneLine.strip().split('\t')
    if len(strings) < 16:
        raise ValueError(
            'expected at least 16 tab-separated fields, got %d: %r'
            % (len(strings), oneLine)
        )

    # A sister missing from the record ('.') is treated as homozygous
    # reference, and the parental genotypes are taken from the other copy.
    # LCL5 uniq (LCL6 column is missing)
    if strings[11] == '.':
        strings[11] = '0/0'
        strings[9] = strings[12]
        strings[10] = strings[13]
    # LCL6 uniq (LCL5 column is missing)
    if strings[14] == '.':
        strings[14] = '0/0'
        strings[12] = strings[9]
        strings[13] = strings[10]

    # INFO may hold several ';'-separated entries, so look for MD=1 among
    # them instead of requiring the whole field to equal 'MD=1'.
    lcl5_md1 = 'MD=1' in strings[15].split(';')
    lcl6_md1 = 'MD=1' in strings[7].split(';')

    info = ';'.join([
        'sister=1' if strings[11] == strings[14] else 'sister=0',
        'trioLCL5=1' if lcl5_md1 else 'trioLCL5=0',
        'trioLCL6=1' if lcl6_md1 else 'trioLCL6=0',
    ])

    # CHROM..FILTER, the rebuilt INFO, then FORMAT: shared by every output line.
    prefix = strings[:7] + [info, strings[8]]

    def render(*genotypes):
        # One VCF line: shared prefix followed by the given sample columns.
        return '\t'.join(prefix + list(genotypes)) + '\n'

    # output LCL5
    LCL5file.write(render(strings[14]))
    # output LCL6
    LCL6file.write(render(strings[11]))
    # output LCL7
    LCL7file.write(render(strings[10]))
    # output LCL8
    LCL8file.write(render(strings[9]))
    # output family (LCL5, LCL6, LCL7, LCL8)
    familyfile.write(render(strings[14], strings[11], strings[10], strings[9]))


# Stream the input VCF and reformat every data record.  The output handles
# are closed in all cases so buffered lines are flushed to disk even if a
# record fails to parse.
try:
    for line in fileinput.input(inputFile):
        # '#' lines are VCF meta/header lines (the outputs get their own
        # header); blank lines carry no record.
        if line.startswith('#') or not line.strip():
            continue
        process(line)
finally:
    for handle in (LCL5file, LCL6file, LCL7file, LCL8file, familyfile):
        handle.close()


+ 17
- 6
inputs View File

@@ -1,17 +1,28 @@
{
"{{ project_name }}.LCL7merge.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
"{{ project_name }}.fasta": "GRCh38.d1.vd1.fa",
"{{ project_name }}.VCFrename.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
"{{ project_name }}.sister.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/vbt:v1.1",
"{{ project_name }}.mendelian.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/vbt:v1.1",
"{{ project_name }}.LCL6familyzipIndex.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
"{{ project_name }}.LCL5VCFrename.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
"{{ project_name }}.LCL6mendelian.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/vbt:v1.1",
"{{ project_name }}.mergeSister.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
"{{ project_name }}.LCL5mendelian.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/vbt:v1.1",
"{{ project_name }}.disk_size": "150",
"{{ project_name }}.merge.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
"{{ project_name }}.zipIndex.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
"{{ project_name }}.inputSamplesFile": "{{ inputSamplesFile }}",
"{{ project_name }}.LCL6merge.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
"{{ project_name }}.LCL6variantsNorm.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/bcftools:v1.9",
"{{ project_name }}.mapper_caller": "{{ mapper_caller }}",
"{{ project_name }}.LCL6zipIndex.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
"{{ project_name }}.LCL7familyzipIndex.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
"{{ project_name }}.LCL5familyzipIndex.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
"{{ project_name }}.LCL6VCFrename.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
"{{ project_name }}.LCL5merge.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
"{{ project_name }}.reformVCF.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/high_confidence_call:v1.1",
"{{ project_name }}.LCL5zipIndex.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
"{{ project_name }}.cluster_config": "OnDemand bcs.a2.xlarge img-ubuntu-vpc",
"{{ project_name }}.LCL8familyzipIndex.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
"{{ project_name }}.LCL7variantsNorm.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/bcftools:v1.9",
"{{ project_name }}.LCL8merge.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
"{{ project_name }}.LCL5variantsNorm.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/bcftools:v1.9",
"{{ project_name }}.LCL8variantsNorm.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/bcftools:v1.9",
"{{ project_name }}.ref_dir": "oss://chinese-quartet/quartet-storage-data/reference_data/"
}


+ 0
- 34
tasks/ExtractVCFinfo.wdl View File

@@ -1,34 +0,0 @@
# Task ExtractVCFinfo: runs /opt/extract_vcf_information.py on four VCFs
# (SNV and INDEL, each in a train and a test flavour) and emits one text
# file per input.
task ExtractVCFinfo {
File snv_train
File snv_test
File indel_train
File indel_test
# Output prefixes: each input's file name with a trailing ".vcf" removed.
String snv_train_sampleName = basename(snv_train,".vcf")
String snv_test_sampleName = basename(snv_test,".vcf")
String indel_train_sampleName = basename(indel_train,".vcf")
String indel_test_sampleName = basename(indel_test,".vcf")
String docker
String cluster_config
String disk_size
command <<<
python /opt/extract_vcf_information.py -i ${snv_train} -o ${snv_train_sampleName}.txt
python /opt/extract_vcf_information.py -i ${snv_test} -o ${snv_test_sampleName}.txt
python /opt/extract_vcf_information.py -i ${indel_train} -o ${indel_train_sampleName}.txt
python /opt/extract_vcf_information.py -i ${indel_test} -o ${indel_test_sampleName}.txt
>>>

# Execution settings; disk_size is spliced into the dataDisk string, which
# names /cromwell_root/ as its mount point.
runtime {
docker:docker
cluster: cluster_config
systemDisk: "cloud_ssd 40"
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
}

# Despite the *_vcf names, these outputs are the .txt files written by the
# command above.
output {
File snv_train_vcf = "${snv_train_sampleName}.txt"
File snv_test_vcf = "${snv_test_sampleName}.txt"
File indel_train_vcf = "${indel_train_sampleName}.txt"
File indel_test_vcf = "${indel_test_sampleName}.txt"
}
}

+ 0
- 27
tasks/KeepVar.wdl View File

@@ -1,27 +0,0 @@
# Task KeepVar: runs /opt/select_small_variants_supported_by_all_callsets.py
# once on the violation merged VCF and once on the consistent merged VCF.
task KeepVar {
File violation_merged_vcf
File consistent_merged_vcf
String docker
String cluster_config
String disk_size
# The -o argument is an output prefix; the output section below expects the
# script to create "<prefix>.vcf" and "<prefix>_outlier.vcf" for each run
# (presumably the script's naming convention - confirm in the script).
command <<<
python /opt/select_small_variants_supported_by_all_callsets.py -i ${violation_merged_vcf} -o violation.all.selected

python /opt/select_small_variants_supported_by_all_callsets.py -i ${consistent_merged_vcf} -o consistent.all.selected
>>>

# Execution settings; disk_size is spliced into the dataDisk string.
runtime {
docker:docker
cluster: cluster_config
systemDisk: "cloud_ssd 40"
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
}
output {
File violation_keeped_vcf = "violation.all.selected.vcf"
File violation_outlier_vcf = "violation.all.selected_outlier.vcf"
File consistent_keeped_vcf = "consistent.all.selected.vcf"
File consistent_outlier_vcf = "consistent.all.selected_outlier.vcf"
}
}


+ 0
- 69
tasks/SepSnvIndel.wdl View File

@@ -1,69 +0,0 @@
# Task SepSnvIndel: splits a normalised VCF into SNV and INDEL subsets, each
# further divided into "train" and "test" parts relative to keeped_vcf.
task SepSnvIndel {
File vcf
# Output prefix: the input file name with a trailing ".normed.vcf" removed.
String sampleName = basename(vcf,".normed.vcf")
File keeped_vcf
String docker
String cluster_config
String disk_size
# Command outline:
#  1. Split ${vcf} into lines containing '#' (header), records whose 5th
#     column (ALT) has no comma, and records whose ALT has a comma; the
#     latter are written to <sampleName>.MNP.vcf.
#  2. bgzip and index the comma-free VCF and a compressed copy of keeped_vcf.
#  3. Run rtg vcffilter six times: SNPs only / non-SNPs only, each with
#     --include-vcf (train), with --exclude-vcf (test), and with neither.
#  4. Decompress the four train/test subsets to plain .vcf files.
command <<<

cat ${vcf} | grep '#' > header
cat ${vcf} | sed '/^#/d' | awk '$5!~/,/' > removed.body
cat ${vcf} | sed '/^#/d' | awk '$5~/,/' > MNP.body
cat header removed.body > ${sampleName}.MNPremoved.vcf
cat header MNP.body > ${sampleName}.MNP.vcf

rtg bgzip ${sampleName}.MNPremoved.vcf
rtg index -f vcf ${sampleName}.MNPremoved.vcf.gz


rtg bgzip ${keeped_vcf} -c > all.selected.vcf.gz
rtg index -f vcf all.selected.vcf.gz

rtg vcffilter -i ${sampleName}.MNPremoved.vcf.gz -o ${sampleName}.normed.snv.train.vcf.gz --include-vcf=all.selected.vcf.gz --snps-only

rtg vcffilter -i ${sampleName}.MNPremoved.vcf.gz -o ${sampleName}.normed.snv.test.vcf.gz --exclude-vcf=all.selected.vcf.gz --snps-only

rtg vcffilter -i ${sampleName}.MNPremoved.vcf.gz -o ${sampleName}.normed.indel.train.vcf.gz --include-vcf=all.selected.vcf.gz --non-snps-only

rtg vcffilter -i ${sampleName}.MNPremoved.vcf.gz -o ${sampleName}.normed.indel.test.vcf.gz --exclude-vcf=all.selected.vcf.gz --non-snps-only

rtg vcffilter -i ${sampleName}.MNPremoved.vcf.gz -o ${sampleName}.normed.snv.vcf.gz --snps-only

rtg vcffilter -i ${sampleName}.MNPremoved.vcf.gz -o ${sampleName}.normed.indel.vcf.gz --non-snps-only

gzip -d ${sampleName}.normed.snv.train.vcf.gz -c > ${sampleName}.normed.snv.train.vcf
gzip -d ${sampleName}.normed.snv.test.vcf.gz -c > ${sampleName}.normed.snv.test.vcf
gzip -d ${sampleName}.normed.indel.train.vcf.gz -c > ${sampleName}.normed.indel.train.vcf
gzip -d ${sampleName}.normed.indel.test.vcf.gz -c > ${sampleName}.normed.indel.test.vcf
>>>

# Execution settings; disk_size is spliced into the dataDisk string.
runtime {
docker:docker
cluster: cluster_config
systemDisk: "cloud_ssd 40"
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
}

# NOTE(review): no explicit "rtg index" is run on the vcffilter outputs, so
# the .tbi files below are presumably written by rtg vcffilter itself -
# confirm against the RTG Tools documentation.
output {
File MNP="${sampleName}.MNP.vcf"
File snv_gz = "${sampleName}.normed.snv.vcf.gz"
File snv_idx = "${sampleName}.normed.snv.vcf.gz.tbi"
File indel_gz = "${sampleName}.normed.indel.vcf.gz"
File indel_idx = "${sampleName}.normed.indel.vcf.gz.tbi"
File snv_train = "${sampleName}.normed.snv.train.vcf"
File snv_test = "${sampleName}.normed.snv.test.vcf"
File indel_train = "${sampleName}.normed.indel.train.vcf"
File indel_test = "${sampleName}.normed.indel.test.vcf"
File snv_train_gz = "${sampleName}.normed.snv.train.vcf.gz"
File snv_test_gz = "${sampleName}.normed.snv.test.vcf.gz"
File indel_train_gz = "${sampleName}.normed.indel.train.vcf.gz"
File indel_test_gz = "${sampleName}.normed.indel.test.vcf.gz"
File snv_train_idx = "${sampleName}.normed.snv.train.vcf.gz.tbi"
File snv_test_idx = "${sampleName}.normed.snv.test.vcf.gz.tbi"
File indel_train_idx = "${sampleName}.normed.indel.train.vcf.gz.tbi"
File indel_test_idx = "${sampleName}.normed.indel.test.vcf.gz.tbi"
}
}

+ 0
- 71
tasks/SepTrueFalse.wdl View File

@@ -1,71 +0,0 @@
# Task SepTrueFalse: uses rtg vcffilter with BED regions to cut SNV and INDEL
# VCFs into "true", "false", "remain" and "padding" subsets.
task SepTrueFalse {
File snv_true_bed
File snv_false_bed
File indel_true_bed
File indel_false_bed
File snv_padding
File indel_padding

# The *_idx inputs are never referenced in the command; presumably they are
# declared so each index is localized next to its VCF - confirm.
File snv_gz
File indel_gz
File snv_idx
File indel_idx
File snv_test_gz
File indel_test_gz
File snv_test_idx
File indel_test_idx

# Output prefix: the snv_gz file name with ".normed.snv.vcf.gz" removed.
String sampleName = basename(snv_gz,".normed.snv.vcf.gz")
String docker
String cluster_config
String disk_size

# For each variant type the command produces:
#   true    = test VCF filtered with --include-bed on the "true" BED
#   false   = test VCF filtered with --include-bed on the "false" BED
#   remain  = full VCF filtered with --exclude-bed on the "false" BED
#   padding = full VCF filtered with --include-bed on the padding BED
command <<<

rtg vcffilter -i ${snv_test_gz} -o ${sampleName}.true.snv.vcf.gz --include-bed=${snv_true_bed}

rtg vcffilter -i ${snv_test_gz} -o ${sampleName}.false.snv.vcf.gz --include-bed=${snv_false_bed}

rtg vcffilter -i ${snv_gz} -o ${sampleName}.remain.snv.vcf.gz --exclude-bed=${snv_false_bed}

rtg vcffilter -i ${snv_gz} -o ${sampleName}.padding.snv.vcf.gz --include-bed=${snv_padding}

rtg vcffilter -i ${indel_test_gz} -o ${sampleName}.true.indel.vcf.gz --include-bed=${indel_true_bed}

rtg vcffilter -i ${indel_test_gz} -o ${sampleName}.false.indel.vcf.gz --include-bed=${indel_false_bed}

rtg vcffilter -i ${indel_gz} -o ${sampleName}.remain.indel.vcf.gz --exclude-bed=${indel_false_bed}

rtg vcffilter -i ${indel_gz} -o ${sampleName}.padding.indel.vcf.gz --include-bed=${indel_padding}

>>>

# Execution settings; disk_size is spliced into the dataDisk string.
runtime {
docker:docker
cluster: cluster_config
systemDisk: "cloud_ssd 40"
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
}

# Each filtered VCF is paired with a .tbi index; no explicit index command is
# run above, so the indexes are presumably written by rtg vcffilter - confirm.
output {
File snv_true_vcf = "${sampleName}.true.snv.vcf.gz"
File snv_true_vcf_index = "${sampleName}.true.snv.vcf.gz.tbi"
File snv_false_vcf = "${sampleName}.false.snv.vcf.gz"
File snv_false_vcf_index = "${sampleName}.false.snv.vcf.gz.tbi"
File snv_remain_vcf = "${sampleName}.remain.snv.vcf.gz"
File snv_remain_vcf_index = "${sampleName}.remain.snv.vcf.gz.tbi"
File snv_padding_vcf = "${sampleName}.padding.snv.vcf.gz"
File snv_padding_vcf_index = "${sampleName}.padding.snv.vcf.gz.tbi"

File indel_true_vcf = "${sampleName}.true.indel.vcf.gz"
File indel_true_vcf_index = "${sampleName}.true.indel.vcf.gz.tbi"
File indel_false_vcf = "${sampleName}.false.indel.vcf.gz"
File indel_false_vcf_index = "${sampleName}.false.indel.vcf.gz.tbi"
File indel_remain_vcf = "${sampleName}.remain.indel.vcf.gz"
File indel_remain_vcf_index = "${sampleName}.remain.indel.vcf.gz.tbi"
File indel_padding_vcf = "${sampleName}.padding.indel.vcf.gz"
File indel_padding_vcf_index = "${sampleName}.padding.indel.vcf.gz.tbi"
}
}


+ 10
- 19
tasks/VCFrename.wdl View File

@@ -1,26 +1,21 @@
task VCFrename {
File mother_vcf_gz
File father_vcf_gz
File twins_vcf_gz
File mother_vcf_idx
File father_vcf_idx
File twins_vcf_idx
File trio_vcf_gz
File trio_vcf_idx
String mother_name
String father_name
String child_name
String family_name
String child
String docker
String cluster_config
String disk_size
command <<<
echo "MOTHER ${mother_name}" > mother_rename.txt
rtg vcfannotate -i ${mother_vcf_gz} -o ${mother_name}.rename.vcf.gz --relabel=mother_rename.txt
echo "MOTHER ${mother_name}.${child}
FATHER ${father_name}.${child}
CHILD ${child_name}" > rename.txt

echo "FATHER ${father_name}" > father_rename.txt
rtg vcfannotate -i ${father_vcf_gz} -o ${father_name}.rename.vcf.gz --relabel=father_rename.txt

echo "CHILD ${family_name}" > child_rename.txt
rtg vcfannotate -i ${twins_vcf_gz} -o ${family_name}.twins.rename.vcf.gz --relabel=child_rename.txt
rtg vcfannotate -i ${trio_vcf_gz} -o ${family_name}.${child}.rename.vcf.gz --relabel=rename.txt
>>>

runtime {
@@ -30,11 +25,7 @@ task VCFrename {
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
}
output {
File rename_mother_vcf_gz = "${mother_name}.rename.vcf.gz"
File rename_father_vcf_gz = "${father_name}.rename.vcf.gz"
File rename_twins_vcf_gz = "${family_name}.twins.rename.vcf.gz"
File rename_mother_vcf_idx = "${mother_name}.rename.vcf.gz.tbi"
File rename_father_vcf_idx = "${father_name}.rename.vcf.gz.tbi"
File rename_twins_vcf_idx = "${family_name}.twins.rename.vcf.gz.tbi"
File rename_trio_vcf_gz = "${family_name}.${child}.rename.vcf.gz"
File rename_trio_vcf_idx = "${family_name}.${child}.rename.vcf.gz.tbi"
}
}

+ 6
- 16
tasks/mendelian.wdl View File

@@ -1,10 +1,10 @@
task mendelian {
File sister_vcf
File child_vcf
File LCL7_vcf
File LCL8_vcf
String LCL7_name
String LCL8_name
String family_name
String child_name
File ref_dir
String fasta
String docker
@@ -13,20 +13,12 @@ task mendelian {
command <<<
export LD_LIBRARY_PATH=/opt/htslib-1.9
nt=$(nproc)
mkdir VBT

/opt/VBT-TrioAnalysis/vbt mendelian -ref ${ref_dir}/${fasta} -mother ${LCL8_vcf} -father ${LCL7_vcf} -child ${sister_vcf} -outDir VBT -out-prefix ${family_name} --output-violation-regions

cat VBT/${family_name}_trio.vcf | grep '#' | cut -f1-9,10 > mother_header
cat VBT/${family_name}_trio.vcf | grep -v '#' | cut -f1-9,10 | grep 'MD=1' | grep -v '0/0' | cat mother_header - > ${LCL8_name}.sister.mendelian.gt.vcf

cat VBT/${family_name}_trio.vcf | grep '#' | cut -f1-9,11 > father_header
cat VBT/${family_name}_trio.vcf | grep -v '#' | cut -f1-9,11 | grep 'MD=1' | grep -v '0/0' | cat father_header - > ${LCL7_name}.sister.mendelian.gt.vcf

cat VBT/${family_name}_trio.vcf | grep '#' | cut -f1-9,12 > twin_header
cat VBT/${family_name}_trio.vcf | grep -v '#' | cut -f1-9,12 | grep 'MD=1' | grep -v '0/0' | cat twin_header - > ${family_name}.twins.sister.mendelian.gt.vcf
/opt/VBT-TrioAnalysis/vbt mendelian -ref ${ref_dir}/${fasta} -mother ${LCL8_vcf} -father ${LCL7_vcf} -child ${child_vcf} -outDir VBT -out-prefix ${child_name}.family --output-violation-regions -thread-count $nt

cat VBT/${child_name}.family_trio.vcf > ${child_name}.family.vcf
>>>

runtime {
@@ -37,8 +29,6 @@ task mendelian {
}
output {
Array[File] vbt_mendelian = glob("VBT/*")
File mother_vcf = "${LCL8_name}.sister.mendelian.gt.vcf"
File father_vcf = "${LCL7_name}.sister.mendelian.gt.vcf"
File twins_vcf = "${family_name}.twins.sister.mendelian.gt.vcf"
File trio_vcf = "${child_name}.family.vcf"
}
}

+ 5
- 13
tasks/merge.wdl View File

@@ -1,21 +1,15 @@
task merge {
Array[File] rename_mother_vcf_gz
Array[File] rename_mother_vcf_idx
Array[File] rename_father_vcf_gz
Array[File] rename_father_vcf_idx
Array[File] rename_twins_vcf_gz
Array[File] rename_twins_vcf_idx
String mapper_caller
Array[File] family_vcf_gz
Array[File] family_vcf_idx
String sample
String docker
String cluster_config
String disk_size
command <<<
rtg vcfmerge --force-merge-all --no-gzip -o LCL8.${mapper_caller}.sister.consistent.merged.vcf ${sep=" " rename_mother_vcf_gz}

rtg vcfmerge --force-merge-all --no-gzip -o LCL7.${mapper_caller}.sister.consistent.merged.vcf ${sep=" " rename_father_vcf_gz}
rtg vcfmerge --force-merge-all --no-gzip -o ${sample}.merged.vcf ${sep=" " family_vcf_gz}

rtg vcfmerge --force-merge-all --no-gzip -o Twins.${mapper_caller}.sister.consistent.merged.vcf ${sep=" " rename_twins_vcf_gz}
>>>

runtime {
@@ -25,8 +19,6 @@ task merge {
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
}
output {
File mother_merged_vcf = "LCL8.${mapper_caller}.sister.consistent.merged.vcf"
File father_merged_vcf = "LCL7.${mapper_caller}.sister.consistent.merged.vcf"
File twins_merged_vcf = "Twins.${mapper_caller}.sister.consistent.merged.vcf"
File merged_vcf = "${sample}.merged.vcf"
}
}

+ 0
- 35
tasks/mergeBed.wdl View File

@@ -1,35 +0,0 @@
# Task mergeBed: runs "bedtools multiinter" three times to combine the
# per-sample SNV and INDEL BED files into one true, one false and one
# padding BED.
task mergeBed {
Array[File] snv_true_bed
Array[File] snv_false_bed
Array[File] indel_true_bed
Array[File] indel_false_bed
Array[File] indel_padding
Array[File] snv_padding
String docker
String cluster_config
String disk_size
# Each call passes the SNV list followed by the INDEL list, both
# space-separated, as the -i file list.
command <<<

/opt/ccdg/bedtools-2.27.1/bin/bedtools multiinter -i ${sep=" " snv_true_bed} ${sep=" " indel_true_bed} > merged.true.bed

/opt/ccdg/bedtools-2.27.1/bin/bedtools multiinter -i ${sep=" " snv_false_bed} ${sep=" " indel_false_bed} > merged.false.bed

/opt/ccdg/bedtools-2.27.1/bin/bedtools multiinter -i ${sep=" " snv_padding} ${sep=" " indel_padding} > merged.padding.bed

>>>

# Execution settings; disk_size is spliced into the dataDisk string.
runtime {
docker:docker
cluster: cluster_config
systemDisk: "cloud_ssd 40"
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
}

output {
File true_bed = "merged.true.bed"
File false_bed = "merged.false.bed"
File padding = "merged.padding.bed"
}
}


+ 34
- 0
tasks/mergeSister.wdl View File

@@ -0,0 +1,34 @@
# Task mergeSister: merges the renamed LCL5 and LCL6 trio VCFs into a single
# table that keeps the INFO column of both merge orders.
#
# The two trio VCFs are merged twice with rtg vcfmerge, once in each order.
# The "##" meta lines come from the LCL5-first merge.  The body is the
# LCL6-first merge (including its "#CHROM" column line) with the INFO column
# (field 8) of the LCL5-first merge pasted on as an extra last column.
# NOTE(review): pasting line by line relies on both merges emitting the same
# records in the same order - confirm for rtg vcfmerge.
task mergeSister {
  File LCL5_trio_vcf_gz
  File LCL5_trio_vcf_idx
  File LCL6_trio_vcf_gz
  File LCL6_trio_vcf_idx
  String family_name
  String docker
  String cluster_config
  String disk_size

  command <<<
    rtg vcfmerge -o LCL5.LCL6.merged.vcf.gz ${LCL5_trio_vcf_gz} ${LCL6_trio_vcf_gz}

    rtg vcfmerge -o LCL6.LCL5.merged.vcf.gz ${LCL6_trio_vcf_gz} ${LCL5_trio_vcf_gz}

    zcat LCL5.LCL6.merged.vcf.gz | grep '^##' > header
    zcat LCL5.LCL6.merged.vcf.gz | grep -v '^##' | cut -f8 > LCL5.mendelian
    zcat LCL6.LCL5.merged.vcf.gz | grep -v '^##' | paste - LCL5.mendelian > body

    cat header body > ${family_name}.trio.info.vcf
  >>>

  # Execution settings; disk_size is spliced into the dataDisk string.
  runtime {
    docker:docker
    cluster: cluster_config
    systemDisk: "cloud_ssd 40"
    dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
  }

  output {
    File family_mendelian_info = "${family_name}.trio.info.vcf"
  }

}

+ 0
- 61
tasks/mergeVCF.wdl View File

@@ -1,61 +0,0 @@
# Task mergeVCF: runs "rtg vcfmerge --force-merge-all" once per category
# (true / false / remain / padding, for SNV and for INDEL), merging all the
# per-sample VCFs of that category into one file.
task mergeVCF {
# The *_index arrays are never referenced in the command; presumably they
# are declared so each index is localized next to its VCF - confirm.
Array[File] snv_true_vcf
Array[File] snv_true_vcf_index
Array[File] snv_false_vcf
Array[File] snv_false_vcf_index
Array[File] snv_remain_vcf
Array[File] snv_remain_vcf_index
Array[File] snv_padding_vcf
Array[File] snv_padding_vcf_index

Array[File] indel_true_vcf
Array[File] indel_true_vcf_index
Array[File] indel_false_vcf
Array[File] indel_false_vcf_index
Array[File] indel_remain_vcf
Array[File] indel_remain_vcf_index
Array[File] indel_padding_vcf
Array[File] indel_padding_vcf_index

# Prefix of every merged output file.
String quartet_sample
String docker
String cluster_config
String disk_size
# NOTE(review): every call passes --no-gzip while naming its output
# "*.vcf.gz"; check whether the files are really uncompressed despite the
# .gz suffix, and whether rtg keeps the requested file name in that case.
command <<<
rtg vcfmerge --force-merge-all --no-gzip -o ${quartet_sample}.snv.true.vcf.gz ${sep=" " snv_true_vcf}

rtg vcfmerge --force-merge-all --no-gzip -o ${quartet_sample}.snv.false.vcf.gz ${sep=" " snv_false_vcf}

rtg vcfmerge --force-merge-all --no-gzip -o ${quartet_sample}.snv.remain.vcf.gz ${sep=" " snv_remain_vcf}

rtg vcfmerge --force-merge-all --no-gzip -o ${quartet_sample}.snv.padding.vcf.gz ${sep=" " snv_padding_vcf}

rtg vcfmerge --force-merge-all --no-gzip -o ${quartet_sample}.indel.true.vcf.gz ${sep=" " indel_true_vcf}

rtg vcfmerge --force-merge-all --no-gzip -o ${quartet_sample}.indel.false.vcf.gz ${sep=" " indel_false_vcf}

rtg vcfmerge --force-merge-all --no-gzip -o ${quartet_sample}.indel.remain.vcf.gz ${sep=" " indel_remain_vcf}

rtg vcfmerge --force-merge-all --no-gzip -o ${quartet_sample}.indel.padding.vcf.gz ${sep=" " indel_padding_vcf}
>>>

# Execution settings; disk_size is spliced into the dataDisk string.
runtime {
docker:docker
cluster: cluster_config
systemDisk: "cloud_ssd 40"
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
}

output {
File merged_snv_true = "${quartet_sample}.snv.true.vcf.gz"
File merged_snv_false = "${quartet_sample}.snv.false.vcf.gz"
File merged_snv_remain = "${quartet_sample}.snv.remain.vcf.gz"
File merged_snv_padding = "${quartet_sample}.snv.padding.vcf.gz"
File merged_indel_true = "${quartet_sample}.indel.true.vcf.gz"
File merged_indel_false = "${quartet_sample}.indel.false.vcf.gz"
File merged_indel_remain = "${quartet_sample}.indel.remain.vcf.gz"
File merged_indel_padding = "${quartet_sample}.indel.padding.vcf.gz"
}

}

+ 30
- 0
tasks/reformVCF.wdl View File

@@ -0,0 +1,30 @@
# Task reformVCF: runs /opt/reformVCF.py on the merged family VCF.  The
# output section expects the script to write one VCF per quartet member
# (LCL5-LCL8) and one family VCF, all prefixed with family_name.
task reformVCF {
  File family_mendelian_info
  # A plain String, as in the other tasks: the value is only used as text for
  # the -name argument and the output file names.  Declared as File it would
  # be localized as a path, and that path would leak into both.
  String family_name
  String docker
  String cluster_config
  String disk_size

  command <<<

    python /opt/reformVCF.py -vcf ${family_mendelian_info} -name ${family_name}

  >>>

  # Execution settings; disk_size is spliced into the dataDisk string.
  runtime {
    docker:docker
    cluster: cluster_config
    systemDisk: "cloud_ssd 40"
    dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
  }

  # File names must match what reformVCF.py writes for the given -name.
  output {
    File LCL5_family_info = "${family_name}.LCL5.vcf"
    File LCL6_family_info = "${family_name}.LCL6.vcf"
    File LCL7_family_info = "${family_name}.LCL7.vcf"
    File LCL8_family_info = "${family_name}.LCL8.vcf"
    File family_info = "${family_name}.vcf"
  }
}


+ 6
- 17
tasks/zipIndex.wdl View File

@@ -1,21 +1,14 @@
task zipIndex {
File mother_vcf
File father_vcf
File twins_vcf
File vcf
String sample
String family_name
String docker
String cluster_config
String disk_size
command <<<
rtg bgzip ${mother_vcf} -c > ${family_name}.LCL8.vcf.gz
rtg index -f vcf ${family_name}.LCL8.vcf.gz

rtg bgzip ${father_vcf} -c > ${family_name}.LCL7.vcf.gz
rtg index -f vcf ${family_name}.LCL7.vcf.gz

rtg bgzip ${twins_vcf} -c > ${family_name}.twins.vcf.gz
rtg index -f vcf ${family_name}.twins.vcf.gz
rtg bgzip ${vcf} -c > ${family_name}.${sample}.vcf.gz
rtg index -f vcf ${family_name}.${sample}.vcf.gz

>>>

@@ -26,11 +19,7 @@ task zipIndex {
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
}
output {
File mother_vcf_gz = "${family_name}.LCL8.vcf.gz"
File father_vcf_gz = "${family_name}.LCL7.vcf.gz"
File twins_vcf_gz = "${family_name}.twins.vcf.gz"
File mother_vcf_idx = "${family_name}.LCL8.vcf.gz.tbi"
File father_vcf_idx = "${family_name}.LCL7.vcf.gz.tbi"
File twins_vcf_idx = "${family_name}.twins.vcf.gz.tbi"
File vcf_gz = "${family_name}.${sample}.vcf.gz"
File vcf_idx = "${family_name}.${sample}.vcf.gz.tbi"
}
}

+ 135
- 47
workflow.wdl View File

@@ -1,10 +1,11 @@
import "./tasks/variantsNorm.wdl" as variantsNorm
import "./tasks/sister.wdl" as sister
import "./tasks/mendelian.wdl" as mendelian
import "./tasks/zipIndex.wdl" as zipIndex
import "./tasks/VCFrename.wdl" as VCFrename
import "./tasks/mergeSister.wdl" as mergeSister
import "./tasks/reformVCF.wdl" as reformVCF
import "./tasks/merge.wdl" as merge
workflow {{ project_name }} {
File inputSamplesFile
Array[Array[File]] inputSamples = read_tsv(inputSamplesFile)
@@ -12,102 +13,189 @@ workflow {{ project_name }} {
String fasta
String cluster_config
String disk_size
String mapper_caller

scatter (sample in inputSamples){
scatter (quartet in inputSamples){
call variantsNorm.variantsNorm as LCL5variantsNorm{
input:
vcf=sample[0],
vcf=quartet[0],
ref_dir=ref_dir,
fasta=fasta,
sampleName=sample[4],
sampleName=quartet[4],
cluster_config=cluster_config,
disk_size=disk_size
}
call variantsNorm.variantsNorm as LCL6variantsNorm{
input:
vcf=sample[1],
vcf=quartet[1],
ref_dir=ref_dir,
fasta=fasta,
sampleName=sample[5],
sampleName=quartet[5],
cluster_config=cluster_config,
disk_size=disk_size
}
call variantsNorm.variantsNorm as LCL7variantsNorm{
input:
vcf=sample[2],
vcf=quartet[2],
ref_dir=ref_dir,
fasta=fasta,
sampleName=sample[6],
sampleName=quartet[6],
cluster_config=cluster_config,
disk_size=disk_size
}
call variantsNorm.variantsNorm as LCL8variantsNorm{
input:
vcf=sample[3],
vcf=quartet[3],
ref_dir=ref_dir,
fasta=fasta,
sampleName=sample[7],
sampleName=quartet[7],
cluster_config=cluster_config,
disk_size=disk_size
}
call sister.sister as sister{
call mendelian.mendelian as LCL5mendelian {
input:
LCL5_vcf=LCL5variantsNorm.normed_vcf,
LCL6_vcf=LCL6variantsNorm.normed_vcf,
child_vcf=LCL5variantsNorm.normed_vcf,
LCL7_vcf=LCL7variantsNorm.normed_vcf,
LCL8_vcf=LCL8variantsNorm.normed_vcf,
LCL7_name=quartet[6],
LCL8_name=quartet[7],
child_name=quartet[4],
ref_dir=ref_dir,
fasta=fasta,
family_name=sample[8],
cluster_config=cluster_config,
disk_size=disk_size
}
call mendelian.mendelian as mendelian {
input:
sister_vcf=sister.sister_consistent_vcf,
call mendelian.mendelian as LCL6mendelian {
input:
child_vcf=LCL6variantsNorm.normed_vcf,
LCL7_vcf=LCL7variantsNorm.normed_vcf,
LCL8_vcf=LCL8variantsNorm.normed_vcf,
LCL7_name=sample[6],
LCL8_name=sample[7],
family_name=sample[8],
LCL7_name=quartet[6],
LCL8_name=quartet[7],
child_name=quartet[5],
ref_dir=ref_dir,
fasta=fasta,
cluster_config=cluster_config,
disk_size=disk_size
}
call zipIndex.zipIndex as zipIndex{
call zipIndex.zipIndex as LCL5zipIndex {
input:
vcf=LCL5mendelian.trio_vcf,
sample="LCL5",
family_name=quartet[8],
cluster_config=cluster_config,
disk_size=disk_size
}
call zipIndex.zipIndex as LCL6zipIndex {
input:
vcf=LCL6mendelian.trio_vcf,
sample="LCL6",
family_name=quartet[8],
cluster_config=cluster_config,
disk_size=disk_size
}
call VCFrename.VCFrename as LCL5VCFrename {
input:
trio_vcf_gz=LCL5zipIndex.vcf_gz,
trio_vcf_idx=LCL5zipIndex.vcf_idx,
mother_name=quartet[7],
father_name=quartet[6],
child_name=quartet[4],
family_name=quartet[8],
child="LCL5",
cluster_config=cluster_config,
disk_size=disk_size
}
call VCFrename.VCFrename as LCL6VCFrename {
input:
trio_vcf_gz=LCL6zipIndex.vcf_gz,
trio_vcf_idx=LCL6zipIndex.vcf_idx,
mother_name=quartet[7],
father_name=quartet[6],
child_name=quartet[5],
family_name=quartet[8],
child="LCL6",
cluster_config=cluster_config,
disk_size=disk_size
}
call mergeSister.mergeSister as mergeSister {
input:
mother_vcf=mendelian.mother_vcf,
father_vcf=mendelian.father_vcf,
twins_vcf=mendelian.twins_vcf,
family_name=sample[8],
LCL5_trio_vcf_gz=LCL5VCFrename.rename_trio_vcf_gz,
LCL5_trio_vcf_idx=LCL5VCFrename.rename_trio_vcf_idx,
LCL6_trio_vcf_gz=LCL6VCFrename.rename_trio_vcf_gz,
LCL6_trio_vcf_idx=LCL6VCFrename.rename_trio_vcf_idx,
family_name=quartet[8],
cluster_config=cluster_config,
disk_size=disk_size
}
call VCFrename.VCFrename as VCFrename{
input:
mother_vcf_gz=zipIndex.mother_vcf_gz,
father_vcf_gz=zipIndex.father_vcf_gz,
twins_vcf_gz=zipIndex.twins_vcf_gz,
mother_vcf_idx=zipIndex.mother_vcf_idx,
father_vcf_idx=zipIndex.father_vcf_idx,
twins_vcf_idx=zipIndex.twins_vcf_idx,
mother_name=sample[7],
father_name=sample[6],
family_name=sample[8],
call reformVCF.reformVCF as reformVCF {
input:
family_mendelian_info=mergeSister.family_mendelian_info,
family_name=quartet[8],
cluster_config=cluster_config,
disk_size=disk_size
}
call zipIndex.zipIndex as LCL5familyzipIndex {
input:
vcf=reformVCF.LCL5_family_info,
sample='LCL5',
family_name=quartet[8],
cluster_config=cluster_config,
disk_size=disk_size
}
call zipIndex.zipIndex as LCL6familyzipIndex {
input:
vcf=reformVCF.LCL6_family_info,
sample='LCL6',
family_name=quartet[8],
cluster_config=cluster_config,
disk_size=disk_size
}
call zipIndex.zipIndex as LCL7familyzipIndex {
input:
vcf=reformVCF.LCL7_family_info,
sample='LCL7',
family_name=quartet[8],
cluster_config=cluster_config,
disk_size=disk_size
}
call zipIndex.zipIndex as LCL8familyzipIndex {
input:
vcf=reformVCF.LCL8_family_info,
sample='LCL8',
family_name=quartet[8],
cluster_config=cluster_config,
disk_size=disk_size
}
}

call merge.merge as merge {
call merge.merge as LCL5merge {
input:
family_vcf_gz=LCL5familyzipIndex.vcf_gz,
family_vcf_idx=LCL5familyzipIndex.vcf_idx,
sample="LCL5",
cluster_config=cluster_config,
disk_size=disk_size
}
call merge.merge as LCL6merge {
input:
family_vcf_gz=LCL6familyzipIndex.vcf_gz,
family_vcf_idx=LCL6familyzipIndex.vcf_idx,
sample="LCL6",
cluster_config=cluster_config,
disk_size=disk_size
}
call merge.merge as LCL7merge {
input:
family_vcf_gz=LCL7familyzipIndex.vcf_gz,
family_vcf_idx=LCL7familyzipIndex.vcf_idx,
sample="LCL7",
cluster_config=cluster_config,
disk_size=disk_size
}
call merge.merge as LCL8merge {
input:
rename_mother_vcf_gz=VCFrename.rename_mother_vcf_gz,
rename_mother_vcf_idx=VCFrename.rename_mother_vcf_idx,
rename_father_vcf_gz=VCFrename.rename_father_vcf_gz,
rename_father_vcf_idx=VCFrename.rename_father_vcf_idx,
rename_twins_vcf_gz=VCFrename.rename_twins_vcf_gz,
rename_twins_vcf_idx=VCFrename.rename_twins_vcf_idx,
mapper_caller=mapper_caller,
family_vcf_gz=LCL8familyzipIndex.vcf_gz,
family_vcf_idx=LCL8familyzipIndex.vcf_idx,
sample="LCL8",
cluster_config=cluster_config,
disk_size=disk_size
}

Loading…
Cancel
Save