@@ -0,0 +1,139 @@ | |||
# import modules | |||
import sys, argparse, os | |||
import fileinput | |||
import re | |||
# Command-line interface: the merged family VCF to split, and the family name
# that prefixes every output file and every per-sample column name.
parser = argparse.ArgumentParser(description="This script is to split samples in VCF files and rewrite to the right style")
parser.add_argument('-vcf', '--familyVCF', type=str, help='VCF with sister and mendelian infomation', required=True)
parser.add_argument('-name', '--familyName', type=str, help='Family name of the VCF file', required=True)
args = parser.parse_args()

# Rename input:
inputFile = args.familyVCF
family_name = args.familyName

# Output files: one single-sample VCF per quartet member plus one four-sample
# family VCF. The handles are module-level because process() writes one line
# to each of them for every record.
LCL5_name = family_name + '.LCL5.vcf'
LCL5file = open(LCL5_name, 'w')
LCL6_name = family_name + '.LCL6.vcf'
LCL6file = open(LCL6_name, 'w')
LCL7_name = family_name + '.LCL7.vcf'
LCL7file = open(LCL7_name, 'w')
LCL8_name = family_name + '.LCL8.vcf'
LCL8file = open(LCL8_name, 'w')
family_filename = family_name + '.vcf'
familyfile = open(family_filename, 'w')

# Meta-information header shared by all five outputs.
# NOTE(review): the Descriptions below say "0 for ... consistent", but
# process() writes sister=1 when the two sister genotypes are equal and
# trioLCL5/trioLCL6=1 when the mendelian INFO is exactly 'MD=1' -- confirm
# which polarity is intended and align the text or the values.
# NOTE(review): the fields are declared Number=0,Type=Flag yet are written as
# key=value pairs; check that downstream VCF parsers accept this.
vcfheader = '''##fileformat=VCFv4.2
##FILTER=<ID=PASS,Description="Voted by at least two replicates, six callers and two sequencing sites">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##INFO=<ID=sister,Number=0,Type=Flag,Description="0 for sister consistent, 1 for sister inconsistent">
##INFO=<ID=trioLCL5,Number=0,Type=Flag,Description="0 for trio consistent, 1 for trio inconsistent">
##INFO=<ID=trioLCL6,Number=0,Type=Flag,Description="0 for trio consistent, 1 for trio inconsistent">
##contig=<ID=chr1,length=248956422>
##contig=<ID=chr2,length=242193529>
##contig=<ID=chr3,length=198295559>
##contig=<ID=chr4,length=190214555>
##contig=<ID=chr5,length=181538259>
##contig=<ID=chr6,length=170805979>
##contig=<ID=chr7,length=159345973>
##contig=<ID=chr8,length=145138636>
##contig=<ID=chr9,length=138394717>
##contig=<ID=chr10,length=133797422>
##contig=<ID=chr11,length=135086622>
##contig=<ID=chr12,length=133275309>
##contig=<ID=chr13,length=114364328>
##contig=<ID=chr14,length=107043718>
##contig=<ID=chr15,length=101991189>
##contig=<ID=chr16,length=90338345>
##contig=<ID=chr17,length=83257441>
##contig=<ID=chr18,length=80373285>
##contig=<ID=chr19,length=58617616>
##contig=<ID=chr20,length=64444167>
##contig=<ID=chr21,length=46709983>
##contig=<ID=chr22,length=50818468>
##contig=<ID=chrX,length=156040895>
'''

# write VCF
# The nine fixed VCF columns, shared by every "#CHROM" line below.
_FIXED_COLUMNS = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t'
LCL5colname = _FIXED_COLUMNS + family_name + '_LCL5' + '\n'
LCL6colname = _FIXED_COLUMNS + family_name + '_LCL6' + '\n'
LCL7colname = _FIXED_COLUMNS + family_name + '_LCL7' + '\n'
LCL8colname = _FIXED_COLUMNS + family_name + '_LCL8' + '\n'
# Fixed: the last sample column used to be '_LCL8' (stray underscore), which
# did not match the naming of the other three family columns.
familycolname = _FIXED_COLUMNS + 'LCL5\t' + 'LCL6\t' + 'LCL7\t' + 'LCL8' + '\n'

# Every output starts with the shared header followed by its own column line.
for _handle, _colname in (
    (LCL5file, LCL5colname),
    (LCL6file, LCL6colname),
    (LCL7file, LCL7colname),
    (LCL8file, LCL8colname),
    (familyfile, familycolname),
):
    _handle.write(vcfheader)
    _handle.write(_colname)
# reform VCF | |||
def process(oneLine):
    """Split one merged family record into the five output VCFs.

    The record is tab-split and read purely by position:

    * columns 0-6 and 8 are copied through unchanged;
    * column 9 is written as the LCL8 genotype, 10 as LCL7, 11 as LCL6 and
      14 as LCL5;
    * columns 12/13 are only used to back-fill 9/10 (and the reverse) when
      one sister has no call -- presumably they carry the same parents as
      called in the other sister's trio; confirm against the upstream merge;
    * columns 7 and 15 are compared with the literal ``MD=1`` to produce the
      ``trioLCL6`` and ``trioLCL5`` flags respectively.

    One line is written to each of the module-level handles ``LCL5file``,
    ``LCL6file``, ``LCL7file``, ``LCL8file`` and ``familyfile``.

    Blank lines are ignored instead of failing on a missing column.

    Raises:
        ValueError: if a non-blank record has fewer than 16 columns.
    """
    min_columns = 16  # highest index read below is 15

    record = oneLine.strip()
    if not record:
        # Nothing to split; a trailing empty line must not abort the run.
        return
    strings = record.split('\t')
    if len(strings) < min_columns:
        raise ValueError(
            'expected at least %d tab-separated columns, got %d: %r'
            % (min_columns, len(strings), record)
        )

    # replace .
    # Column 11 (LCL6) has no call: treat it as homozygous reference and take
    # columns 9/10 from columns 12/13.
    if strings[11] == '.':
        strings[11] = '0/0'
        strings[9] = strings[12]
        strings[10] = strings[13]
    # Column 14 (LCL5) has no call: the mirror case. This must run after the
    # block above because it reads the possibly back-filled columns 9/10.
    if strings[14] == '.':
        strings[14] = '0/0'
        strings[12] = strings[9]
        strings[13] = strings[10]

    # INFO is rebuilt from scratch: sister agreement first, then the two
    # mendelian flags, always in this order.
    info = ';'.join([
        'sister=1' if strings[11] == strings[14] else 'sister=0',
        'trioLCL5=1' if strings[15] == 'MD=1' else 'trioLCL5=0',
        'trioLCL6=1' if strings[7] == 'MD=1' else 'trioLCL6=0',
    ])

    # Everything up to and including FORMAT is identical in all five outputs.
    shared = '\t'.join(strings[:7] + [info, strings[8]]) + '\t'

    LCL5file.write(shared + strings[14] + '\n')
    LCL6file.write(shared + strings[11] + '\n')
    LCL7file.write(shared + strings[10] + '\n')
    LCL8file.write(shared + strings[9] + '\n')
    # Family file carries the four genotypes in LCL5, LCL6, LCL7, LCL8 order.
    family_genotypes = [strings[14], strings[11], strings[10], strings[9]]
    familyfile.write(shared + '\t'.join(family_genotypes) + '\n')
# Stream the input VCF and hand every record to process(). Lines starting
# with '#' (meta-information and the column header) are skipped because the
# outputs already received their own header above.
try:
    for line in fileinput.input(inputFile):
        # startswith replaces re.match('^\#', ...): '\#' is an invalid escape
        # in a non-raw string and a regex is not needed for a prefix test.
        if not line.startswith('#'):
            process(line)
finally:
    # Release the input stream and flush/close all five outputs even when a
    # record fails to parse, so partially written files are not left buffered.
    fileinput.close()
    for _out in (LCL5file, LCL6file, LCL7file, LCL8file, familyfile):
        _out.close()
@@ -1,17 +1,28 @@ | |||
{ | |||
"{{ project_name }}.LCL7merge.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", | |||
"{{ project_name }}.fasta": "GRCh38.d1.vd1.fa", | |||
"{{ project_name }}.VCFrename.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", | |||
"{{ project_name }}.sister.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/vbt:v1.1", | |||
"{{ project_name }}.mendelian.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/vbt:v1.1", | |||
"{{ project_name }}.LCL6familyzipIndex.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", | |||
"{{ project_name }}.LCL5VCFrename.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", | |||
"{{ project_name }}.LCL6mendelian.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/vbt:v1.1", | |||
"{{ project_name }}.mergeSister.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", | |||
"{{ project_name }}.LCL5mendelian.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/vbt:v1.1", | |||
"{{ project_name }}.disk_size": "150", | |||
"{{ project_name }}.merge.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", | |||
"{{ project_name }}.zipIndex.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", | |||
"{{ project_name }}.inputSamplesFile": "{{ inputSamplesFile }}", | |||
"{{ project_name }}.LCL6merge.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", | |||
"{{ project_name }}.LCL6variantsNorm.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/bcftools:v1.9", | |||
"{{ project_name }}.mapper_caller": "{{ mapper_caller }}", | |||
"{{ project_name }}.LCL6zipIndex.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", | |||
"{{ project_name }}.LCL7familyzipIndex.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", | |||
"{{ project_name }}.LCL5familyzipIndex.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", | |||
"{{ project_name }}.LCL6VCFrename.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", | |||
"{{ project_name }}.LCL5merge.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", | |||
"{{ project_name }}.reformVCF.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/high_confidence_call:v1.1", | |||
"{{ project_name }}.LCL5zipIndex.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", | |||
"{{ project_name }}.cluster_config": "OnDemand bcs.a2.xlarge img-ubuntu-vpc", | |||
"{{ project_name }}.LCL8familyzipIndex.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", | |||
"{{ project_name }}.LCL7variantsNorm.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/bcftools:v1.9", | |||
"{{ project_name }}.LCL8merge.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", | |||
"{{ project_name }}.LCL5variantsNorm.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/bcftools:v1.9", | |||
"{{ project_name }}.LCL8variantsNorm.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/bcftools:v1.9", | |||
"{{ project_name }}.ref_dir": "oss://chinese-quartet/quartet-storage-data/reference_data/" | |||
} | |||
@@ -1,34 +0,0 @@ | |||
# Runs /opt/extract_vcf_information.py once for each of the four input VCFs
# (SNV / INDEL x train / test) and exposes the four resulting .txt files.
task ExtractVCFinfo {
  # Input VCFs.
  File snv_train
  File snv_test
  File indel_train
  File indel_test

  # Output prefixes: each input's file name with a trailing ".vcf" removed.
  String snv_train_sampleName = basename(snv_train,".vcf")
  String snv_test_sampleName = basename(snv_test,".vcf")
  String indel_train_sampleName = basename(indel_train,".vcf")
  String indel_test_sampleName = basename(indel_test,".vcf")

  # Execution settings consumed by the runtime section.
  String docker
  String cluster_config
  String disk_size

  command <<<
    python /opt/extract_vcf_information.py -i ${snv_train} -o ${snv_train_sampleName}.txt
    python /opt/extract_vcf_information.py -i ${snv_test} -o ${snv_test_sampleName}.txt
    python /opt/extract_vcf_information.py -i ${indel_train} -o ${indel_train_sampleName}.txt
    python /opt/extract_vcf_information.py -i ${indel_test} -o ${indel_test_sampleName}.txt
  >>>

  runtime {
    docker: docker
    cluster: cluster_config
    systemDisk: "cloud_ssd 40"
    dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
  }

  # Despite the *_vcf names, these are the .txt files written by the script.
  output {
    File snv_train_vcf = "${snv_train_sampleName}.txt"
    File snv_test_vcf = "${snv_test_sampleName}.txt"
    File indel_train_vcf = "${indel_train_sampleName}.txt"
    File indel_test_vcf = "${indel_test_sampleName}.txt"
  }
}
@@ -1,27 +0,0 @@ | |||
# Runs /opt/select_small_variants_supported_by_all_callsets.py on the
# violation and the consistent merged VCFs, one invocation each.
task KeepVar {
  # Merged VCFs to filter.
  File violation_merged_vcf
  File consistent_merged_vcf

  # Execution settings consumed by the runtime section.
  String docker
  String cluster_config
  String disk_size

  command <<<
    python /opt/select_small_variants_supported_by_all_callsets.py -i ${violation_merged_vcf} -o violation.all.selected
    python /opt/select_small_variants_supported_by_all_callsets.py -i ${consistent_merged_vcf} -o consistent.all.selected
  >>>

  runtime {
    docker: docker
    cluster: cluster_config
    systemDisk: "cloud_ssd 40"
    dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
  }

  # NOTE(review): the script is given a bare prefix via -o; the names below
  # assume it appends ".vcf" and "_outlier.vcf" itself -- confirm in the script.
  output {
    File violation_keeped_vcf = "violation.all.selected.vcf"
    File violation_outlier_vcf = "violation.all.selected_outlier.vcf"
    File consistent_keeped_vcf = "consistent.all.selected.vcf"
    File consistent_outlier_vcf = "consistent.all.selected_outlier.vcf"
  }
}
@@ -1,69 +0,0 @@ | |||
# Splits a normalised VCF into SNV / non-SNV subsets, each further divided
# into a "train" part (records also present in keeped_vcf) and a "test" part
# (records absent from it).
#
# Steps performed by the command:
#   1. Separate header lines from records; records whose ALT column (5)
#      contains a comma go to <sample>.MNP.vcf, all others to
#      <sample>.MNPremoved.vcf.
#   2. bgzip + index the comma-free VCF and keeped_vcf.
#   3. rtg vcffilter with --snps-only / --non-snps-only, combined with
#      --include-vcf / --exclude-vcf against keeped_vcf.
#   4. Decompress the four train/test results next to their .gz files.
task SepSnvIndel {
  File vcf
  # Sample prefix: input file name with a trailing ".normed.vcf" removed.
  String sampleName = basename(vcf,".normed.vcf")
  File keeped_vcf

  # Execution settings consumed by the runtime section.
  String docker
  String cluster_config
  String disk_size

  # The header grep is anchored ('^#') so it selects exactly the lines that
  # sed '/^#/d' removes from the body; the previous unanchored pattern also
  # copied any record containing '#' into the header.
  command <<<
    cat ${vcf} | grep '^#' > header
    cat ${vcf} | sed '/^#/d' | awk '$5!~/,/' > removed.body
    cat ${vcf} | sed '/^#/d' | awk '$5~/,/' > MNP.body
    cat header removed.body > ${sampleName}.MNPremoved.vcf
    cat header MNP.body > ${sampleName}.MNP.vcf
    rtg bgzip ${sampleName}.MNPremoved.vcf
    rtg index -f vcf ${sampleName}.MNPremoved.vcf.gz
    rtg bgzip ${keeped_vcf} -c > all.selected.vcf.gz
    rtg index -f vcf all.selected.vcf.gz
    rtg vcffilter -i ${sampleName}.MNPremoved.vcf.gz -o ${sampleName}.normed.snv.train.vcf.gz --include-vcf=all.selected.vcf.gz --snps-only
    rtg vcffilter -i ${sampleName}.MNPremoved.vcf.gz -o ${sampleName}.normed.snv.test.vcf.gz --exclude-vcf=all.selected.vcf.gz --snps-only
    rtg vcffilter -i ${sampleName}.MNPremoved.vcf.gz -o ${sampleName}.normed.indel.train.vcf.gz --include-vcf=all.selected.vcf.gz --non-snps-only
    rtg vcffilter -i ${sampleName}.MNPremoved.vcf.gz -o ${sampleName}.normed.indel.test.vcf.gz --exclude-vcf=all.selected.vcf.gz --non-snps-only
    rtg vcffilter -i ${sampleName}.MNPremoved.vcf.gz -o ${sampleName}.normed.snv.vcf.gz --snps-only
    rtg vcffilter -i ${sampleName}.MNPremoved.vcf.gz -o ${sampleName}.normed.indel.vcf.gz --non-snps-only
    gzip -d ${sampleName}.normed.snv.train.vcf.gz -c > ${sampleName}.normed.snv.train.vcf
    gzip -d ${sampleName}.normed.snv.test.vcf.gz -c > ${sampleName}.normed.snv.test.vcf
    gzip -d ${sampleName}.normed.indel.train.vcf.gz -c > ${sampleName}.normed.indel.train.vcf
    gzip -d ${sampleName}.normed.indel.test.vcf.gz -c > ${sampleName}.normed.indel.test.vcf
  >>>

  runtime {
    docker: docker
    cluster: cluster_config
    systemDisk: "cloud_ssd 40"
    dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
  }

  # NOTE(review): no command above creates the *.tbi files explicitly for the
  # vcffilter outputs; presumably rtg vcffilter writes them -- confirm.
  output {
    File MNP="${sampleName}.MNP.vcf"
    File snv_gz = "${sampleName}.normed.snv.vcf.gz"
    File snv_idx = "${sampleName}.normed.snv.vcf.gz.tbi"
    File indel_gz = "${sampleName}.normed.indel.vcf.gz"
    File indel_idx = "${sampleName}.normed.indel.vcf.gz.tbi"
    File snv_train = "${sampleName}.normed.snv.train.vcf"
    File snv_test = "${sampleName}.normed.snv.test.vcf"
    File indel_train = "${sampleName}.normed.indel.train.vcf"
    File indel_test = "${sampleName}.normed.indel.test.vcf"
    File snv_train_gz = "${sampleName}.normed.snv.train.vcf.gz"
    File snv_test_gz = "${sampleName}.normed.snv.test.vcf.gz"
    File indel_train_gz = "${sampleName}.normed.indel.train.vcf.gz"
    File indel_test_gz = "${sampleName}.normed.indel.test.vcf.gz"
    File snv_train_idx = "${sampleName}.normed.snv.train.vcf.gz.tbi"
    File snv_test_idx = "${sampleName}.normed.snv.test.vcf.gz.tbi"
    File indel_train_idx = "${sampleName}.normed.indel.train.vcf.gz.tbi"
    File indel_test_idx = "${sampleName}.normed.indel.test.vcf.gz.tbi"
  }
}
@@ -1,71 +0,0 @@ | |||
# Subsets SNV and INDEL VCFs by BED regions with rtg vcffilter:
#   true    = test VCF restricted to the "true" BED
#   false   = test VCF restricted to the "false" BED
#   remain  = full VCF with the "false" BED regions excluded
#   padding = full VCF restricted to the padding BED
task SepTrueFalse {
  # Region files.
  File snv_true_bed
  File snv_false_bed
  File indel_true_bed
  File indel_false_bed
  File snv_padding
  File indel_padding

  # Full and test VCFs. The *_idx inputs are not referenced in the command;
  # presumably they are declared so the indexes are localised next to the
  # VCFs -- confirm.
  File snv_gz
  File indel_gz
  File snv_idx
  File indel_idx
  File snv_test_gz
  File indel_test_gz
  File snv_test_idx
  File indel_test_idx

  # Sample prefix: snv_gz file name with a trailing ".normed.snv.vcf.gz" removed.
  String sampleName = basename(snv_gz,".normed.snv.vcf.gz")

  # Execution settings consumed by the runtime section.
  String docker
  String cluster_config
  String disk_size

  command <<<
    rtg vcffilter -i ${snv_test_gz} -o ${sampleName}.true.snv.vcf.gz --include-bed=${snv_true_bed}
    rtg vcffilter -i ${snv_test_gz} -o ${sampleName}.false.snv.vcf.gz --include-bed=${snv_false_bed}
    rtg vcffilter -i ${snv_gz} -o ${sampleName}.remain.snv.vcf.gz --exclude-bed=${snv_false_bed}
    rtg vcffilter -i ${snv_gz} -o ${sampleName}.padding.snv.vcf.gz --include-bed=${snv_padding}
    rtg vcffilter -i ${indel_test_gz} -o ${sampleName}.true.indel.vcf.gz --include-bed=${indel_true_bed}
    rtg vcffilter -i ${indel_test_gz} -o ${sampleName}.false.indel.vcf.gz --include-bed=${indel_false_bed}
    rtg vcffilter -i ${indel_gz} -o ${sampleName}.remain.indel.vcf.gz --exclude-bed=${indel_false_bed}
    rtg vcffilter -i ${indel_gz} -o ${sampleName}.padding.indel.vcf.gz --include-bed=${indel_padding}
  >>>

  runtime {
    docker: docker
    cluster: cluster_config
    systemDisk: "cloud_ssd 40"
    dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
  }

  # Each filtered VCF is paired with a .tbi index of the same name.
  output {
    File snv_true_vcf = "${sampleName}.true.snv.vcf.gz"
    File snv_true_vcf_index = "${sampleName}.true.snv.vcf.gz.tbi"
    File snv_false_vcf = "${sampleName}.false.snv.vcf.gz"
    File snv_false_vcf_index = "${sampleName}.false.snv.vcf.gz.tbi"
    File snv_remain_vcf = "${sampleName}.remain.snv.vcf.gz"
    File snv_remain_vcf_index = "${sampleName}.remain.snv.vcf.gz.tbi"
    File snv_padding_vcf = "${sampleName}.padding.snv.vcf.gz"
    File snv_padding_vcf_index = "${sampleName}.padding.snv.vcf.gz.tbi"
    File indel_true_vcf = "${sampleName}.true.indel.vcf.gz"
    File indel_true_vcf_index = "${sampleName}.true.indel.vcf.gz.tbi"
    File indel_false_vcf = "${sampleName}.false.indel.vcf.gz"
    File indel_false_vcf_index = "${sampleName}.false.indel.vcf.gz.tbi"
    File indel_remain_vcf = "${sampleName}.remain.indel.vcf.gz"
    File indel_remain_vcf_index = "${sampleName}.remain.indel.vcf.gz.tbi"
    File indel_padding_vcf = "${sampleName}.padding.indel.vcf.gz"
    File indel_padding_vcf_index = "${sampleName}.padding.indel.vcf.gz.tbi"
  }
}
@@ -1,26 +1,21 @@ | |||
task VCFrename { | |||
File mother_vcf_gz | |||
File father_vcf_gz | |||
File twins_vcf_gz | |||
File mother_vcf_idx | |||
File father_vcf_idx | |||
File twins_vcf_idx | |||
File trio_vcf_gz | |||
File trio_vcf_idx | |||
String mother_name | |||
String father_name | |||
String child_name | |||
String family_name | |||
String child | |||
String docker | |||
String cluster_config | |||
String disk_size | |||
command <<< | |||
echo "MOTHER ${mother_name}" > mother_rename.txt | |||
rtg vcfannotate -i ${mother_vcf_gz} -o ${mother_name}.rename.vcf.gz --relabel=mother_rename.txt | |||
echo "MOTHER ${mother_name}.${child} | |||
FATHER ${father_name}.${child} | |||
CHILD ${child_name}" > rename.txt | |||
echo "FATHER ${father_name}" > father_rename.txt | |||
rtg vcfannotate -i ${father_vcf_gz} -o ${father_name}.rename.vcf.gz --relabel=father_rename.txt | |||
echo "CHILD ${family_name}" > child_rename.txt | |||
rtg vcfannotate -i ${twins_vcf_gz} -o ${family_name}.twins.rename.vcf.gz --relabel=child_rename.txt | |||
rtg vcfannotate -i ${trio_vcf_gz} -o ${family_name}.${child}.rename.vcf.gz --relabel=rename.txt | |||
>>> | |||
runtime { | |||
@@ -30,11 +25,7 @@ task VCFrename { | |||
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/" | |||
} | |||
output { | |||
File rename_mother_vcf_gz = "${mother_name}.rename.vcf.gz" | |||
File rename_father_vcf_gz = "${father_name}.rename.vcf.gz" | |||
File rename_twins_vcf_gz = "${family_name}.twins.rename.vcf.gz" | |||
File rename_mother_vcf_idx = "${mother_name}.rename.vcf.gz.tbi" | |||
File rename_father_vcf_idx = "${father_name}.rename.vcf.gz.tbi" | |||
File rename_twins_vcf_idx = "${family_name}.twins.rename.vcf.gz.tbi" | |||
File rename_trio_vcf_gz = "${family_name}.${child}.rename.vcf.gz" | |||
File rename_trio_vcf_idx = "${family_name}.${child}.rename.vcf.gz.tbi" | |||
} | |||
} |
@@ -1,10 +1,10 @@ | |||
task mendelian { | |||
File sister_vcf | |||
File child_vcf | |||
File LCL7_vcf | |||
File LCL8_vcf | |||
String LCL7_name | |||
String LCL8_name | |||
String family_name | |||
String child_name | |||
File ref_dir | |||
String fasta | |||
String docker | |||
@@ -13,20 +13,12 @@ task mendelian { | |||
command <<< | |||
export LD_LIBRARY_PATH=/opt/htslib-1.9 | |||
nt=$(nproc) | |||
mkdir VBT | |||
/opt/VBT-TrioAnalysis/vbt mendelian -ref ${ref_dir}/${fasta} -mother ${LCL8_vcf} -father ${LCL7_vcf} -child ${sister_vcf} -outDir VBT -out-prefix ${family_name} --output-violation-regions | |||
cat VBT/${family_name}_trio.vcf | grep '#' | cut -f1-9,10 > mother_header | |||
cat VBT/${family_name}_trio.vcf | grep -v '#' | cut -f1-9,10 | grep 'MD=1' | grep -v '0/0' | cat mother_header - > ${LCL8_name}.sister.mendelian.gt.vcf | |||
cat VBT/${family_name}_trio.vcf | grep '#' | cut -f1-9,11 > father_header | |||
cat VBT/${family_name}_trio.vcf | grep -v '#' | cut -f1-9,11 | grep 'MD=1' | grep -v '0/0' | cat father_header - > ${LCL7_name}.sister.mendelian.gt.vcf | |||
cat VBT/${family_name}_trio.vcf | grep '#' | cut -f1-9,12 > twin_header | |||
cat VBT/${family_name}_trio.vcf | grep -v '#' | cut -f1-9,12 | grep 'MD=1' | grep -v '0/0' | cat twin_header - > ${family_name}.twins.sister.mendelian.gt.vcf | |||
/opt/VBT-TrioAnalysis/vbt mendelian -ref ${ref_dir}/${fasta} -mother ${LCL8_vcf} -father ${LCL7_vcf} -child ${child_vcf} -outDir VBT -out-prefix ${child_name}.family --output-violation-regions -thread-count $nt | |||
cat VBT/${child_name}.family_trio.vcf > ${child_name}.family.vcf | |||
>>> | |||
runtime { | |||
@@ -37,8 +29,6 @@ task mendelian { | |||
} | |||
output { | |||
Array[File] vbt_mendelian = glob("VBT/*") | |||
File mother_vcf = "${LCL8_name}.sister.mendelian.gt.vcf" | |||
File father_vcf = "${LCL7_name}.sister.mendelian.gt.vcf" | |||
File twins_vcf = "${family_name}.twins.sister.mendelian.gt.vcf" | |||
File trio_vcf = "${child_name}.family.vcf" | |||
} | |||
} |
@@ -1,21 +1,15 @@ | |||
task merge { | |||
Array[File] rename_mother_vcf_gz | |||
Array[File] rename_mother_vcf_idx | |||
Array[File] rename_father_vcf_gz | |||
Array[File] rename_father_vcf_idx | |||
Array[File] rename_twins_vcf_gz | |||
Array[File] rename_twins_vcf_idx | |||
String mapper_caller | |||
Array[File] family_vcf_gz | |||
Array[File] family_vcf_idx | |||
String sample | |||
String docker | |||
String cluster_config | |||
String disk_size | |||
command <<< | |||
rtg vcfmerge --force-merge-all --no-gzip -o LCL8.${mapper_caller}.sister.consistent.merged.vcf ${sep=" " rename_mother_vcf_gz} | |||
rtg vcfmerge --force-merge-all --no-gzip -o LCL7.${mapper_caller}.sister.consistent.merged.vcf ${sep=" " rename_father_vcf_gz} | |||
rtg vcfmerge --force-merge-all --no-gzip -o ${sample}.merged.vcf ${sep=" " family_vcf_gz} | |||
rtg vcfmerge --force-merge-all --no-gzip -o Twins.${mapper_caller}.sister.consistent.merged.vcf ${sep=" " rename_twins_vcf_gz} | |||
>>> | |||
runtime { | |||
@@ -25,8 +19,6 @@ task merge { | |||
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/" | |||
} | |||
output { | |||
File mother_merged_vcf = "LCL8.${mapper_caller}.sister.consistent.merged.vcf" | |||
File father_merged_vcf = "LCL7.${mapper_caller}.sister.consistent.merged.vcf" | |||
File twins_merged_vcf = "Twins.${mapper_caller}.sister.consistent.merged.vcf" | |||
File merged_vcf = "${sample}.merged.vcf" | |||
} | |||
} |
@@ -1,35 +0,0 @@ | |||
# Combines per-sample BED files with `bedtools multiinter`, producing one
# merged file per category (true, false, padding). For every category the SNV
# files are listed first, followed by the INDEL files.
task mergeBed {
  # Per-sample region files, grouped by category.
  Array[File] snv_true_bed
  Array[File] snv_false_bed
  Array[File] indel_true_bed
  Array[File] indel_false_bed
  Array[File] indel_padding
  Array[File] snv_padding

  # Execution settings consumed by the runtime section.
  String docker
  String cluster_config
  String disk_size

  command <<<
    /opt/ccdg/bedtools-2.27.1/bin/bedtools multiinter -i ${sep=" " snv_true_bed} ${sep=" " indel_true_bed} > merged.true.bed
    /opt/ccdg/bedtools-2.27.1/bin/bedtools multiinter -i ${sep=" " snv_false_bed} ${sep=" " indel_false_bed} > merged.false.bed
    /opt/ccdg/bedtools-2.27.1/bin/bedtools multiinter -i ${sep=" " snv_padding} ${sep=" " indel_padding} > merged.padding.bed
  >>>

  runtime {
    docker: docker
    cluster: cluster_config
    systemDisk: "cloud_ssd 40"
    dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
  }

  output {
    File true_bed = "merged.true.bed"
    File false_bed = "merged.false.bed"
    File padding = "merged.padding.bed"
  }
}
@@ -0,0 +1,34 @@ | |||
# Builds <family_name>.trio.info.vcf from the LCL5 and LCL6 trio VCFs.
#
# The two trio VCFs are merged twice, once in each order. Column 8 (INFO) of
# the LCL5-first merge is cut out and pasted as an extra trailing column onto
# the rows of the LCL6-first merge, so each output row carries both INFO
# values. The "##" meta lines come from the LCL5-first merge; the "#CHROM"
# line is treated as a body row and therefore also gets the extra column.
#
# NOTE(review): paste pairs rows purely by position, so this relies on both
# merges emitting the same records in the same order -- confirm for
# rtg vcfmerge.
task mergeSister {
  File LCL5_trio_vcf_gz
  File LCL5_trio_vcf_idx
  File LCL6_trio_vcf_gz
  File LCL6_trio_vcf_idx
  String family_name

  # Execution settings consumed by the runtime section.
  String docker
  String cluster_config
  String disk_size

  # The grep patterns are anchored ('^##') so only real meta-information
  # lines are classified as header; the previous unanchored '##' would also
  # move any record that merely contains "##" out of the body.
  command <<<
    rtg vcfmerge -o LCL5.LCL6.merged.vcf.gz ${LCL5_trio_vcf_gz} ${LCL6_trio_vcf_gz}
    rtg vcfmerge -o LCL6.LCL5.merged.vcf.gz ${LCL6_trio_vcf_gz} ${LCL5_trio_vcf_gz}
    zcat LCL5.LCL6.merged.vcf.gz | grep '^##' > header
    zcat LCL5.LCL6.merged.vcf.gz | grep -v '^##' | cut -f8 > LCL5.mendelian
    zcat LCL6.LCL5.merged.vcf.gz | grep -v '^##' | paste - LCL5.mendelian > body
    cat header body > ${family_name}.trio.info.vcf
  >>>

  runtime {
    docker: docker
    cluster: cluster_config
    systemDisk: "cloud_ssd 40"
    dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
  }

  output {
    File family_mendelian_info = "${family_name}.trio.info.vcf"
  }
}
@@ -1,61 +0,0 @@ | |||
# Merges per-sample VCFs into one file per category with `rtg vcfmerge
# --force-merge-all`. There are eight categories: SNV and INDEL, each split
# into true, false, remain and padding.
task mergeVCF {
  # SNV inputs. The *_index arrays are not referenced in the command;
  # presumably they are declared so the indexes are localised with the VCFs
  # -- confirm.
  Array[File] snv_true_vcf
  Array[File] snv_true_vcf_index
  Array[File] snv_false_vcf
  Array[File] snv_false_vcf_index
  Array[File] snv_remain_vcf
  Array[File] snv_remain_vcf_index
  Array[File] snv_padding_vcf
  Array[File] snv_padding_vcf_index

  # INDEL inputs, same layout as the SNV inputs.
  Array[File] indel_true_vcf
  Array[File] indel_true_vcf_index
  Array[File] indel_false_vcf
  Array[File] indel_false_vcf_index
  Array[File] indel_remain_vcf
  Array[File] indel_remain_vcf_index
  Array[File] indel_padding_vcf
  Array[File] indel_padding_vcf_index

  # Prefix of every output file.
  String quartet_sample

  # Execution settings consumed by the runtime section.
  String docker
  String cluster_config
  String disk_size

  # NOTE(review): --no-gzip is passed while every output is named *.vcf.gz;
  # confirm whether the resulting files are compressed and whether rtg keeps
  # the requested file name in that case.
  command <<<
    rtg vcfmerge --force-merge-all --no-gzip -o ${quartet_sample}.snv.true.vcf.gz ${sep=" " snv_true_vcf}
    rtg vcfmerge --force-merge-all --no-gzip -o ${quartet_sample}.snv.false.vcf.gz ${sep=" " snv_false_vcf}
    rtg vcfmerge --force-merge-all --no-gzip -o ${quartet_sample}.snv.remain.vcf.gz ${sep=" " snv_remain_vcf}
    rtg vcfmerge --force-merge-all --no-gzip -o ${quartet_sample}.snv.padding.vcf.gz ${sep=" " snv_padding_vcf}
    rtg vcfmerge --force-merge-all --no-gzip -o ${quartet_sample}.indel.true.vcf.gz ${sep=" " indel_true_vcf}
    rtg vcfmerge --force-merge-all --no-gzip -o ${quartet_sample}.indel.false.vcf.gz ${sep=" " indel_false_vcf}
    rtg vcfmerge --force-merge-all --no-gzip -o ${quartet_sample}.indel.remain.vcf.gz ${sep=" " indel_remain_vcf}
    rtg vcfmerge --force-merge-all --no-gzip -o ${quartet_sample}.indel.padding.vcf.gz ${sep=" " indel_padding_vcf}
  >>>

  runtime {
    docker: docker
    cluster: cluster_config
    systemDisk: "cloud_ssd 40"
    dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
  }

  output {
    File merged_snv_true = "${quartet_sample}.snv.true.vcf.gz"
    File merged_snv_false = "${quartet_sample}.snv.false.vcf.gz"
    File merged_snv_remain = "${quartet_sample}.snv.remain.vcf.gz"
    File merged_snv_padding = "${quartet_sample}.snv.padding.vcf.gz"
    File merged_indel_true = "${quartet_sample}.indel.true.vcf.gz"
    File merged_indel_false = "${quartet_sample}.indel.false.vcf.gz"
    File merged_indel_remain = "${quartet_sample}.indel.remain.vcf.gz"
    File merged_indel_padding = "${quartet_sample}.indel.padding.vcf.gz"
  }
}
@@ -0,0 +1,30 @@ | |||
# Runs /opt/reformVCF.py on the merged family VCF. The script is given the
# family name via -name, and the outputs declared below are the five files
# named after it: one per quartet member (LCL5-LCL8) plus the family VCF.
task reformVCF {
  File family_mendelian_info
  # A plain label, not a path: it is passed to -name and used to build the
  # output file names. It was declared as File, which makes the engine treat
  # the value as a file to localise; String matches how the value is used
  # here and how family_name is declared in the sibling tasks.
  String family_name

  # Execution settings consumed by the runtime section.
  String docker
  String cluster_config
  String disk_size

  command <<<
    python /opt/reformVCF.py -vcf ${family_mendelian_info} -name ${family_name}
  >>>

  runtime {
    docker: docker
    cluster: cluster_config
    systemDisk: "cloud_ssd 40"
    dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
  }

  output {
    File LCL5_family_info = "${family_name}.LCL5.vcf"
    File LCL6_family_info = "${family_name}.LCL6.vcf"
    File LCL7_family_info = "${family_name}.LCL7.vcf"
    File LCL8_family_info = "${family_name}.LCL8.vcf"
    File family_info = "${family_name}.vcf"
  }
}
@@ -1,21 +1,14 @@ | |||
task zipIndex { | |||
File mother_vcf | |||
File father_vcf | |||
File twins_vcf | |||
File vcf | |||
String sample | |||
String family_name | |||
String docker | |||
String cluster_config | |||
String disk_size | |||
command <<< | |||
rtg bgzip ${mother_vcf} -c > ${family_name}.LCL8.vcf.gz | |||
rtg index -f vcf ${family_name}.LCL8.vcf.gz | |||
rtg bgzip ${father_vcf} -c > ${family_name}.LCL7.vcf.gz | |||
rtg index -f vcf ${family_name}.LCL7.vcf.gz | |||
rtg bgzip ${twins_vcf} -c > ${family_name}.twins.vcf.gz | |||
rtg index -f vcf ${family_name}.twins.vcf.gz | |||
rtg bgzip ${vcf} -c > ${family_name}.${sample}.vcf.gz | |||
rtg index -f vcf ${family_name}.${sample}.vcf.gz | |||
>>> | |||
@@ -26,11 +19,7 @@ task zipIndex { | |||
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/" | |||
} | |||
output { | |||
File mother_vcf_gz = "${family_name}.LCL8.vcf.gz" | |||
File father_vcf_gz = "${family_name}.LCL7.vcf.gz" | |||
File twins_vcf_gz = "${family_name}.twins.vcf.gz" | |||
File mother_vcf_idx = "${family_name}.LCL8.vcf.gz.tbi" | |||
File father_vcf_idx = "${family_name}.LCL7.vcf.gz.tbi" | |||
File twins_vcf_idx = "${family_name}.twins.vcf.gz.tbi" | |||
File vcf_gz = "${family_name}.${sample}.vcf.gz" | |||
File vcf_idx = "${family_name}.${sample}.vcf.gz.tbi" | |||
} | |||
} |
@@ -1,10 +1,11 @@ | |||
import "./tasks/variantsNorm.wdl" as variantsNorm | |||
import "./tasks/sister.wdl" as sister | |||
import "./tasks/mendelian.wdl" as mendelian | |||
import "./tasks/zipIndex.wdl" as zipIndex | |||
import "./tasks/VCFrename.wdl" as VCFrename | |||
import "./tasks/mergeSister.wdl" as mergeSister | |||
import "./tasks/reformVCF.wdl" as reformVCF | |||
import "./tasks/merge.wdl" as merge | |||
workflow {{ project_name }} { | |||
File inputSamplesFile | |||
Array[Array[File]] inputSamples = read_tsv(inputSamplesFile) | |||
@@ -12,102 +13,189 @@ workflow {{ project_name }} { | |||
String fasta | |||
String cluster_config | |||
String disk_size | |||
String mapper_caller | |||
scatter (sample in inputSamples){ | |||
scatter (quartet in inputSamples){ | |||
call variantsNorm.variantsNorm as LCL5variantsNorm{ | |||
input: | |||
vcf=sample[0], | |||
vcf=quartet[0], | |||
ref_dir=ref_dir, | |||
fasta=fasta, | |||
sampleName=sample[4], | |||
sampleName=quartet[4], | |||
cluster_config=cluster_config, | |||
disk_size=disk_size | |||
} | |||
call variantsNorm.variantsNorm as LCL6variantsNorm{ | |||
input: | |||
vcf=sample[1], | |||
vcf=quartet[1], | |||
ref_dir=ref_dir, | |||
fasta=fasta, | |||
sampleName=sample[5], | |||
sampleName=quartet[5], | |||
cluster_config=cluster_config, | |||
disk_size=disk_size | |||
} | |||
call variantsNorm.variantsNorm as LCL7variantsNorm{ | |||
input: | |||
vcf=sample[2], | |||
vcf=quartet[2], | |||
ref_dir=ref_dir, | |||
fasta=fasta, | |||
sampleName=sample[6], | |||
sampleName=quartet[6], | |||
cluster_config=cluster_config, | |||
disk_size=disk_size | |||
} | |||
call variantsNorm.variantsNorm as LCL8variantsNorm{ | |||
input: | |||
vcf=sample[3], | |||
vcf=quartet[3], | |||
ref_dir=ref_dir, | |||
fasta=fasta, | |||
sampleName=sample[7], | |||
sampleName=quartet[7], | |||
cluster_config=cluster_config, | |||
disk_size=disk_size | |||
} | |||
call sister.sister as sister{ | |||
call mendelian.mendelian as LCL5mendelian { | |||
input: | |||
LCL5_vcf=LCL5variantsNorm.normed_vcf, | |||
LCL6_vcf=LCL6variantsNorm.normed_vcf, | |||
child_vcf=LCL5variantsNorm.normed_vcf, | |||
LCL7_vcf=LCL7variantsNorm.normed_vcf, | |||
LCL8_vcf=LCL8variantsNorm.normed_vcf, | |||
LCL7_name=quartet[6], | |||
LCL8_name=quartet[7], | |||
child_name=quartet[4], | |||
ref_dir=ref_dir, | |||
fasta=fasta, | |||
family_name=sample[8], | |||
cluster_config=cluster_config, | |||
disk_size=disk_size | |||
} | |||
call mendelian.mendelian as mendelian { | |||
input: | |||
sister_vcf=sister.sister_consistent_vcf, | |||
call mendelian.mendelian as LCL6mendelian { | |||
input: | |||
child_vcf=LCL6variantsNorm.normed_vcf, | |||
LCL7_vcf=LCL7variantsNorm.normed_vcf, | |||
LCL8_vcf=LCL8variantsNorm.normed_vcf, | |||
LCL7_name=sample[6], | |||
LCL8_name=sample[7], | |||
family_name=sample[8], | |||
LCL7_name=quartet[6], | |||
LCL8_name=quartet[7], | |||
child_name=quartet[5], | |||
ref_dir=ref_dir, | |||
fasta=fasta, | |||
cluster_config=cluster_config, | |||
disk_size=disk_size | |||
} | |||
call zipIndex.zipIndex as zipIndex{ | |||
call zipIndex.zipIndex as LCL5zipIndex { | |||
input: | |||
vcf=LCL5mendelian.trio_vcf, | |||
sample="LCL5", | |||
family_name=quartet[8], | |||
cluster_config=cluster_config, | |||
disk_size=disk_size | |||
} | |||
call zipIndex.zipIndex as LCL6zipIndex { | |||
input: | |||
vcf=LCL6mendelian.trio_vcf, | |||
sample="LCL6", | |||
family_name=quartet[8], | |||
cluster_config=cluster_config, | |||
disk_size=disk_size | |||
} | |||
# Run VCFrename once per twin on the compressed + indexed trio VCF produced by
# the matching zipIndex call. Names come from the current `quartet` row:
# column 6 = father, column 7 = mother, column 8 = family, and column 4 (LCL5)
# or column 5 (LCL6) = child. The `child` string labels which twin the call is for.
# The renaming logic itself lives in ./tasks/VCFrename.wdl and is not shown here.
call VCFrename.VCFrename as LCL5VCFrename { | |||
input: | |||
trio_vcf_gz=LCL5zipIndex.vcf_gz, | |||
trio_vcf_idx=LCL5zipIndex.vcf_idx, | |||
mother_name=quartet[7], | |||
father_name=quartet[6], | |||
child_name=quartet[4], | |||
family_name=quartet[8], | |||
child="LCL5", | |||
cluster_config=cluster_config, | |||
disk_size=disk_size | |||
} | |||
# Same as above for the second twin: child name is taken from column 5.
call VCFrename.VCFrename as LCL6VCFrename { | |||
input: | |||
trio_vcf_gz=LCL6zipIndex.vcf_gz, | |||
trio_vcf_idx=LCL6zipIndex.vcf_idx, | |||
mother_name=quartet[7], | |||
father_name=quartet[6], | |||
child_name=quartet[5], | |||
family_name=quartet[8], | |||
child="LCL6", | |||
cluster_config=cluster_config, | |||
disk_size=disk_size | |||
} | |||
call mergeSister.mergeSister as mergeSister { | |||
input: | |||
mother_vcf=mendelian.mother_vcf, | |||
father_vcf=mendelian.father_vcf, | |||
twins_vcf=mendelian.twins_vcf, | |||
family_name=sample[8], | |||
LCL5_trio_vcf_gz=LCL5VCFrename.rename_trio_vcf_gz, | |||
LCL5_trio_vcf_idx=LCL5VCFrename.rename_trio_vcf_idx, | |||
LCL6_trio_vcf_gz=LCL6VCFrename.rename_trio_vcf_gz, | |||
LCL6_trio_vcf_idx=LCL6VCFrename.rename_trio_vcf_idx, | |||
family_name=quartet[8], | |||
cluster_config=cluster_config, | |||
disk_size=disk_size | |||
} | |||
call VCFrename.VCFrename as VCFrename{ | |||
input: | |||
mother_vcf_gz=zipIndex.mother_vcf_gz, | |||
father_vcf_gz=zipIndex.father_vcf_gz, | |||
twins_vcf_gz=zipIndex.twins_vcf_gz, | |||
mother_vcf_idx=zipIndex.mother_vcf_idx, | |||
father_vcf_idx=zipIndex.father_vcf_idx, | |||
twins_vcf_idx=zipIndex.twins_vcf_idx, | |||
mother_name=sample[7], | |||
father_name=sample[6], | |||
family_name=sample[8], | |||
call reformVCF.reformVCF as reformVCF { | |||
input: | |||
family_mendelian_info=mergeSister.family_mendelian_info, | |||
family_name=quartet[8], | |||
cluster_config=cluster_config, | |||
disk_size=disk_size | |||
} | |||
# Compress and index each of the four per-sample VCFs emitted by reformVCF
# (LCL5, LCL6, LCL7, LCL8), one zipIndex call per sample. `sample` and
# `family_name` (column 8 of the quartet row) determine the output file name.
call zipIndex.zipIndex as LCL5familyzipIndex { | |||
input: | |||
vcf=reformVCF.LCL5_family_info, | |||
sample='LCL5', | |||
family_name=quartet[8], | |||
cluster_config=cluster_config, | |||
disk_size=disk_size | |||
} | |||
call zipIndex.zipIndex as LCL6familyzipIndex { | |||
input: | |||
vcf=reformVCF.LCL6_family_info, | |||
sample='LCL6', | |||
family_name=quartet[8], | |||
cluster_config=cluster_config, | |||
disk_size=disk_size | |||
} | |||
call zipIndex.zipIndex as LCL7familyzipIndex { | |||
input: | |||
vcf=reformVCF.LCL7_family_info, | |||
sample='LCL7', | |||
family_name=quartet[8], | |||
cluster_config=cluster_config, | |||
disk_size=disk_size | |||
} | |||
call zipIndex.zipIndex as LCL8familyzipIndex { | |||
input: | |||
vcf=reformVCF.LCL8_family_info, | |||
sample='LCL8', | |||
family_name=quartet[8], | |||
cluster_config=cluster_config, | |||
disk_size=disk_size | |||
} | |||
} | |||
call merge.merge as merge { | |||
# One merge call per sample, fed by the compressed VCF and index from the
# matching *familyzipIndex call.
# NOTE(review): these calls appear after the scatter block's closing brace, so
# the referenced outputs are presumably gathered into arrays (one entry per
# quartet row) -- confirm against the input types declared in ./tasks/merge.wdl.
call merge.merge as LCL5merge { | |||
input: | |||
family_vcf_gz=LCL5familyzipIndex.vcf_gz, | |||
family_vcf_idx=LCL5familyzipIndex.vcf_idx, | |||
sample="LCL5", | |||
cluster_config=cluster_config, | |||
disk_size=disk_size | |||
} | |||
call merge.merge as LCL6merge { | |||
input: | |||
family_vcf_gz=LCL6familyzipIndex.vcf_gz, | |||
family_vcf_idx=LCL6familyzipIndex.vcf_idx, | |||
sample="LCL6", | |||
cluster_config=cluster_config, | |||
disk_size=disk_size | |||
} | |||
call merge.merge as LCL7merge { | |||
input: | |||
family_vcf_gz=LCL7familyzipIndex.vcf_gz, | |||
family_vcf_idx=LCL7familyzipIndex.vcf_idx, | |||
sample="LCL7", | |||
cluster_config=cluster_config, | |||
disk_size=disk_size | |||
} | |||
call merge.merge as LCL8merge { | |||
input: | |||
rename_mother_vcf_gz=VCFrename.rename_mother_vcf_gz, | |||
rename_mother_vcf_idx=VCFrename.rename_mother_vcf_idx, | |||
rename_father_vcf_gz=VCFrename.rename_father_vcf_gz, | |||
rename_father_vcf_idx=VCFrename.rename_father_vcf_idx, | |||
rename_twins_vcf_gz=VCFrename.rename_twins_vcf_gz, | |||
rename_twins_vcf_idx=VCFrename.rename_twins_vcf_idx, | |||
mapper_caller=mapper_caller, | |||
family_vcf_gz=LCL8familyzipIndex.vcf_gz, | |||
family_vcf_idx=LCL8familyzipIndex.vcf_idx, | |||
sample="LCL8", | |||
cluster_config=cluster_config, | |||
disk_size=disk_size | |||
} |