# import modules | |||||
import sys, argparse, os | |||||
import fileinput | |||||
import re | |||||
import pandas as pd | |||||
from operator import itemgetter | |||||
from collections import Counter | |||||
from itertools import islice | |||||
# ---------------------------------------------------------------------------
# Command-line arguments
# ---------------------------------------------------------------------------
parser = argparse.ArgumentParser(description="this script is to count voting number")
parser.add_argument('-vcf', '--multi_sample_vcf', type=str, help='The VCF file you want to count the voting number', required=True)
parser.add_argument('-dup', '--dup_list', type=str, help='Duplication list', required=True)
parser.add_argument('-sample', '--sample_name', type=str, help='which sample of quartet', required=True)
parser.add_argument('-prefix', '--prefix', type=str, help='Prefix of output file name', required=True)
args = parser.parse_args()
multi_sample_vcf = args.multi_sample_vcf
dup_list = args.dup_list
prefix = args.prefix
sample_name = args.sample_name

# Meta-information lines written verbatim at the top of the annotated VCF.
# The per-sample TWINS/TRIO5/TRIO6 fields carry a 0/1 value, so they are
# declared as single Integers: the VCF specification does not allow
# Type=Flag for FORMAT fields.
vcf_header = '''##fileformat=VCFv4.2
##fileDate=20191224
##source=high_confidence_calls_intergration(choppy app)
##reference=GRCh38.d1.vd1
##INFO=<ID=PCT,Number=1,Type=Float,Description="Percentage of votes">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=TWINS,Number=1,Type=Integer,Description="0 for sister consistent, 1 for sister inconsistent">
##FORMAT=<ID=TRIO5,Number=1,Type=Integer,Description="0 for trio consistent, 1 for trio inconsistent">
##FORMAT=<ID=TRIO6,Number=1,Type=Integer,Description="0 for trio consistent, 1 for trio inconsistent">
##contig=<ID=chr1,length=248956422>
##contig=<ID=chr2,length=242193529>
##contig=<ID=chr3,length=198295559>
##contig=<ID=chr4,length=190214555>
##contig=<ID=chr5,length=181538259>
##contig=<ID=chr6,length=170805979>
##contig=<ID=chr7,length=159345973>
##contig=<ID=chr8,length=145138636>
##contig=<ID=chr9,length=138394717>
##contig=<ID=chr10,length=133797422>
##contig=<ID=chr11,length=135086622>
##contig=<ID=chr12,length=133275309>
##contig=<ID=chr13,length=114364328>
##contig=<ID=chr14,length=107043718>
##contig=<ID=chr15,length=101991189>
##contig=<ID=chr16,length=90338345>
##contig=<ID=chr17,length=83257441>
##contig=<ID=chr18,length=80373285>
##contig=<ID=chr19,length=58617616>
##contig=<ID=chr20,length=64444167>
##contig=<ID=chr21,length=46709983>
##contig=<ID=chr22,length=50818468>
##contig=<ID=chrX,length=156040895>
'''

# ---------------------------------------------------------------------------
# Duplicated variant positions ("<chrom>_<pos>", one per line)
# ---------------------------------------------------------------------------
# An empty list is a legitimate input (no duplicated positions); pandas
# raises EmptyDataError on an empty file, so fall back to "no duplicates".
try:
    dup = pd.read_table(dup_list, header=None)
    var_dup = dup[0].tolist()
except pd.errors.EmptyDataError:
    dup = pd.DataFrame()
    var_dup = []

# ---------------------------------------------------------------------------
# Output file and VCF column header
# ---------------------------------------------------------------------------
file_name = prefix + '_annotated.vcf'
outfile = open(file_name, 'w')

# The 33 replicate columns (VCF columns 9-41), in input order.
# NOTE(review): these names are hard-coded with the LCL5 suffix even when
# -sample names another quartet member -- confirm this is intended.
_replicate_columns = [
    'Quartet_DNA_BGI_SEQ2000_BGI_1_20180518_LCL5',
    'Quartet_DNA_BGI_SEQ2000_BGI_2_20180530_LCL5',
    'Quartet_DNA_BGI_SEQ2000_BGI_3_20180530_LCL5',
    'Quartet_DNA_BGI_SEQ2000_WGE_1_20190402_LCL5',
    'Quartet_DNA_BGI_SEQ2000_WGE_2_20190402_LCL5',
    # The original header had a stray space after this name ("..._LCL5 \t"),
    # which made the column name differ from the real sample name.
    'Quartet_DNA_BGI_SEQ500_BGI_1_20180328_LCL5',
    'Quartet_DNA_BGI_SEQ500_BGI_2_20180328_LCL5',
    'Quartet_DNA_BGI_SEQ500_BGI_3_20180328_LCL5',
    'Quartet_DNA_ILM_Nova_ARD_1_20181108_LCL5',
    'Quartet_DNA_ILM_Nova_ARD_2_20181108_LCL5',
    'Quartet_DNA_ILM_Nova_ARD_3_20181108_LCL5',
    'Quartet_DNA_ILM_Nova_ARD_4_20190111_LCL5',
    'Quartet_DNA_ILM_Nova_ARD_5_20190111_LCL5',
    'Quartet_DNA_ILM_Nova_ARD_6_20190111_LCL5',
    'Quartet_DNA_ILM_Nova_BRG_1_20171024_LCL5',
    'Quartet_DNA_ILM_Nova_BRG_1_20180930_LCL5',
    'Quartet_DNA_ILM_Nova_BRG_2_20180930_LCL5',
    'Quartet_DNA_ILM_Nova_BRG_3_20180930_LCL5',
    'Quartet_DNA_ILM_Nova_GAC_1_20171025_LCL5',
    'Quartet_DNA_ILM_Nova_NVG_1_20171024_LCL5',
    'Quartet_DNA_ILM_Nova_WUX_1_20171024_LCL5',
    'Quartet_DNA_ILM_XTen_ARD_1_20170403_LCL5',
    'Quartet_DNA_ILM_XTen_ARD_2_20170403_LCL5',
    'Quartet_DNA_ILM_XTen_ARD_3_20170403_LCL5',
    'Quartet_DNA_ILM_XTen_NVG_1_20170329_LCL5',
    'Quartet_DNA_ILM_XTen_NVG_2_20170329_LCL5',
    'Quartet_DNA_ILM_XTen_NVG_3_20170329_LCL5',
    'Quartet_DNA_ILM_XTen_WUX_1_20170216_LCL5',
    'Quartet_DNA_ILM_XTen_WUX_2_20170216_LCL5',
    'Quartet_DNA_ILM_XTen_WUX_3_20170216_LCL5',
    'Quartet_DNA_ILM_XTen_WUX_4_20180703_LCL5',
    'Quartet_DNA_ILM_XTen_WUX_5_20180703_LCL5',
    'Quartet_DNA_ILM_XTen_WUX_6_20180703_LCL5',
]

# NOTE(review): the last column is "<sample>consensus" with no underscore,
# unlike "<sample>_pcr" / "<sample>_pcr-free"; kept as-is in case downstream
# tools rely on that exact name.
outputcolumn = '\t'.join(
    ['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT']
    + _replicate_columns
    + [sample_name + '_pcr', sample_name + '_pcr-free', sample_name + 'consensus']
) + '\n'

outfile.write(vcf_header)
outfile.write(outputcolumn)
#function | |||||
def vote_percentage(strings):
    """Return the share of the 33 replicate columns that carry a variant call.

    A column counts as "no call" when its genotype (the text before the first
    ':') is '.' once every '0/0' in the column has been rewritten to '.'.
    The share is (33 - no-calls) / 33, rounded to two decimals and returned
    as a string.
    """
    no_call = sum(
        1
        for column in strings
        if column.replace('0/0', '.').split(':')[0] == '.'
    )
    share = round((33 - no_call) / 33, 2)
    return str(share)
def decide_by_rep(strings):
    """Vote across the replicate columns of one sequencing site.

    The last five characters of each column are tallied first (a column that
    is just '.' tallies as '.'):

    * most common tail seen fewer than twice, or neither '1:1:1' nor '.'
      -> 'inconMen'
    * most common tail is '.' (seen at least twice) -> 'noInfo'
    * most common tail is '1:1:1' (seen at least twice) -> the most common
      genotype if it is seen at least twice, otherwise 'inconGT'.

    Genotypes are the text before the first ':' after every '.' in the column
    has been rewritten to '0/0'.
    """
    tail_votes = Counter([column[-5:] for column in strings])
    top_tail, top_tail_count = tail_votes.most_common(1)[0]

    if top_tail_count < 2:
        return 'inconMen'
    if top_tail == '.':
        return 'noInfo'
    if top_tail != '1:1:1':
        return 'inconMen'

    genotype_votes = Counter(
        [column.replace('.', '0/0').split(':')[0] for column in strings]
    )
    top_genotype, top_genotype_count = genotype_votes.most_common(1)[0]
    if top_genotype_count >= 2:
        return top_genotype
    return 'inconGT'
# VCF column indices of the PCR libraries and how they group into
# sequencing sites (each pair is a [start, stop) slice into _PCR_COLUMNS).
_PCR_COLUMNS = (9, 10, 11, 12, 14, 15, 16, 23, 27, 28, 29, 30, 31, 32,
                33, 34, 35, 36, 37, 38, 39, 40, 41)
_PCR_SITE_SLICES = ((0, 4), (4, 7), (7, 11), (11, 14), (14, 17), (17, 20), (20, 23))

# Same for the PCR-free libraries.
_PCR_FREE_COLUMNS = (13, 17, 18, 19, 20, 21, 22, 24, 25, 26)
_PCR_FREE_SITE_SLICES = ((0, 1), (1, 4), (4, 7), (7, 10))

# Per-site / per-library outcomes that are not genotypes.
_FAILURE_TAGS = ('inconGT', 'noInfo', 'inconMen', 'inconSequenceSite')


def _majority_call(site_calls, min_votes):
    """Return the most common site call if it has more than *min_votes* votes, else 'inconSequenceSite'."""
    call, votes = Counter(site_calls).most_common(1)[0]
    return call if votes > min_votes else 'inconSequenceSite'


def _library_consensus(columns, site_slices, min_votes):
    """Vote within each sequencing site via decide_by_rep, then across sites."""
    site_calls = [decide_by_rep(columns[start:stop]) for start, stop in site_slices]
    return _majority_call(site_calls, min_votes)


def _combine(pcr_consensus, pcr_free_consensus):
    """Return (consensus_call, FILTER value) for the PCR / PCR-free pair."""
    pcr_failed = pcr_consensus in _FAILURE_TAGS
    pcr_free_failed = pcr_free_consensus in _FAILURE_TAGS
    if pcr_failed or pcr_free_failed:
        return 'filtered', '.'
    # From here on both values are genotypes (possibly '0/0').
    if pcr_consensus == pcr_free_consensus:
        if pcr_consensus == '0/0':
            return 'confirm for parents', '.'
        return pcr_consensus, 'reproducible'
    if pcr_consensus == '0/0':
        return 'pcr-free-speicifc', '.'
    if pcr_free_consensus == '0/0':
        return 'pcr-speicifc', '.'
    # Two different non-reference genotypes.
    return 'filtered', '.'


def main():
    """Annotate every record of the merged VCF with its consensus votes.

    Reads ``multi_sample_vcf`` line by line, skips header lines, and writes
    each record to ``outfile`` followed by three extra columns: the PCR
    consensus, the PCR-free consensus and the combined call.  Records whose
    "<chrom>_<pos>" is in ``var_dup`` are passed through unchanged with
    '.', '.', 'dupVar' appended.  For all other records FILTER is set to
    'reproducible' or '.', and INFO to 'PCT=<vote percentage>'.
    """
    # Set membership is O(1); the list would be scanned for every record.
    duplicated = set(var_dup)

    for line in fileinput.input(multi_sample_vcf):
        if line.startswith('#'):
            continue
        line = line.strip()
        if not line:
            # A blank line has no CHROM/POS columns to index.
            continue
        strings = line.split('\t')

        variant_id = '_'.join([strings[0], strings[1]])
        if variant_id in duplicated:
            outfile.write('\t'.join(strings + ['.', '.', 'dupVar']) + '\n')
            continue

        # PCR libraries: 7 sites, more than 4 must agree.
        pcr = itemgetter(*_PCR_COLUMNS)(strings)
        pcr_consensus = _library_consensus(pcr, _PCR_SITE_SLICES, 4)

        # PCR-free libraries: 4 sites, more than 2 must agree.
        # The original code re-used the Counter built from the PCR sites here,
        # so this value was computed from PCR data; it now counts the PCR-free
        # sites.  The single SEQ2000 replicate is passed as a one-element
        # slice instead of a bare string (which decide_by_rep would iterate
        # character by character).
        # NOTE(review): decide_by_rep needs two agreeing replicates, so a
        # one-replicate site always yields 'inconMen' -- confirm intended.
        pcr_free = itemgetter(*_PCR_FREE_COLUMNS)(strings)
        pcr_free_consensus = _library_consensus(pcr_free, _PCR_FREE_SITE_SLICES, 2)

        consensus_call, strings[6] = _combine(pcr_consensus, pcr_free_consensus)
        strings[7] = 'PCT=' + vote_percentage(strings[9:])

        outfile.write(
            '\t'.join(strings + [pcr_consensus, pcr_free_consensus, consensus_call]) + '\n'
        )


if __name__ == '__main__':
    main()
    # Flush and release the output handle opened at module level.
    outfile.close()
# default columns, which will be included in the classifier
vcfheader = '''##fileformat=VCFv4.2 | vcfheader = '''##fileformat=VCFv4.2 | ||||
##FILTER=<ID=PASS,Description="Voted by at least two replicates, six callers and two sequencing sites"> | |||||
##FILTER=<ID=PASS,Description="the same genotype between twin sister and mendelian consistent in 578 and 678"> | |||||
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype"> | ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype"> | ||||
##INFO=<ID=sister,Number=0,Type=Flag,Description="0 for sister consistent, 1 for sister inconsistent"> | |||||
##INFO=<ID=trioLCL5,Number=0,Type=Flag,Description="0 for trio consistent, 1 for trio inconsistent"> | |||||
##INFO=<ID=trioLCL6,Number=0,Type=Flag,Description="0 for trio consistent, 1 for trio inconsistent"> | |||||
##FORMAT=<ID=TWINS,Number=0,Type=Flag,Description="0 for sister consistent, 1 for sister inconsistent"> | |||||
##FORMAT=<ID=TRIO5,Number=0,Type=Flag,Description="0 for trio consistent, 1 for trio inconsistent"> | |||||
##FORMAT=<ID=TRIO6,Number=0,Type=Flag,Description="0 for trio consistent, 1 for trio inconsistent"> | |||||
##contig=<ID=chr1,length=248956422> | ##contig=<ID=chr1,length=248956422> | ||||
##contig=<ID=chr2,length=242193529> | ##contig=<ID=chr2,length=242193529> | ||||
##contig=<ID=chr3,length=198295559> | ##contig=<ID=chr3,length=198295559> | ||||
pass | pass | ||||
# sister | # sister | ||||
if strings[11] == strings[14]: | if strings[11] == strings[14]: | ||||
info = "sister=1" | |||||
add_format = ":1" | |||||
else: | else: | ||||
info = "sister=0" | |||||
add_format = ":0" | |||||
# trioLCL5 | # trioLCL5 | ||||
if strings[15] == 'MD=1': | if strings[15] == 'MD=1': | ||||
info = info + ";trioLCL5=1" | |||||
add_format = add_format + ":1" | |||||
else: | else: | ||||
info = info + ";trioLCL5=0" | |||||
add_format = add_format + ":0" | |||||
# trioLCL6 | # trioLCL6 | ||||
if strings[7] == 'MD=1': | if strings[7] == 'MD=1': | ||||
info = info + ";trioLCL6=1" | |||||
add_format = add_format + ":1" | |||||
else: | else: | ||||
info = info + ";trioLCL6=0" | |||||
add_format = add_format + ":0" | |||||
# filter | |||||
if (strings[11] == strings[14]) and (strings[15] == 'MD=1') and (strings[7] == 'MD=1'): | |||||
strings[6] = 'PASS' | |||||
else: | |||||
strings[6] = '.' | |||||
# output LCL5 | # output LCL5 | ||||
LCL5outLine = strings[0]+'\t'+strings[1]+'\t'+strings[2]+'\t'+strings[3]+'\t'+strings[4]+'\t'+strings[5]+'\t'+strings[6]+'\t'+ info +'\t'+ strings[8] + '\t' + strings[14] + '\n' | |||||
LCL5outLine = strings[0]+'\t'+strings[1]+'\t'+strings[2]+'\t'+strings[3]+'\t'+strings[4]+'\t'+'.'+'\t'+strings[6]+'\t'+ '.' +'\t'+ 'GT:TWINS:TRIO5:TRIO6' + '\t' + strings[14] + add_format + '\n' | |||||
LCL5file.write(LCL5outLine) | LCL5file.write(LCL5outLine) | ||||
# output LCL6 | # output LCL6 | ||||
LCL6outLine = strings[0]+'\t'+strings[1]+'\t'+strings[2]+'\t'+strings[3]+'\t'+strings[4]+'\t'+strings[5]+'\t'+strings[6]+'\t'+ info +'\t'+ strings[8] + '\t' + strings[11] + '\n' | |||||
LCL6outLine = strings[0]+'\t'+strings[1]+'\t'+strings[2]+'\t'+strings[3]+'\t'+strings[4]+'\t'+'.'+'\t'+strings[6]+'\t'+ '.' +'\t'+ 'GT:TWINS:TRIO5:TRIO6' + '\t' + strings[11] + add_format + '\n' | |||||
LCL6file.write(LCL6outLine) | LCL6file.write(LCL6outLine) | ||||
# output LCL7 | # output LCL7 | ||||
LCL7outLine = strings[0]+'\t'+strings[1]+'\t'+strings[2]+'\t'+strings[3]+'\t'+strings[4]+'\t'+strings[5]+'\t'+strings[6]+'\t'+ info +'\t'+ strings[8] + '\t' + strings[10] + '\n' | |||||
LCL7outLine = strings[0]+'\t'+strings[1]+'\t'+strings[2]+'\t'+strings[3]+'\t'+strings[4]+'\t'+'.'+'\t'+strings[6]+'\t'+ '.' +'\t'+ 'GT:TWINS:TRIO5:TRIO6' + '\t' + strings[10] + add_format + '\n' | |||||
LCL7file.write(LCL7outLine) | LCL7file.write(LCL7outLine) | ||||
# output LCL8 | # output LCL8 | ||||
LCL8outLine = strings[0]+'\t'+strings[1]+'\t'+strings[2]+'\t'+strings[3]+'\t'+strings[4]+'\t'+strings[5]+'\t'+strings[6]+'\t'+ info +'\t'+ strings[8] + '\t' + strings[9] + '\n' | |||||
LCL8outLine = strings[0]+'\t'+strings[1]+'\t'+strings[2]+'\t'+strings[3]+'\t'+strings[4]+'\t'+'.'+'\t'+strings[6]+'\t'+ '.' +'\t'+ 'GT:TWINS:TRIO5:TRIO6' + '\t' + strings[9] + add_format + '\n' | |||||
LCL8file.write(LCL8outLine) | LCL8file.write(LCL8outLine) | ||||
# output family | # output family | ||||
familyoutLine = strings[0]+'\t'+strings[1]+'\t'+strings[2]+'\t'+strings[3]+'\t'+strings[4]+'\t'+strings[5]+'\t'+strings[6]+'\t'+ info +'\t'+ strings[8] + '\t' + strings[14] + '\t' + strings[11] + '\t' + strings[10] + '\t' + strings[9] + '\n' | |||||
familyoutLine = strings[0]+'\t'+strings[1]+'\t'+strings[2]+'\t'+strings[3]+'\t'+strings[4]+'\t'+ '.'+'\t'+strings[6]+'\t'+ '.' +'\t'+ 'GT:TWINS:TRIO5:TRIO6' + '\t' + strings[14] + add_format +'\t' + strings[11] + add_format + '\t' + strings[10] + add_format +'\t' + strings[9] + add_format + '\n' | |||||
familyfile.write(familyoutLine) | familyfile.write(familyoutLine) | ||||
"{{ project_name }}.LCL7merge.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", | "{{ project_name }}.LCL7merge.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", | ||||
"{{ project_name }}.fasta": "GRCh38.d1.vd1.fa", | "{{ project_name }}.fasta": "GRCh38.d1.vd1.fa", | ||||
"{{ project_name }}.LCL6familyzipIndex.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", | "{{ project_name }}.LCL6familyzipIndex.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", | ||||
"{{ project_name }}.LCL7votes.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/high_confidence_call:v1.1", | |||||
"{{ project_name }}.LCL6votes.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/high_confidence_call:v1.1", | |||||
"{{ project_name }}.LCL5VCFrename.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", | "{{ project_name }}.LCL5VCFrename.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", | ||||
"{{ project_name }}.LCL6mendelian.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/vbt:v1.1", | "{{ project_name }}.LCL6mendelian.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/vbt:v1.1", | ||||
"{{ project_name }}.mergeSister.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", | "{{ project_name }}.mergeSister.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", | ||||
"{{ project_name }}.LCL6merge.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", | "{{ project_name }}.LCL6merge.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", | ||||
"{{ project_name }}.LCL6variantsNorm.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/bcftools:v1.9", | "{{ project_name }}.LCL6variantsNorm.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/bcftools:v1.9", | ||||
"{{ project_name }}.LCL6zipIndex.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", | "{{ project_name }}.LCL6zipIndex.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", | ||||
"{{ project_name }}.LCL5votes.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/high_confidence_call:v1.1", | |||||
"{{ project_name }}.LCL7familyzipIndex.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", | "{{ project_name }}.LCL7familyzipIndex.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", | ||||
"{{ project_name }}.LCL5familyzipIndex.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", | "{{ project_name }}.LCL5familyzipIndex.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", | ||||
"{{ project_name }}.LCL6VCFrename.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", | "{{ project_name }}.LCL6VCFrename.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", | ||||
"{{ project_name }}.LCL5merge.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", | "{{ project_name }}.LCL5merge.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", | ||||
"{{ project_name }}.LCL8votes.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/high_confidence_call:v1.1", | |||||
"{{ project_name }}.reformVCF.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/high_confidence_call:v1.1", | "{{ project_name }}.reformVCF.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/high_confidence_call:v1.1", | ||||
"{{ project_name }}.LCL5zipIndex.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", | "{{ project_name }}.LCL5zipIndex.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", | ||||
"{{ project_name }}.cluster_config": "OnDemand bcs.a2.xlarge img-ubuntu-vpc", | "{{ project_name }}.cluster_config": "OnDemand bcs.a2.xlarge img-ubuntu-vpc", | ||||
"{{ project_name }}.ref_dir": "oss://chinese-quartet/quartet-storage-data/reference_data/" | "{{ project_name }}.ref_dir": "oss://chinese-quartet/quartet-storage-data/reference_data/" | ||||
} | } | ||||
# Keep only chr1-chr22 and chrX records of a VCF, then run `bcftools norm`
# against the reference FASTA found in ref_dir.
task indelNorm {
    File vcf
    File ref_dir
    String fasta
    String sampleName
    String docker
    String cluster_config
    String disk_size

    command <<<
        # Header lines are those that START with '#'; anchoring the pattern
        # keeps records that merely contain a '#' in the body.
        grep '^#' ${vcf} > header
        grep -v '^#' ${vcf} > body
        # -w makes '^chr1' match only the whole contig name chr1 (not chr10 or
        # chr1_..._random).  The pattern must not end in a backslash: grep
        # rejects it with "Trailing backslash".
        grep -w '^chr1\|^chr2\|^chr3\|^chr4\|^chr5\|^chr6\|^chr7\|^chr8\|^chr9\|^chr10\|^chr11\|^chr12\|^chr13\|^chr14\|^chr15\|^chr16\|^chr17\|^chr18\|^chr19\|^chr20\|^chr21\|^chr22\|^chrX' body > body.filtered
        cat header body.filtered > ${sampleName}.filtered.vcf
        /opt/hall-lab/bcftools-1.9/bin/bcftools norm -f ${ref_dir}/${fasta} ${sampleName}.filtered.vcf > ${sampleName}.normed.vcf
    >>>

    runtime {
        docker: docker
        cluster: cluster_config
        systemDisk: "cloud_ssd 40"
        dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
    }

    output {
        File normed_vcf = "${sampleName}.normed.vcf"
    }
}
rtg vcfmerge --force-merge-all --no-gzip -o ${sample}.merged.vcf ${sep=" " family_vcf_gz} | rtg vcfmerge --force-merge-all --no-gzip -o ${sample}.merged.vcf ${sep=" " family_vcf_gz} | ||||
cat ${sample}.merged.vcf | grep -v '#' | cut -f1-2 | sed s'/\t/_/g' | sort | uniq -c | sed 's/\s\+/\t/g' | awk '{ if ($1 != 1) { print } }' | cut -f3 > ${sample}.vcf_dup.txt | |||||
>>> | >>> | ||||
runtime { | runtime { | ||||
} | } | ||||
output { | output { | ||||
File merged_vcf = "${sample}.merged.vcf" | File merged_vcf = "${sample}.merged.vcf" | ||||
File vcf_dup = "${sample}.vcf_dup.txt" | |||||
} | } | ||||
} | } |
# Merge the per-member VCF sets with `rtg vcfmerge` and run the consensus
# voting script on the merged family VCF.
task votes {
    Array[File] mother_vcf_gz
    Array[File] mother_vcf_idx
    Array[File] father_vcf_gz
    Array[File] father_vcf_idx
    Array[File] twins_vcf_gz
    Array[File] twins_vcf_idx
    File merged_vcf
    # Declared as File (not String) because the command reads this file.
    File vcf_dup
    String sample
    String prefix
    String docker
    String cluster_config
    String disk_size

    command <<<
        rtg vcfmerge --force-merge-all --no-gzip -o LCL8.sister.consistent.merged.vcf ${sep=" " mother_vcf_gz}
        rtg vcfmerge --force-merge-all --no-gzip -o LCL7.sister.consistent.merged.vcf ${sep=" " father_vcf_gz}
        # Output name must match twins_merged_vcf in the output section below.
        rtg vcfmerge --force-merge-all --no-gzip -o Twins.sister.consistent.merged.vcf ${sep=" " twins_vcf_gz}
        python /opt/high_confidence_call_vote.py -vcf ${merged_vcf} -dup ${vcf_dup} -sample ${sample} -prefix ${prefix}
    >>>

    runtime {
        dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
    }

    output {
        File mother_merged_vcf = "LCL8.sister.consistent.merged.vcf"
        File father_merged_vcf = "LCL7.sister.consistent.merged.vcf"
        File twins_merged_vcf = "Twins.sister.consistent.merged.vcf"
        File annotated_vcf = "${prefix}_annotated.vcf"
    }
}
import "./tasks/mergeSister.wdl" as mergeSister | import "./tasks/mergeSister.wdl" as mergeSister | ||||
import "./tasks/reformVCF.wdl" as reformVCF | import "./tasks/reformVCF.wdl" as reformVCF | ||||
import "./tasks/merge.wdl" as merge | import "./tasks/merge.wdl" as merge | ||||
import "./tasks/votes.wdl" as votes | |||||
workflow {{ project_name }} { | workflow {{ project_name }} { | ||||
File inputSamplesFile | File inputSamplesFile | ||||
cluster_config=cluster_config, | cluster_config=cluster_config, | ||||
disk_size=disk_size | disk_size=disk_size | ||||
} | } | ||||
call votes.votes as LCL5votes{ | |||||
input: | |||||
merged_vcf=LCL5merge.merged_vcf, | |||||
vcf_dup=LCL5merge.vcf_dup, | |||||
sample='LCL5', | |||||
prefix='LCL5_consensus', | |||||
cluster_config=cluster_config, | |||||
disk_size=disk_size | |||||
} | |||||
call merge.merge as LCL6merge { | call merge.merge as LCL6merge { | ||||
input: | input: | ||||
family_vcf_gz=LCL6familyzipIndex.vcf_gz, | family_vcf_gz=LCL6familyzipIndex.vcf_gz, | ||||
cluster_config=cluster_config, | cluster_config=cluster_config, | ||||
disk_size=disk_size | disk_size=disk_size | ||||
} | } | ||||
call votes.votes as LCL6votes { | |||||
input: | |||||
merged_vcf=LCL6merge.merged_vcf, | |||||
vcf_dup=LCL6merge.vcf_dup, | |||||
sample='LCL6', | |||||
prefix='LCL6_consensus', | |||||
cluster_config=cluster_config, | |||||
disk_size=disk_size | |||||
} | |||||
call merge.merge as LCL7merge { | call merge.merge as LCL7merge { | ||||
input: | input: | ||||
family_vcf_gz=LCL7familyzipIndex.vcf_gz, | family_vcf_gz=LCL7familyzipIndex.vcf_gz, | ||||
cluster_config=cluster_config, | cluster_config=cluster_config, | ||||
disk_size=disk_size | disk_size=disk_size | ||||
} | } | ||||
call votes.votes as LCL7votes { | |||||
input: | |||||
merged_vcf=LCL7merge.merged_vcf, | |||||
vcf_dup=LCL7merge.vcf_dup, | |||||
sample='LCL7', | |||||
prefix='LCL7_consensus', | |||||
cluster_config=cluster_config, | |||||
disk_size=disk_size | |||||
} | |||||
call merge.merge as LCL8merge { | call merge.merge as LCL8merge { | ||||
input: | input: | ||||
family_vcf_gz=LCL8familyzipIndex.vcf_gz, | family_vcf_gz=LCL8familyzipIndex.vcf_gz, | ||||
cluster_config=cluster_config, | cluster_config=cluster_config, | ||||
disk_size=disk_size | disk_size=disk_size | ||||
} | } | ||||
call votes.votes as LCL8votes { | |||||
input: | |||||
merged_vcf=LCL8merge.merged_vcf, | |||||
vcf_dup=LCL8merge.vcf_dup, | |||||
sample='LCL8', | |||||
prefix='LCL8_consensus', | |||||
cluster_config=cluster_config, | |||||
disk_size=disk_size | |||||
} | |||||
} | } |