from __future__ import division
import pandas as pd
import sys, argparse, os
import fileinput
import re
# input arguments
parser = argparse.ArgumentParser(description="This script produces the final high-confidence calls and the information of all replicates")
parser.add_argument('-vcfInfo', '--vcfInfo', type=str, help='The txt file of variant information, named as prefix_variant_quality_location.txt', required=True)
parser.add_argument('-mendelianInfo', '--mendelianInfo', type=str, help='The merged Mendelian information of all samples', required=True)
parser.add_argument('-prefix', '--prefix', type=str, help='The prefix of output filenames', required=True)
parser.add_argument('-sample', '--sample_name', type=str, help='Which sample of the Quartet', required=True)
args = parser.parse_args()
vcfInfo = args.vcfInfo
mendelianInfo = args.mendelianInfo
prefix = args.prefix
sample_name = args.sample_name
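# Example invocation (illustrative only; the script and file names below are placeholders, not from the source):
#   python this_script.py -vcfInfo LCL5_variant_quality_location.txt -mendelianInfo LCL5_mendelian_merged.txt -prefix LCL5 -sample LCL5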
vcf_header = '''##fileformat=VCFv4.2
##fileDate=20200331
##source=high_confidence_calls_integration(choppy app)
##reference=GRCh38.d1.vd1
##INFO=<ID=location,Number=1,Type=String,Description="Repeat region">
##INFO=<ID=DPCT,Number=1,Type=Float,Description="Percentage of detected votes">
##INFO=<ID=VPCT,Number=1,Type=Float,Description="Percentage of consensus votes">
##INFO=<ID=FPCT,Number=1,Type=Float,Description="Percentage of mendelian consistent votes">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Depth">
##FORMAT=<ID=AF,Number=1,Type=Float,Description="Allele frequency">
##FORMAT=<ID=GQ,Number=1,Type=Float,Description="Genotype quality">
##FORMAT=<ID=MQ,Number=1,Type=Float,Description="Mapping quality">
##contig=<ID=chr1,length=248956422>
##contig=<ID=chr2,length=242193529>
##contig=<ID=chr3,length=198295559>
##contig=<ID=chr4,length=190214555>
##contig=<ID=chr5,length=181538259>
##contig=<ID=chr6,length=170805979>
##contig=<ID=chr7,length=159345973>
##contig=<ID=chr8,length=145138636>
##contig=<ID=chr9,length=138394717>
##contig=<ID=chr10,length=133797422>
##contig=<ID=chr11,length=135086622>
##contig=<ID=chr12,length=133275309>
##contig=<ID=chr13,length=114364328>
##contig=<ID=chr14,length=107043718>
##contig=<ID=chr15,length=101991189>
##contig=<ID=chr16,length=90338345>
##contig=<ID=chr17,length=83257441>
##contig=<ID=chr18,length=80373285>
##contig=<ID=chr19,length=58617616>
##contig=<ID=chr20,length=64444167>
##contig=<ID=chr21,length=46709983>
##contig=<ID=chr22,length=50818468>
##contig=<ID=chrX,length=156040895>
'''
vcf_header_all_sample = '''##fileformat=VCFv4.2
##fileDate=20200331
##reference=GRCh38.d1.vd1
##INFO=<ID=location,Number=1,Type=String,Description="Repeat region">
##INFO=<ID=DPCT,Number=1,Type=Float,Description="Percentage of detected votes">
##INFO=<ID=VPCT,Number=1,Type=Float,Description="Percentage of consensus votes">
##INFO=<ID=FPCT,Number=1,Type=Float,Description="Percentage of mendelian consistent votes">
##INFO=<ID=ALL_ALT,Number=1,Type=Float,Description="Sum of alternative reads of all samples">
##INFO=<ID=ALL_DP,Number=1,Type=Float,Description="Sum of depth of all samples">
##INFO=<ID=ALL_AF,Number=1,Type=Float,Description="Allele frequency of net alternative reads and net depth">
##INFO=<ID=GQ_MEAN,Number=1,Type=Float,Description="Mean of genotype quality of all samples">
##INFO=<ID=MQ_MEAN,Number=1,Type=Float,Description="Mean of mapping quality of all samples">
##INFO=<ID=PCR,Number=1,Type=String,Description="Consensus of PCR votes">
##INFO=<ID=PCR_FREE,Number=1,Type=String,Description="Consensus of PCR-free votes">
##INFO=<ID=CONSENSUS,Number=1,Type=String,Description="Consensus calls">
##INFO=<ID=CONSENSUS_SEQ,Number=1,Type=String,Description="Consensus sequence">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=DP,Number=1,Type=String,Description="Depth">
##FORMAT=<ID=AF,Number=1,Type=String,Description="Allele frequency">
##FORMAT=<ID=GQ,Number=1,Type=String,Description="Genotype quality">
##FORMAT=<ID=MQ,Number=1,Type=String,Description="Mapping quality">
##FORMAT=<ID=TWINS,Number=1,Type=String,Description="1 is twins shared, 0 is twins discordant">
##FORMAT=<ID=TRIO5,Number=1,Type=String,Description="1 is LCL7, LCL8 and LCL5 mendelian consistent, 0 is mendelian violation">
##FORMAT=<ID=TRIO6,Number=1,Type=String,Description="1 is LCL7, LCL8 and LCL6 mendelian consistent, 0 is mendelian violation">
##contig=<ID=chr1,length=248956422>
##contig=<ID=chr2,length=242193529>
##contig=<ID=chr3,length=198295559>
##contig=<ID=chr4,length=190214555>
##contig=<ID=chr5,length=181538259>
##contig=<ID=chr6,length=170805979>
##contig=<ID=chr7,length=159345973>
##contig=<ID=chr8,length=145138636>
##contig=<ID=chr9,length=138394717>
##contig=<ID=chr10,length=133797422>
##contig=<ID=chr11,length=135086622>
##contig=<ID=chr12,length=133275309>
##contig=<ID=chr13,length=114364328>
##contig=<ID=chr14,length=107043718>
##contig=<ID=chr15,length=101991189>
##contig=<ID=chr16,length=90338345>
##contig=<ID=chr17,length=83257441>
##contig=<ID=chr18,length=80373285>
##contig=<ID=chr19,length=58617616>
##contig=<ID=chr20,length=64444167>
##contig=<ID=chr21,length=46709983>
##contig=<ID=chr22,length=50818468>
##contig=<ID=chrX,length=156040895>
'''
# output file
file_name = prefix + '_benchmarking_calls.vcf'
outfile = open(file_name,'w')
all_sample_file_name = prefix + '_all_sample_information.vcf'
all_sample_outfile = open(all_sample_file_name, 'w')
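# Two outputs: <prefix>_benchmarking_calls.vcf holds the single-sample high-confidence calls,
# while <prefix>_all_sample_information.vcf keeps the per-replicate genotype and Mendelian details.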
# write VCF
outputcolumn = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t' + sample_name + '_high_confidence_calls\n'
outfile.write(vcf_header)
outfile.write(outputcolumn)
outputcolumn_all_sample = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t'+ \
    'Quartet_DNA_BGI_SEQ2000_BGI_1_20180518\tQuartet_DNA_BGI_SEQ2000_BGI_2_20180530\tQuartet_DNA_BGI_SEQ2000_BGI_3_20180530\t' + \
    'Quartet_DNA_BGI_T7_WGE_1_20191105\tQuartet_DNA_BGI_T7_WGE_2_20191105\tQuartet_DNA_BGI_T7_WGE_3_20191105\t' + \
    'Quartet_DNA_ILM_Nova_ARD_1_20181108\tQuartet_DNA_ILM_Nova_ARD_2_20181108\tQuartet_DNA_ILM_Nova_ARD_3_20181108\t' + \
    'Quartet_DNA_ILM_Nova_ARD_4_20190111\tQuartet_DNA_ILM_Nova_ARD_5_20190111\tQuartet_DNA_ILM_Nova_ARD_6_20190111\t' + \
    'Quartet_DNA_ILM_Nova_BRG_1_20180930\tQuartet_DNA_ILM_Nova_BRG_2_20180930\tQuartet_DNA_ILM_Nova_BRG_3_20180930\t' + \
    'Quartet_DNA_ILM_Nova_WUX_1_20190917\tQuartet_DNA_ILM_Nova_WUX_2_20190917\tQuartet_DNA_ILM_Nova_WUX_3_20190917\t' + \
    'Quartet_DNA_ILM_XTen_ARD_1_20170403\tQuartet_DNA_ILM_XTen_ARD_2_20170403\tQuartet_DNA_ILM_XTen_ARD_3_20170403\t' + \
    'Quartet_DNA_ILM_XTen_NVG_1_20170329\tQuartet_DNA_ILM_XTen_NVG_2_20170329\tQuartet_DNA_ILM_XTen_NVG_3_20170329\t' + \
    'Quartet_DNA_ILM_XTen_WUX_1_20170216\tQuartet_DNA_ILM_XTen_WUX_2_20170216\tQuartet_DNA_ILM_XTen_WUX_3_20170216\n'
all_sample_outfile.write(vcf_header_all_sample)
all_sample_outfile.write(outputcolumn_all_sample)
# input files
vcf_info = pd.read_table(vcfInfo)
mendelian_info = pd.read_table(mendelianInfo)
merged_df = pd.merge(vcf_info, mendelian_info, how='outer', left_on=['#CHROM','POS'], right_on = ['#CHROM','POS'])
merged_df = merged_df.fillna('.')
# function
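# single_sample_format merges one replicate's VCF-derived FORMAT string (strings_x) with its
# Mendelian annotation string (strings_y) into GT:DP:AF:GQ:MQ:TWINS:TRIO5:TRIO6.
# A replicate missing from the VCF table but present in the Mendelian table is reported as 0/0;
# a replicate missing from the Mendelian table keeps '.' for TWINS/TRIO5/TRIO6.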
def single_sample_format(format_x,strings_x,strings_y):
    gt = '.'
    dp = '.'
    af = '.'
    gq = '.'
    mq = '.'
    twins = '.'
    trio5 = '.'
    trio6 = '.'
    # GT:DP:AF:GQ:MQ:TWINS:TRIO5:TRIO6
    # strings_x
    format_strings = format_x.split(':')
    if (strings_x == '.') and (strings_y != '.'):
        element_strings_y = str(strings_y).split(':')
        gt = '0/0'
        dp = '.'
        af = '.'
        gq = '.'
        mq = '.'
        twins = element_strings_y[1]
        trio5 = element_strings_y[2]
        trio6 = element_strings_y[3]
    elif (strings_x != '.') and (strings_y == '.'):
        element_strings_x = strings_x.split(':')
        formatDict = dict(zip(format_strings, element_strings_x))
        gt = formatDict['GT']
        dp = formatDict['DP']
        af = formatDict['AF']
        gq = formatDict['GQ']
        mq = formatDict['MQ']
        twins = '.'
        trio5 = '.'
        trio6 = '.'
    elif (strings_x != '.') and (strings_y != '.'):
        element_strings_y = str(strings_y).split(':')
        element_strings_x = strings_x.split(':')
        formatDict = dict(zip(format_strings, element_strings_x))
        gt = formatDict['GT']
        dp = formatDict['DP']
        af = formatDict['AF']
        gq = formatDict['GQ']
        mq = formatDict['MQ']
        twins = element_strings_y[1]
        trio5 = element_strings_y[2]
        trio6 = element_strings_y[3]
    else:
        pass
    merged_format = gt + ':' + dp + ':' + af + ':' + gq + ':' + mq + ':' + twins + ':' + trio5 + ':' + trio6
    return(merged_format)
#
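# Each merged row carries 27 replicate columns from the VCF table (positions 10-36) and 27 from the
# Mendelian table (positions 50-76); a row is only written out when the two tables are missing the
# same number of replicates, i.e. vcf_count == mendelian_count.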
replicate_columns = ['Quartet_DNA_BGI_SEQ2000_BGI_1_20180518_LCL5', 'Quartet_DNA_BGI_SEQ2000_BGI_2_20180530_LCL5', 'Quartet_DNA_BGI_SEQ2000_BGI_3_20180530_LCL5',
    'Quartet_DNA_BGI_T7_WGE_1_20191105_LCL5', 'Quartet_DNA_BGI_T7_WGE_2_20191105_LCL5', 'Quartet_DNA_BGI_T7_WGE_3_20191105_LCL5',
    'Quartet_DNA_ILM_Nova_ARD_1_20181108_LCL5', 'Quartet_DNA_ILM_Nova_ARD_2_20181108_LCL5', 'Quartet_DNA_ILM_Nova_ARD_3_20181108_LCL5',
    'Quartet_DNA_ILM_Nova_ARD_4_20190111_LCL5', 'Quartet_DNA_ILM_Nova_ARD_5_20190111_LCL5', 'Quartet_DNA_ILM_Nova_ARD_6_20190111_LCL5',
    'Quartet_DNA_ILM_Nova_BRG_1_20180930_LCL5', 'Quartet_DNA_ILM_Nova_BRG_2_20180930_LCL5', 'Quartet_DNA_ILM_Nova_BRG_3_20180930_LCL5',
    'Quartet_DNA_ILM_Nova_WUX_1_20190917_LCL5', 'Quartet_DNA_ILM_Nova_WUX_2_20190917_LCL5', 'Quartet_DNA_ILM_Nova_WUX_3_20190917_LCL5',
    'Quartet_DNA_ILM_XTen_ARD_1_20170403_LCL5', 'Quartet_DNA_ILM_XTen_ARD_2_20170403_LCL5', 'Quartet_DNA_ILM_XTen_ARD_3_20170403_LCL5',
    'Quartet_DNA_ILM_XTen_NVG_1_20170329_LCL5', 'Quartet_DNA_ILM_XTen_NVG_2_20170329_LCL5', 'Quartet_DNA_ILM_XTen_NVG_3_20170329_LCL5',
    'Quartet_DNA_ILM_XTen_WUX_1_20170216_LCL5', 'Quartet_DNA_ILM_XTen_WUX_2_20170216_LCL5', 'Quartet_DNA_ILM_XTen_WUX_3_20170216_LCL5']
for row in merged_df.itertuples():
    vcf_count = row[10:37].count('.')
    mendelian_count = row[50:77].count('.')
    if vcf_count == mendelian_count:
        info = 'location=' + str(row.location) + ';' + str(row.INFO_y)
        if row.FILTER_y == 'reproducible':
            ref = row.DP - row._42
            FORMAT = row[79] + ':' + str(int(ref)) + ',' + str(int(row._42)) + ':' + str(int(row.DP)) + ':' + str(round(row.AF,2)) + ':' + str(round(row.GQ,2)) + ':' + str(round(row.MQ,2))
            outline1 = str(row._1) + '\t' + str(row.POS) + '\t' + str(row.ID_x) + '\t' + str(row.REF_y) + '\t' + str(row[80]) + '\t' + '.' + '\t' + '.' + '\t' + str(info) + '\t' + 'GT:AD:DP:AF:GQ:MQ' + '\t' + str(FORMAT) + '\n'
            outfile.write(outline1)
        else:
            pass
        if row.INFO_x != '.':
            if row.AF == '.':
                info = 'location=' + str(row.location) + ';' + str(row.INFO_y) + ';' + 'ALL_ALT=' + str(int(row._42)) + ';' + 'ALL_DP=' + str(int(row.DP)) + ';' + 'ALL_AF=' + 'NA' + ';' + 'GQ_MEAN=' + str(row.GQ) + ';' + 'MQ_MEAN=' + str(row.MQ) + ';' + 'PCR=' + str(row[77]) + ';' + 'PCR_FREE=' + str(row[78]) + ';' + 'CONSENSUS=' + str(row[79]) + ';' + 'CONSENSUS_SEQ=' + str(row[80])
            else:
                info = 'location=' + str(row.location) + ';' + str(row.INFO_y) + ';' + 'ALL_ALT=' + str(int(row._42)) + ';' + 'ALL_DP=' + str(int(row.DP)) + ';' + 'ALL_AF=' + str(round(float(row.AF),2)) + ';' + 'GQ_MEAN=' + str(row.GQ) + ';' + 'MQ_MEAN=' + str(row.MQ) + ';' + 'PCR=' + str(row[77]) + ';' + 'PCR_FREE=' + str(row[78]) + ';' + 'CONSENSUS=' + str(row[79]) + ';' + 'CONSENSUS_SEQ=' + str(row[80])
        else:
            info = '.'
        sample_formats = [single_sample_format(row.FORMAT_x, getattr(row, col + '_x'), getattr(row, col + '_y')) for col in replicate_columns]
        outline2 = str(row._1) + '\t' + str(row.POS) + '\t' + str(row.ID_x) + '\t' + str(row.REF_x) + '\t' + str(row.ALT_x) + '\t' + '.' + '\t' + '.' + '\t' + str(info) + '\t' + 'GT:DP:AF:GQ:MQ:TWINS:TRIO5:TRIO6' + '\t' + '\t'.join(sample_formats) + '\n'
        all_sample_outfile.write(outline2)
    else:
from operator import itemgetter
from collections import Counter
from itertools import islice
from numpy import *
import statistics
# input arguments
parser = argparse.ArgumentParser(description="This script counts the voting numbers")
sample_name = args.sample_name
vcf_header = '''##fileformat=VCFv4.2
##fileDate=20200331
##source=high_confidence_calls_integration(choppy app)
##reference=GRCh38.d1.vd1
##INFO=<ID=location,Number=1,Type=String,Description="Repeat region">
##INFO=<ID=DETECTED,Number=1,Type=Integer,Description="Number of detected votes">
##INFO=<ID=VOTED,Number=1,Type=Integer,Description="Number of consensus votes">
##INFO=<ID=FAM,Number=1,Type=Integer,Description="Number of mendelian consistent votes">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Sum depth of all samples">
##FORMAT=<ID=ALT,Number=1,Type=Integer,Description="Sum alternative depth of all samples">
##FORMAT=<ID=AF,Number=1,Type=Float,Description="Allele frequency, sum alternative depth / sum depth">
##FORMAT=<ID=GQ,Number=1,Type=Float,Description="Average genotype quality">
##FORMAT=<ID=QD,Number=1,Type=Float,Description="Average Variant Confidence/Quality by Depth">
##FORMAT=<ID=MQ,Number=1,Type=Float,Description="Average mapping quality">
##FORMAT=<ID=FS,Number=1,Type=Float,Description="Average Phred-scaled p-value using Fisher's exact test to detect strand bias">
##FORMAT=<ID=QUAL,Number=1,Type=Float,Description="Average variant quality">
##contig=<ID=chr1,length=248956422>
##contig=<ID=chr2,length=242193529>
##contig=<ID=chr3,length=198295559>
##contig=<ID=chrX,length=156040895>
'''
vcf_header_all_sample = '''##fileformat=VCFv4.2
##fileDate=20200331
##reference=GRCh38.d1.vd1
##INFO=<ID=location,Number=1,Type=String,Description="Repeat region">
##INFO=<ID=DUP,Number=1,Type=Flag,Description="Duplicated variant records">
##INFO=<ID=DETECTED,Number=1,Type=Integer,Description="Number of detected votes">
##INFO=<ID=VOTED,Number=1,Type=Integer,Description="Number of consensus votes">
##INFO=<ID=FAM,Number=1,Type=Integer,Description="Number of mendelian consistent votes">
##INFO=<ID=ALL_ALT,Number=1,Type=Float,Description="Sum of alternative reads of all samples">
##INFO=<ID=ALL_DP,Number=1,Type=Float,Description="Sum of depth of all samples">
##INFO=<ID=ALL_AF,Number=1,Type=Float,Description="Allele frequency of net alternative reads and net depth">
##INFO=<ID=GQ_MEAN,Number=1,Type=Float,Description="Mean of genotype quality of all samples">
##INFO=<ID=QD_MEAN,Number=1,Type=Float,Description="Average Variant Confidence/Quality by Depth">
##INFO=<ID=MQ_MEAN,Number=1,Type=Float,Description="Mean of mapping quality of all samples">
##INFO=<ID=FS_MEAN,Number=1,Type=Float,Description="Average Phred-scaled p-value using Fisher's exact test to detect strand bias">
##INFO=<ID=QUAL_MEAN,Number=1,Type=Float,Description="Average variant quality">
##INFO=<ID=PCR,Number=1,Type=String,Description="Consensus of PCR votes">
##INFO=<ID=PCR_FREE,Number=1,Type=String,Description="Consensus of PCR-free votes">
##INFO=<ID=CONSENSUS,Number=1,Type=String,Description="Consensus calls">
##INFO=<ID=CONSENSUS_SEQ,Number=1,Type=String,Description="Consensus sequence">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=DP,Number=1,Type=String,Description="Depth">
##FORMAT=<ID=ALT,Number=1,Type=Integer,Description="Alternative Depth">
##FORMAT=<ID=AF,Number=1,Type=String,Description="Allele frequency">
##FORMAT=<ID=GQ,Number=1,Type=String,Description="Genotype quality">
##FORMAT=<ID=MQ,Number=1,Type=String,Description="Mapping quality">
##FORMAT=<ID=TWINS,Number=1,Type=String,Description="1 is twins shared, 0 is twins discordant">
##FORMAT=<ID=TRIO5,Number=1,Type=String,Description="1 is LCL7, LCL8 and LCL5 mendelian consistent, 0 is mendelian violation">
##FORMAT=<ID=TRIO6,Number=1,Type=String,Description="1 is LCL7, LCL8 and LCL6 mendelian consistent, 0 is mendelian violation">
##contig=<ID=chr1,length=248956422>
##contig=<ID=chr2,length=242193529>
##contig=<ID=chr3,length=198295559>
##contig=<ID=chr4,length=190214555>
##contig=<ID=chr5,length=181538259>
##contig=<ID=chr6,length=170805979>
##contig=<ID=chr7,length=159345973>
##contig=<ID=chr8,length=145138636>
##contig=<ID=chr9,length=138394717>
##contig=<ID=chr10,length=133797422>
##contig=<ID=chr11,length=135086622>
##contig=<ID=chr12,length=133275309>
##contig=<ID=chr13,length=114364328>
##contig=<ID=chr14,length=107043718>
##contig=<ID=chr15,length=101991189>
##contig=<ID=chr16,length=90338345>
##contig=<ID=chr17,length=83257441>
##contig=<ID=chr18,length=80373285>
##contig=<ID=chr19,length=58617616>
##contig=<ID=chr20,length=64444167>
##contig=<ID=chr21,length=46709983>
##contig=<ID=chr22,length=50818468>
##contig=<ID=chrX,length=156040895>
'''
# read in duplication list
dup = pd.read_table(dup_list,header=None)
var_dup = dup[0].tolist()
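# var_dup holds "CHROM_POS" identifiers (the same key built below as '_'.join([strings[0],strings[1]])),
# so duplicated variant positions can be flagged instead of voted on.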
# output file
file_name = prefix + '_annotated.vcf'
outfile = open(file_name,'w')
benchmark_file_name = prefix + '_voted.vcf'
benchmark_outfile = open(benchmark_file_name,'w')
all_sample_file_name = prefix + '_all_sample_information.vcf'
all_sample_outfile = open(all_sample_file_name,'w')
# write VCF
outputcolumn = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tQuartet_DNA_BGI_SEQ2000_BGI_1_20180518_LCL5\tQuartet_DNA_BGI_SEQ2000_BGI_2_20180530_LCL5\tQuartet_DNA_BGI_SEQ2000_BGI_3_20180530_LCL5\tQuartet_DNA_BGI_T7_WGE_1_20191105_LCL5\tQuartet_DNA_BGI_T7_WGE_2_20191105_LCL5\tQuartet_DNA_BGI_T7_WGE_3_20191105_LCL5\tQuartet_DNA_ILM_Nova_ARD_1_20181108_LCL5\tQuartet_DNA_ILM_Nova_ARD_2_20181108_LCL5\tQuartet_DNA_ILM_Nova_ARD_3_20181108_LCL5\tQuartet_DNA_ILM_Nova_ARD_4_20190111_LCL5\tQuartet_DNA_ILM_Nova_ARD_5_20190111_LCL5\tQuartet_DNA_ILM_Nova_ARD_6_20190111_LCL5\tQuartet_DNA_ILM_Nova_BRG_1_20180930_LCL5\tQuartet_DNA_ILM_Nova_BRG_2_20180930_LCL5\tQuartet_DNA_ILM_Nova_BRG_3_20180930_LCL5\tQuartet_DNA_ILM_Nova_WUX_1_20190917_LCL5\tQuartet_DNA_ILM_Nova_WUX_2_20190917_LCL5\tQuartet_DNA_ILM_Nova_WUX_3_20190917_LCL5\tQuartet_DNA_ILM_XTen_ARD_1_20170403_LCL5\tQuartet_DNA_ILM_XTen_ARD_2_20170403_LCL5\tQuartet_DNA_ILM_XTen_ARD_3_20170403_LCL5\tQuartet_DNA_ILM_XTen_NVG_1_20170329_LCL5\tQuartet_DNA_ILM_XTen_NVG_2_20170329_LCL5\tQuartet_DNA_ILM_XTen_NVG_3_20170329_LCL5\tQuartet_DNA_ILM_XTen_WUX_1_20170216_LCL5\tQuartet_DNA_ILM_XTen_WUX_2_20170216_LCL5\tQuartet_DNA_ILM_XTen_WUX_3_20170216_LCL5' +'\t'+ sample_name+'_pcr'+'\t' + sample_name+'_pcr-free'+ '\t'+ sample_name +'_consensus' + '\t' + sample_name + '_consensus_alt_seq' +'\n'
outfile.write(vcf_header)
outfile.write(outputcolumn)
outputcolumn = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t' + sample_name + '_benchmark_calls\n'
benchmark_outfile.write(vcf_header)
benchmark_outfile.write(outputcolumn)
outputcolumn_all_sample = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t'+ \
'Quartet_DNA_BGI_SEQ2000_BGI_1_20180518\tQuartet_DNA_BGI_SEQ2000_BGI_2_20180530\tQuartet_DNA_BGI_SEQ2000_BGI_3_20180530\t' + \
'Quartet_DNA_BGI_T7_WGE_1_20191105\tQuartet_DNA_BGI_T7_WGE_2_20191105\tQuartet_DNA_BGI_T7_WGE_3_20191105\t' + \
'Quartet_DNA_ILM_Nova_ARD_1_20181108\tQuartet_DNA_ILM_Nova_ARD_2_20181108\tQuartet_DNA_ILM_Nova_ARD_3_20181108\t' + \
'Quartet_DNA_ILM_Nova_ARD_4_20190111\tQuartet_DNA_ILM_Nova_ARD_5_20190111\tQuartet_DNA_ILM_Nova_ARD_6_20190111\t' + \
'Quartet_DNA_ILM_Nova_BRG_1_20180930\tQuartet_DNA_ILM_Nova_BRG_2_20180930\tQuartet_DNA_ILM_Nova_BRG_3_20180930\t' + \
'Quartet_DNA_ILM_Nova_WUX_1_20190917\tQuartet_DNA_ILM_Nova_WUX_2_20190917\tQuartet_DNA_ILM_Nova_WUX_3_20190917\t' + \
'Quartet_DNA_ILM_XTen_ARD_1_20170403\tQuartet_DNA_ILM_XTen_ARD_2_20170403\tQuartet_DNA_ILM_XTen_ARD_3_20170403\t' + \
'Quartet_DNA_ILM_XTen_NVG_1_20170329\tQuartet_DNA_ILM_XTen_NVG_2_20170329\tQuartet_DNA_ILM_XTen_NVG_3_20170329\t' + \
'Quartet_DNA_ILM_XTen_WUX_1_20170216\tQuartet_DNA_ILM_XTen_WUX_2_20170216\tQuartet_DNA_ILM_XTen_WUX_3_20170216\n'
all_sample_outfile.write(vcf_header_all_sample)
all_sample_outfile.write(outputcolumn_all_sample)
#function
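# Helper functions below:
#   replace_nan     - fills a missing replicate column with an all-missing FORMAT placeholder
#   remove_dot      - drops missing ('.') values before numeric summaries
#   detected_number - number of the 27 replicates with a called genotype
#   vote_number     - number of replicates whose (normalised) genotype equals the consensus call
#   family_vote     - number of consensus-supporting replicates that are also Mendelian consistent (TWINS:TRIO5:TRIO6 == 1:1:1)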
def replace_nan(strings_list):
updated_list = []
for i in strings_list:
if i == '.':
updated_list.append('.:.:.:.:.:.:.:.:.:.:.:.')
else:
updated_list.append(i)
return updated_list
def remove_dot(strings_list):
updated_list = []
for i in strings_list:
if i == '.':
pass
else:
updated_list.append(i)
return updated_list
def detected_number(strings):
gt = [x.split(':')[0] for x in strings]
percentage = 27 - gt.count('.')
return(str(percentage))
def vote_number(strings,consensus_call):
gt = [x.split(':')[0] for x in strings]
gt = [x.replace('.','0/0') for x in gt]
gt = list(map(gt_uniform,[i for i in gt]))
vote_num = gt.count(consensus_call)
return(str(vote_num))
def family_vote(strings,consensus_call):
gt = [x.split(':')[0] for x in strings]
gt = [x.replace('.','0/0') for x in gt]
gt = list(map(gt_uniform,[i for i in gt]))
mendelian = [':'.join(x.split(':')[1:4]) for x in strings]
indices = [i for i, x in enumerate(gt) if x == consensus_call]
matched_mendelian = itemgetter(*indices)(mendelian)
mendelian_num = matched_mendelian.count('1:1:1')
return(str(mendelian_num))
def gt_uniform(strings):
uniformed_gt = ''
def decide_by_rep(strings):
consensus_rep = ''
mendelian = [':'.join(x.split(':')[1:4]) for x in strings]
gt = [x.split(':')[0] for x in strings]
gt = [x.replace('.','0/0') for x in gt]
# modified gt turn 2/1 to 1/2
gt = list(map(gt_uniform,[i for i in gt]))
# mendelian consistent?
consensus_rep = '0/0'
else:
consensus_rep = 'inconGT'
elif (candidate_mendelian == '') and (freq_mendelian >= 2):
consensus_rep = 'noInfo'
else:
consensus_rep = 'inconMen'
variant_id = '_'.join([strings[0],strings[1]])
# check if the variants location is duplicated
if variant_id in var_dup:
strings[6] = 'dupVar'
outLine = '\t'.join(strings) + '\t' + '.' +'\t' + '.' + '\t' + 'dupVar' + '\t' + '.' +'\n'
outfile.write(outLine)
strings[7] = strings[7] + ';DUP'
outLine = '\t'.join(strings) + '\n'
all_sample_outfile.write(outLine)
else:
# pre-define
pcr_consensus = '.'
consensus_call = '.'
consensus_alt_seq = '.'
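# Columns 9-11 and 27-35 are the PCR-based library replicates (BGI SEQ2000 and Illumina XTen runs);
# columns 12-26 are the PCR-free replicates (BGI T7 and Illumina NovaSeq runs). Each batch of three
# replicates is first collapsed with decide_by_rep before the PCR vs PCR-free comparison.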
# pcr
strings[9:] = replace_nan(strings[9:])
pcr = itemgetter(*[9,10,11,27,28,29,30,31,32,33,34,35])(strings)
SEQ2000 = decide_by_rep(pcr[0:3])
XTen_ARD = decide_by_rep(pcr[3:6])
pcr_consensus = 'inconSequenceSite'
# pcr-free
pcr_free = itemgetter(*[12,13,14,15,16,17,18,19,20,21,22,23,24,25,26])(strings)
#SEQ2000 = decide_by_rep(pcr_free[0])
T7_WGE = decide_by_rep(pcr_free[0:3])
Nova_ARD_1 = decide_by_rep(pcr_free[3:6])
Nova_ARD_2 = decide_by_rep(pcr_free[6:9])
tag = ['inconGT','noInfo','inconMen','inconSequenceSite']
if (pcr_consensus == pcr_free_consensus) and (pcr_consensus not in tag) and (pcr_consensus != '0/0'):
consensus_call = pcr_consensus
VOTED = vote_number(strings[9:],consensus_call)
strings[7] = strings[7] + ';VOTED=' + VOTED
DETECTED = detected_number(strings[9:])
strings[7] = strings[7] + ';DETECTED=' + DETECTED
FAM = family_vote(strings[9:],consensus_call)
strings[7] = strings[7] + ';FAM=' + FAM
# Delete multiple alternative genotype to necessary expression
strings[6] = 'reproducible'
alt = strings[4]
alt_gt = alt.split(',')
if len(alt_gt) > 1:
consensus_call = '1/1'
else:
consensus_alt_seq = alt
# GT:DP:ALT:AF:GQ:QD:MQ:FS:QUAL
# GT:TWINS:TRIO5:TRIO6:DP:ALT:AF:GQ:QD:MQ:FS:QUAL:rawGT
# DP
DP = [x.split(':')[4] for x in strings[9:]]
DP = remove_dot(DP)
DP = [int(x) for x in DP]
ALL_DP = sum(DP)
# AF
ALT = [x.split(':')[5] for x in strings[9:]]
ALT = remove_dot(ALT)
ALT = [int(x) for x in ALT]
ALL_ALT = sum(ALT)
ALL_AF = round(ALL_ALT/ALL_DP,2)
# GQ
GQ = [x.split(':')[7] for x in strings[9:]]
GQ = remove_dot(GQ)
GQ = [int(x) for x in GQ]
GQ_MEAN = round(mean(GQ),2)
# QD
QD = [x.split(':')[8] for x in strings[9:]]
QD = remove_dot(QD)
QD = [float(x) for x in QD]
QD_MEAN = round(mean(QD),2)
# MQ
MQ = [x.split(':')[9] for x in strings[9:]]
MQ = remove_dot(MQ)
MQ = [float(x) for x in MQ]
MQ_MEAN = round(mean(MQ),2)
# FS
FS = [x.split(':')[10] for x in strings[9:]]
FS = remove_dot(FS)
FS = [float(x) for x in FS]
FS_MEAN = round(mean(FS),2)
# QUAL
QUAL = [x.split(':')[11] for x in strings[9:]]
QUAL = remove_dot(QUAL)
QUAL = [float(x) for x in QUAL]
QUAL_MEAN = round(mean(QUAL),2)
# benchmark output
output_format = consensus_call + ':' + str(ALL_DP) + ':' + str(ALL_ALT) + ':' + str(ALL_AF) + ':' + str(GQ_MEAN) + ':' + str(QD_MEAN) + ':' + str(MQ_MEAN) + ':' + str(FS_MEAN) + ':' + str(QUAL_MEAN)
outLine = strings[0] + '\t' + strings[1] + '\t' + strings[2] + '\t' + strings[3] + '\t' + consensus_alt_seq + '\t' + '.' + '\t' + '.' + '\t' + strings[7] + '\t' + 'GT:DP:ALT:AF:GQ:QD:MQ:FS:QUAL' + '\t' + output_format + '\n'
benchmark_outfile.write(outLine)
# all sample output
strings[7] = strings[7] + ';ALL_ALT=' + str(ALL_ALT) + ';ALL_DP=' + str(ALL_DP) + ';ALL_AF=' + str(ALL_AF) \
+ ';GQ_MEAN=' + str(GQ_MEAN) + ';QD_MEAN=' + str(QD_MEAN) + ';MQ_MEAN=' + str(MQ_MEAN) + ';FS_MEAN=' + str(FS_MEAN) \
+ ';QUAL_MEAN=' + str(QUAL_MEAN) + ';PCR=' + consensus_call + ';PCR_FREE=' + consensus_call + ';CONSENSUS=' + consensus_call \
+ ';CONSENSUS_SEQ=' + consensus_alt_seq
all_sample_outLine = '\t'.join(strings) + '\n'
all_sample_outfile.write(all_sample_outLine)
elif (pcr_consensus in tag) and (pcr_free_consensus in tag):
consensus_call = 'filtered'
strings[6] = 'filtered'
DETECTED = detected_number(strings[9:])
strings[7] = strings[7] + ';DETECTED=' + DETECTED
strings[7] = strings[7] + ';CONSENSUS=' + consensus_call
all_sample_outLine = '\t'.join(strings) + '\n'
all_sample_outfile.write(all_sample_outLine)
elif ((pcr_consensus == '0/0') or (pcr_consensus in tag)) and ((pcr_free_consensus not in tag) and (pcr_free_consensus != '0/0')):
consensus_call = 'pcr-free-specific'
strings[6] = 'pcr-free-specific'
DETECTED = detected_number(strings[9:])
strings[7] = strings[7] + ';DETECTED=' + DETECTED
strings[7] = strings[7] + ';CONSENSUS=' + consensus_call
all_sample_outLine = '\t'.join(strings) + '\n'
all_sample_outfile.write(all_sample_outLine)
elif ((pcr_consensus not in tag) and (pcr_consensus != '0/0')) and ((pcr_free_consensus in tag) or (pcr_free_consensus == '0/0')):
consensus_call = 'pcr-specific'
strings[6] = 'pcr-specific'
DETECTED = detected_number(strings[9:])
strings[7] = strings[7] + ';DETECTED=' + DETECTED
strings[7] = strings[7] + ';CONSENSUS=' + consensus_call + ';PCR=' + pcr_consensus + ';PCR_FREE=' + pcr_free_consensus
all_sample_outLine = '\t'.join(strings) + '\n'
all_sample_outfile.write(all_sample_outLine)
elif (pcr_consensus == '0/0') and (pcr_free_consensus == '0/0'):
consensus_call = 'confirm for parents'
strings[6] = 'confirm for parents'
DETECTED = detected_number(strings[9:])
strings[7] = strings[7] + ';DETECTED=' + DETECTED
strings[7] = strings[7] + ';CONSENSUS=' + consensus_call
all_sample_outLine = '\t'.join(strings) + '\n'
all_sample_outfile.write(all_sample_outLine)
else:
consensus_call = 'filtered'
strings[6] = 'filtered'
# output
outLine = '\t'.join(strings) + '\t' + pcr_consensus +'\t' + pcr_free_consensus + '\t' + consensus_call + '\t' + consensus_alt_seq + '\n'
outfile.write(outLine)
DETECTED = detected_number(strings[9:])
strings[7] = strings[7] + ';DETECTED=' + DETECTED
strings[7] = strings[7] + ';CONSENSUS=' + consensus_call
all_sample_outLine = '\t'.join(strings) + '\n'
all_sample_outfile.write(all_sample_outLine)
if __name__ == '__main__':
main()
##fileDate=20200331
##source=high_confidence_calls_integration(choppy app)
##reference=GRCh38.d1.vd1
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=TWINS,Number=1,Type=Flag,Description="1 for sister consistent, 0 for sister different">
##FORMAT=<ID=TRIO5,Number=1,Type=Flag,Description="1 for LCL7, LCL8 and LCL5 mendelian consistent, 0 for family violation">
##FORMAT=<ID=TRIO6,Number=1,Type=Flag,Description="1 for LCL7, LCL8 and LCL6 mendelian consistent, 0 for family violation">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Depth">
##FORMAT=<ID=ALT,Number=1,Type=Integer,Description="Alternative Depth">
##FORMAT=<ID=AF,Number=1,Type=Float,Description="Allele frequency">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype quality">
##FORMAT=<ID=QD,Number=1,Type=Float,Description="Variant Confidence/Quality by Depth">
##FORMAT=<ID=MQ,Number=1,Type=Float,Description="Mapping quality">
##FORMAT=<ID=FS,Number=1,Type=Float,Description="Phred-scaled p-value using Fisher's exact test to detect strand bias">
return infoDict
#
for row in merged_df.itertuples():
if row[18] != '.':
# format
# GT:TWINS:TRIO5:TRIO6:DP:AF:GQ:QD:MQ:FS:QUAL
FORMAT_x = row[10].split(':')
ALT = int(FORMAT_x[1].split(',')[1])
if int(FORMAT_x[2]) != 0:
AF = round(ALT/int(FORMAT_x[2]),2)
INFO_x['QD'] = '.'
else:
pass
FORMAT = row[18] + ':' + FORMAT_x[2] + ':' + str(ALT) + ':' + str(AF) + ':' + FORMAT_x[3] + ':' + INFO_x['QD'] + ':' + INFO_x['MQ'] + ':' + INFO_x['FS'] + ':' + str(row.QUAL_x)
# outline
outline = row._1 + '\t' + str(row.POS) + '\t' + row.ID_x + '\t' + row.REF_y + '\t' + row.ALT_y + '\t' + '.' + '\t' + '.' + '\t' + '.' + '\t' + 'GT:TWINS:TRIO5:TRIO6:DP:ALT:AF:GQ:QD:MQ:FS:QUAL' + '\t' + FORMAT + '\n'
else:
rawGT = row[10].split(':')
FORMAT_x = row[10].split(':')
ALT = int(FORMAT_x[1].split(',')[1])
if int(FORMAT_x[2]) != 0:
AF = round(ALT/int(FORMAT_x[2]),2)
INFO_x['QD'] = '.'
else:
pass
FORMAT = '.:.:.:.' + ':' + FORMAT_x[2] + ':' + str(ALT) + ':' + str(AF) + ':' + FORMAT_x[3] + ':' + INFO_x['QD'] + ':' + INFO_x['MQ'] + ':' + INFO_x['FS'] + ':' + str(row.QUAL_x) + ':' + rawGT[0]
# outline
outline = row._1 + '\t' + str(row.POS) + '\t' + row.ID_x + '\t' + row.REF_x + '\t' + row.ALT_x + '\t' + '.' + '\t' + '.' + '\t' + '.' + '\t' + 'GT:TWINS:TRIO5:TRIO6:DP:ALT:AF:GQ:QD:MQ:FS:QUAL:rawGT' + '\t' + FORMAT + '\n'
outfile.write(outline)
import sys,getopt
import os
import re
import fileinput
def usage():
    print(
    """
    Usage: python select_small_variants_supported_by_all_callsets.py -i input_merged_vcf_file -o prefix
    This script selects SNPs and Indels supported by all callsets.
    Input:
    -i a merged vcf file
    Output:
    -o a vcf file containing the selected SNPs and Indels
    """)
# select supported small variants
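# A record is kept only when every callset column reports the same genotype and that genotype is not
# missing ('.'); consistent records go to <prefix>.vcf, everything else to <prefix>_outlier.vcf.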
def process(oneLine):
    m = re.match(r'^\#',oneLine)
    if m is not None:
        outVCF.write(oneLine)
        OUTname.write(oneLine)
    else:
        line = oneLine.rstrip()
        strings = line.strip().split('\t')
        gt = [i.split(':', 1)[0] for i in strings[9:len(strings)]]
        if all(e == gt[0] for e in gt) and (gt[0] != '.'):
            # output the record to vcf
            outVCF.write(oneLine)
        else:
            OUTname.write(oneLine)
opts,args = getopt.getopt(sys.argv[1:],"hi:o:")
for op,value in opts:
    if op == "-i":
        inputFile=value
    elif op == "-o":
        prefix=value
    elif op == "-h":
        usage()
        sys.exit()
if len(sys.argv[1:]) < 3:
    usage()
    sys.exit()
VCFname = prefix + '.vcf'
OUTname = prefix + '_outlier.vcf'
outVCF = open(VCFname,'w')
OUTname = open(OUTname,'w')
for line in fileinput.input(inputFile):
    process(line)
outVCF.close()
OUTname.close()
from __future__ import division
import sys, argparse, os
import fileinput
import re
import statistics
# input arguments
parser = argparse.ArgumentParser(description="This script integrates VCF information, variant quality and location")
parser.add_argument('-vcf', '--multi_sample_vcf', type=str, help='The VCF file you want to count the voting number', required=True)
parser.add_argument('-prefix', '--prefix', type=str, help='Prefix of output file name', required=True)
args = parser.parse_args()
multi_sample_vcf = args.multi_sample_vcf
prefix = args.prefix
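# get_location() pulls the repeat-region label from the ANN= tag in INFO; extract_info_normal()
# sums DP and ALT and averages GQ and MQ across the 27 replicate columns to give per-variant summaries.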
def get_location(info):
    repeat = ''
    if 'ANN' in info:
        strings = info.strip().split(';')
        for element in strings:
            m = re.match('ANN',element)
            if m is not None:
                repeat = element.split('=')[1]
    else:
        repeat = '.'
    return repeat
def extract_info_normal(FORMAT,strings):
    GQ = []
    MQ = []
    DP = []
    ALT = []
    format_strings = FORMAT.split(':')
    for element in strings:
        if element == '.':
            pass
        else:
            element_strings = element.split(':')
            formatDict = dict(zip(format_strings, element_strings))
            alt = int(formatDict['ALT'])
            dp = int(formatDict['DP'])
            gq = int(formatDict['GQ'])
            mq = float(formatDict['MQ'])
            GQ.append(gq)
            MQ.append(mq)
            DP.append(dp)
            ALT.append(alt)
    DP_a = sum(DP)
    ALT_a = sum(ALT)
    if DP_a == 0:
        AF_m = 'NA'
    else:
        AF_m = float(ALT_a/DP_a)
    GQ_m = statistics.mean(GQ)
    MQ_m = statistics.mean(MQ)
    return AF_m,GQ_m,MQ_m,DP_a,ALT_a
file_name = prefix + '_variant_quality_location.txt'
outfile = open(file_name, 'w')
outputcolumn = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tQuartet_DNA_BGI_SEQ2000_BGI_1_20180518_LCL5\tQuartet_DNA_BGI_SEQ2000_BGI_2_20180530_LCL5\tQuartet_DNA_BGI_SEQ2000_BGI_3_20180530_LCL5\tQuartet_DNA_BGI_T7_WGE_1_20191105_LCL5\tQuartet_DNA_BGI_T7_WGE_2_20191105_LCL5\tQuartet_DNA_BGI_T7_WGE_3_20191105_LCL5\tQuartet_DNA_ILM_Nova_ARD_1_20181108_LCL5\tQuartet_DNA_ILM_Nova_ARD_2_20181108_LCL5\tQuartet_DNA_ILM_Nova_ARD_3_20181108_LCL5\tQuartet_DNA_ILM_Nova_ARD_4_20190111_LCL5\tQuartet_DNA_ILM_Nova_ARD_5_20190111_LCL5\tQuartet_DNA_ILM_Nova_ARD_6_20190111_LCL5\tQuartet_DNA_ILM_Nova_BRG_1_20180930_LCL5\tQuartet_DNA_ILM_Nova_BRG_2_20180930_LCL5\tQuartet_DNA_ILM_Nova_BRG_3_20180930_LCL5\tQuartet_DNA_ILM_Nova_WUX_1_20190917_LCL5\tQuartet_DNA_ILM_Nova_WUX_2_20190917_LCL5\tQuartet_DNA_ILM_Nova_WUX_3_20190917_LCL5\tQuartet_DNA_ILM_XTen_ARD_1_20170403_LCL5\tQuartet_DNA_ILM_XTen_ARD_2_20170403_LCL5\tQuartet_DNA_ILM_XTen_ARD_3_20170403_LCL5\tQuartet_DNA_ILM_XTen_NVG_1_20170329_LCL5\tQuartet_DNA_ILM_XTen_NVG_2_20170329_LCL5\tQuartet_DNA_ILM_XTen_NVG_3_20170329_LCL5\tQuartet_DNA_ILM_XTen_WUX_1_20170216_LCL5\tQuartet_DNA_ILM_XTen_WUX_2_20170216_LCL5\tQuartet_DNA_ILM_XTen_WUX_3_20170216_LCL5' +'\t'+ 'location' + '\t' + 'AF' + '\t' + 'GQ' + '\t' + 'MQ' + '\t' + 'DP' + '\t' + 'ALT' +'\n'
outfile.write(outputcolumn)
for line in fileinput.input(multi_sample_vcf):
    m = re.match(r'^\#', line)
    if m is not None:
        pass
    else:
        line = line.strip()
        strings = line.split('\t')
        repeat = get_location(strings[7])
        AF, GQ, MQ, DP, ALT = extract_info_normal(strings[8], strings[9:])
        outLine = '\t'.join(strings) + '\t' + repeat + '\t' + str(AF) + '\t' + str(GQ) + '\t' + str(MQ) + '\t' + str(DP) + '\t' + str(ALT) + '\n'
        outfile.write(outLine)
outfile.close()
from __future__ import division
import sys, argparse, os
import fileinput
import re
import statistics
# input arguments
parser = argparse.ArgumentParser(description="this script is to get mapping quality, allele frequency and alternative depth")
parser.add_argument('-vcf', '--normed_vcf', type=str, help='The VCF file you want to use', required=True)
parser.add_argument('-prefix', '--prefix', type=str, help='Prefix of output file name', required=True)
args = parser.parse_args()
normed_vcf = args.normed_vcf
prefix = args.prefix
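# Illustrative example (hypothetical values): with FORMAT "GT:AD:DP:GQ" and a
# sample field "0/1:3,7:10:50", AD = "3,7" gives DP = 3 + 7 = 10, ALT = 7
# (the reference count is dropped) and AF = 7/10 = 0.7; MQ is taken from the
# MQ= entry in the INFO column, and MQ:ALT:AF are appended to the FORMAT field.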
file_name = prefix + '_variant_quality_location.vcf'
outfile = open(file_name, 'w')
for line in fileinput.input(normed_vcf):
    m = re.match(r'^\#', line)
    if m is not None:
        outfile.write(line)
    else:
        line = line.strip()
        strings = line.split('\t')
        strings[8] = strings[8] + ':MQ:ALT:AF'
        infos = strings[7].strip().split(';')
        ## MQ from the INFO column ('.' if no MQ= entry is present)
        MQ = '.'
        for element in infos:
            m = re.match('MQ=', element)
            if m is not None:
                MQ = element.split('=')[1]
        ## ALT and AF from the AD field (assumed to be the second FORMAT field)
        ad = strings[9].split(':')[1]
        ad_single = ad.split(',')
        ad_single = [int(i) for i in ad_single]
        DP = sum(ad_single)
        if DP != 0:
            ad_single.pop(0)
            ALT = sum(ad_single)
            AF = ALT/DP
        else:
            ALT = 0
            AF = 'NA'
        outLine = '\t'.join(strings) + ':' + MQ + ':' + str(ALT) + ':' + str(AF) + '\n'
        outfile.write(outLine)
outfile.close()
{
  "{{ project_name }}.LCL7merge.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
  "{{ project_name }}.fasta": "GRCh38.d1.vd1.fa",
  "{{ project_name }}.LCL6bed_annotation.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
  "{{ project_name }}.LCL7bed_annotation.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
  "{{ project_name }}.LCL8bed_annotation.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
  "{{ project_name }}.LCL7votes.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/high_confidence_call_manuscript:v1.1",
  "{{ project_name }}.LCL6votes.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/high_confidence_call_manuscript:v1.1",
  "{{ project_name }}.LCL5VCFrename.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
  "{{ project_name }}.LCL8mergeInfo.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/high_confidence_call_manuscript:v1.1",
  "{{ project_name }}.LCL6mendelian.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/vbt:v1.1",
  "{{ project_name }}.disk_size": "150",
  "{{ project_name }}.LCL7mergeInfo.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/high_confidence_call_manuscript:v1.1",
  "{{ project_name }}.inputSamplesFile": "{{ inputSamplesFile }}",
  "{{ project_name }}.repeat_bed": "oss://pgx-result/renluyao/manuscript/all.repeat.bed",
  "{{ project_name }}.LCL6merge.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
  "{{ project_name }}.LCL6variantsNorm.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/bcftools:v1.9",
  "{{ project_name }}.LCL6zipIndex.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
  "{{ project_name }}.LCL7allInfozipIndex.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
  "{{ project_name }}.LCL6mergeInfo.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/high_confidence_call_manuscript:v1.1",
  "{{ project_name }}.LCL5votes.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/high_confidence_call_manuscript:v1.1",
  "{{ project_name }}.LCL5bed_annotation.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
  "{{ project_name }}.LCL6VCFrename.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
  "{{ project_name }}.LCL5merge.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
  "{{ project_name }}.LCL8votes.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/high_confidence_call_manuscript:v1.1",
  "{{ project_name }}.reformVCF.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/high_confidence_call_manuscript:v1.1",
  "{{ project_name }}.LCL5zipIndex.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
  "{{ project_name }}.cluster_config": "OnDemand bcs.b4.xlarge img-ubuntu-vpc",
  "{{ project_name }}.LCL8allInfozipIndex.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
task bed_annotation {
    File merged_vcf_gz
    File merged_vcf_idx
    File repeat_bed
    String sample
    command <<<
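        # annotate the merged VCF with repeat regions from the BED file;
        # rtg vcfannotate --bed-info adds the BED annotation to the INFO column,
        # which the scripts above read back as the variant's repeat location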
        rtg vcfannotate --bed-info=${repeat_bed} -i ${merged_vcf_gz} -o ${sample}.mendelian.merged.repeatAnno.vcf.gz
        gunzip ${sample}.mendelian.merged.repeatAnno.vcf.gz
    >>>
        dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
    }
    output {
        File repeat_annotated_vcf = "${sample}.mendelian.merged.repeatAnno.vcf"
    }
}
    command <<<
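        # merge the per-replicate family-annotated VCFs into one multi-sample VCF,
        # then list chrom_pos keys that occur more than once (candidate duplicate
        # records) for the downstream voting step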
        rtg vcfmerge --force-merge-all -o ${sample}.merged.vcf.gz ${sep=" " family_vcf_gz}
        zcat ${sample}.merged.vcf.gz | grep -v '#' | cut -f1-2 | sed s'/\t/_/g' | sort | uniq -c | sed 's/\s\+/\t/g' | awk '{ if ($1 != 1) { print } }' | cut -f3 > ${sample}.vcf_dup.txt
    >>>
        dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
    }
    output {
        File merged_vcf_gz = "${sample}.merged.vcf.gz"
        File merged_vcf_idx = "${sample}.merged.vcf.gz.tbi"
        File vcf_dup = "${sample}.vcf_dup.txt"
    }
}
task votes {
    File repeat_annotated_vcf
    File vcf_dup
    String sample
    String prefix
    String disk_size
    command <<<
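        # vote across the replicate columns of the repeat-annotated merged VCF;
        # the script writes ${prefix}_voted.vcf (high-confidence voted calls) and
        # ${prefix}_all_sample_information.vcf (per-replicate information), as
        # declared in the output block below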
        python /opt/high_confidence_call_vote.py -vcf ${repeat_annotated_vcf} -dup ${vcf_dup} -sample ${sample} -prefix ${prefix}
    >>>
        dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
    }
    output {
        File voted_vcf = "${prefix}_voted.vcf"
        File all_sample_vcf = "${prefix}_all_sample_information.vcf"
    }
}
import "./tasks/mergeSister.wdl" as mergeSister | import "./tasks/mergeSister.wdl" as mergeSister | ||||
import "./tasks/reformVCF.wdl" as reformVCF | import "./tasks/reformVCF.wdl" as reformVCF | ||||
import "./tasks/merge.wdl" as merge | import "./tasks/merge.wdl" as merge | ||||
import "./tasks/bed_annotation.wdl" as bed_annotation | |||||
import "./tasks/votes.wdl" as votes | |||||
workflow {{ project_name }} { | workflow {{ project_name }} { | ||||
File inputSamplesFile | File inputSamplesFile | ||||
Array[Array[File]] inputSamples = read_tsv(inputSamplesFile) | Array[Array[File]] inputSamples = read_tsv(inputSamplesFile) | ||||
File ref_dir | File ref_dir | ||||
File repeat_bed | |||||
String fasta | String fasta | ||||
String cluster_config | String cluster_config | ||||
String disk_size | String disk_size | ||||
cluster_config=cluster_config, | cluster_config=cluster_config, | ||||
disk_size=disk_size | disk_size=disk_size | ||||
} | } | ||||
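    # per-sample chain (LCL5-LCL8): merge the replicate VCFs, annotate repeat
    # regions with bed_annotation, then call votes on the annotated result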
    call bed_annotation.bed_annotation as LCL5bed_annotation {
        input:
            merged_vcf_gz=LCL5merge.merged_vcf_gz,
            merged_vcf_idx=LCL5merge.merged_vcf_idx,
            repeat_bed=repeat_bed,
            sample='LCL5',
            cluster_config=cluster_config,
            disk_size=disk_size
    }
    call votes.votes as LCL5votes {
        input:
            repeat_annotated_vcf=LCL5bed_annotation.repeat_annotated_vcf,
            vcf_dup=LCL5merge.vcf_dup,
            sample='LCL5',
            prefix='LCL5',
            cluster_config=cluster_config,
            disk_size=disk_size
    }
    call merge.merge as LCL6merge {
        input:
            family_vcf_gz=LCL6allInfozipIndex.vcf_gz,
            cluster_config=cluster_config,
            disk_size=disk_size
    }
    call bed_annotation.bed_annotation as LCL6bed_annotation {
        input:
            merged_vcf_gz=LCL6merge.merged_vcf_gz,
            merged_vcf_idx=LCL6merge.merged_vcf_idx,
            repeat_bed=repeat_bed,
            sample='LCL6',
            cluster_config=cluster_config,
            disk_size=disk_size
    }
    call votes.votes as LCL6votes {
        input:
            repeat_annotated_vcf=LCL6bed_annotation.repeat_annotated_vcf,
            vcf_dup=LCL6merge.vcf_dup,
            sample='LCL6',
            prefix='LCL6',
            cluster_config=cluster_config,
            disk_size=disk_size
    }
    call merge.merge as LCL7merge {
        input:
            family_vcf_gz=LCL7allInfozipIndex.vcf_gz,
            cluster_config=cluster_config,
            disk_size=disk_size
    }
    call bed_annotation.bed_annotation as LCL7bed_annotation {
        input:
            merged_vcf_gz=LCL7merge.merged_vcf_gz,
            merged_vcf_idx=LCL7merge.merged_vcf_idx,
            repeat_bed=repeat_bed,
            sample='LCL7',
            cluster_config=cluster_config,
            disk_size=disk_size
    }
    call votes.votes as LCL7votes {
        input:
            repeat_annotated_vcf=LCL7bed_annotation.repeat_annotated_vcf,
            vcf_dup=LCL7merge.vcf_dup,
            sample='LCL7',
            prefix='LCL7',
            cluster_config=cluster_config,
            disk_size=disk_size
    }
    call merge.merge as LCL8merge {
        input:
            family_vcf_gz=LCL8allInfozipIndex.vcf_gz,
            cluster_config=cluster_config,
            disk_size=disk_size
    }
    call bed_annotation.bed_annotation as LCL8bed_annotation {
        input:
            merged_vcf_gz=LCL8merge.merged_vcf_gz,
            merged_vcf_idx=LCL8merge.merged_vcf_idx,
            repeat_bed=repeat_bed,
            sample='LCL8',
            cluster_config=cluster_config,
            disk_size=disk_size
    }
    call votes.votes as LCL8votes {
        input:
            repeat_annotated_vcf=LCL8bed_annotation.repeat_annotated_vcf,
            vcf_dup=LCL8merge.vcf_dup,
            sample='LCL8',
            prefix='LCL8',
            cluster_config=cluster_config,
            disk_size=disk_size
    }
}