# import modules import sys, argparse, os import fileinput import re import pandas as pd from operator import itemgetter from collections import Counter from itertools import islice # input arguments parser = argparse.ArgumentParser(description="this script is to count voting number") parser.add_argument('-vcf', '--multi_sample_vcf', type=str, help='The VCF file you want to count the voting number', required=True) parser.add_argument('-dup', '--dup_list', type=str, help='Duplication list', required=True) parser.add_argument('-sample', '--sample_name', type=str, help='which sample of quartet', required=True) parser.add_argument('-prefix', '--prefix', type=str, help='Prefix of output file name', required=True) args = parser.parse_args() multi_sample_vcf = args.multi_sample_vcf dup_list = args.dup_list prefix = args.prefix sample_name = args.sample_name vcf_header = '''##fileformat=VCFv4.2 ##fileDate=20191224 ##source=high_confidence_calls_intergration(choppy app) ##reference=GRCh38.d1.vd1 ##INFO= ##FORMAT= ##FORMAT= ##FORMAT= ##FORMAT= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ##contig= ''' # read in duplication list dup = pd.read_table(dup_list,header=None) var_dup = dup[0].tolist() # output file file_name = prefix + '_annotated.vcf' outfile = open(file_name,'w') # write VCF outputcolumn = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tQuartet_DNA_BGI_SEQ2000_BGI_1_20180518_LCL5\tQuartet_DNA_BGI_SEQ2000_BGI_2_20180530_LCL5\tQuartet_DNA_BGI_SEQ2000_BGI_3_20180530_LCL5\tQuartet_DNA_BGI_SEQ2000_WGE_1_20190402_LCL5\tQuartet_DNA_BGI_SEQ2000_WGE_2_20190402_LCL5\tQuartet_DNA_BGI_SEQ500_BGI_1_20180328_LCL5 \tQuartet_DNA_BGI_SEQ500_BGI_2_20180328_LCL5\tQuartet_DNA_BGI_SEQ500_BGI_3_20180328_LCL5\tQuartet_DNA_ILM_Nova_ARD_1_20181108_LCL5\tQuartet_DNA_ILM_Nova_ARD_2_20181108_LCL5\tQuartet_DNA_ILM_Nova_ARD_3_20181108_LCL5\tQuartet_DNA_ILM_Nova_ARD_4_20190111_LCL5\tQuartet_DNA_ILM_Nova_ARD_5_20190111_LCL5\tQuartet_DNA_ILM_Nova_ARD_6_20190111_LCL5\tQuartet_DNA_ILM_Nova_BRG_1_20171024_LCL5\tQuartet_DNA_ILM_Nova_BRG_1_20180930_LCL5\tQuartet_DNA_ILM_Nova_BRG_2_20180930_LCL5\tQuartet_DNA_ILM_Nova_BRG_3_20180930_LCL5\tQuartet_DNA_ILM_Nova_GAC_1_20171025_LCL5\tQuartet_DNA_ILM_Nova_NVG_1_20171024_LCL5\tQuartet_DNA_ILM_Nova_WUX_1_20171024_LCL5\tQuartet_DNA_ILM_XTen_ARD_1_20170403_LCL5\tQuartet_DNA_ILM_XTen_ARD_2_20170403_LCL5\tQuartet_DNA_ILM_XTen_ARD_3_20170403_LCL5\tQuartet_DNA_ILM_XTen_NVG_1_20170329_LCL5\tQuartet_DNA_ILM_XTen_NVG_2_20170329_LCL5\tQuartet_DNA_ILM_XTen_NVG_3_20170329_LCL5\tQuartet_DNA_ILM_XTen_WUX_1_20170216_LCL5\tQuartet_DNA_ILM_XTen_WUX_2_20170216_LCL5\tQuartet_DNA_ILM_XTen_WUX_3_20170216_LCL5\tQuartet_DNA_ILM_XTen_WUX_4_20180703_LCL5\tQuartet_DNA_ILM_XTen_WUX_5_20180703_LCL5\tQuartet_DNA_ILM_XTen_WUX_6_20180703_LCL5' +'\t'+ sample_name+'_pcr'+'\t' + sample_name+'_pcr-free'+ '\t'+ sample_name +'consensus' + '\n' outfile.write(vcf_header) outfile.write(outputcolumn) #function def vote_percentage(strings): strings = [x.replace('0/0','.') for x in strings] gt = [x.split(':')[0] for x in strings] percentage = round((33 - gt.count('.'))/33,2) return(str(percentage)) def decide_by_rep(strings): consensus_rep = '' mendelian = [x[-5:] for x in strings] strings = [x.replace('.','0/0') for x in strings] gt = [x.split(':')[0] for x in strings] # mendelian consistent? mendelian_dict = Counter(mendelian) highest_mendelian = mendelian_dict.most_common(1) candidate_mendelian = highest_mendelian[0][0] freq_mendelian = highest_mendelian[0][1] if (candidate_mendelian == '1:1:1') and (freq_mendelian >= 2): gt_num_dict = Counter(gt) highest_gt = gt_num_dict.most_common(1) candidate_gt = highest_gt[0][0] freq_gt = highest_gt[0][1] if (candidate_gt != '0/0') and (freq_gt >= 2): consensus_rep = candidate_gt elif (candidate_gt == '0/0') and (freq_gt >= 2): consensus_rep = '0/0' else: consensus_rep = 'inconGT' elif (candidate_mendelian == '.') and (freq_mendelian >= 2): consensus_rep = 'noInfo' else: consensus_rep = 'inconMen' return consensus_rep def main(): for line in fileinput.input(multi_sample_vcf): headline = re.match('^\#',line) if headline is not None: pass else: line = line.strip() strings = line.split('\t') variant_id = '_'.join([strings[0],strings[1]]) # check if the variants location is duplicated if variant_id in var_dup: outLine = '\t'.join(strings) + '\t' + '.' +'\t' + '.' + '\t' + 'dupVar' + '\n' outfile.write(outLine) else: # pre-define pcr_consensus = '' pcr_free_consensus = '' consensus_call = '' # pcr pcr = itemgetter(*[9,10,11,12,14,15,16,23,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41])(strings) SEQ2000 = decide_by_rep(pcr[0:4]) SEQ500 = decide_by_rep(pcr[4:7]) Nova = decide_by_rep(pcr[7:11]) XTen_ARD = decide_by_rep(pcr[11:14]) XTen_NVG = decide_by_rep(pcr[14:17]) XTen_WUX_1 = decide_by_rep(pcr[17:20]) XTen_WUX_2 = decide_by_rep(pcr[20:23]) sequence_site = [SEQ2000,SEQ500,Nova,XTen_ARD,XTen_NVG,XTen_WUX_1,XTen_WUX_2] sequence_dict = Counter(sequence_site) highest_sequence = sequence_dict.most_common(1) candidate_sequence = highest_sequence[0][0] freq_sequence = highest_sequence[0][1] if freq_sequence > 4: pcr_consensus = candidate_sequence else: pcr_consensus = 'inconSequenceSite' # pcr-free pcr_free = itemgetter(*[13,17,18,19,20,21,22,24,25,26])(strings) SEQ2000 = decide_by_rep(pcr_free[0]) Nova_ARD_1 = decide_by_rep(pcr_free[1:4]) Nova_ARD_2 = decide_by_rep(pcr_free[4:7]) Nova_BRG = decide_by_rep(pcr_free[7:10]) sequence_site = [SEQ2000,Nova_ARD_1,Nova_ARD_2,Nova_BRG] highest_sequence = sequence_dict.most_common(1) candidate_sequence = highest_sequence[0][0] freq_sequence = highest_sequence[0][1] if freq_sequence > 2: pcr_free_consensus = candidate_sequence else: pcr_free_consensus = 'inconSequenceSite' # pcr and pcr-free tag = ['inconGT','noInfo','inconMen','inconSequenceSite'] if (pcr_consensus == pcr_free_consensus) and (pcr_consensus not in tag) and (pcr_consensus != '0/0'): consensus_call = pcr_consensus strings[6] = 'reproducible' elif (pcr_consensus in tag) or (pcr_free_consensus in tag): consensus_call = 'filtered' strings[6] = '.' elif (pcr_consensus == '0/0') and (pcr_free_consensus not in tag) and (pcr_free_consensus != '0/0'): consensus_call = 'pcr-free-speicifc' strings[6] = '.' elif (pcr_consensus != '0/0') and (pcr_consensus not in tag) and (pcr_free_consensus == '0/0'): consensus_call = 'pcr-speicifc' strings[6] = '.' elif (pcr_consensus == '0/0') and (pcr_free_consensus == '0/0'): consensus_call = 'confirm for parents' strings[6] = '.' else: consensus_call = 'filtered' strings[6] = '.' # percentage percentage = vote_percentage(strings[9:]) strings[7] = 'PCT=' + percentage # output outLine = '\t'.join(strings) + '\t' + pcr_consensus +'\t' + pcr_free_consensus + '\t' + consensus_call + '\n' outfile.write(outLine) if __name__ == '__main__': main()