|
- # import modules
- import sys, argparse, os
- import fileinput
- import re
- import pandas as pd
- from operator import itemgetter
- from collections import Counter
- from itertools import islice
-
- # input arguments
- parser = argparse.ArgumentParser(description="this script is to count voting number")
-
- parser.add_argument('-vcf', '--multi_sample_vcf', type=str, help='The VCF file you want to count the voting number', required=True)
- parser.add_argument('-dup', '--dup_list', type=str, help='Duplication list', required=True)
- parser.add_argument('-sample', '--sample_name', type=str, help='which sample of quartet', required=True)
- parser.add_argument('-prefix', '--prefix', type=str, help='Prefix of output file name', required=True)
-
- args = parser.parse_args()
- multi_sample_vcf = args.multi_sample_vcf
- dup_list = args.dup_list
- prefix = args.prefix
- sample_name = args.sample_name
-
- vcf_header = '''##fileformat=VCFv4.2
- ##fileDate=20191224
- ##source=high_confidence_calls_intergration(choppy app)
- ##reference=GRCh38.d1.vd1
- ##INFO=<ID=PCT,Number=1,Type=Float,Description="Percentage of votes">
- ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
- ##FORMAT=<ID=TWINS,Number=0,Type=Flag,Description="0 for sister consistent, 1 for sister inconsistent">
- ##FORMAT=<ID=TRIO5,Number=0,Type=Flag,Description="0 for trio consistent, 1 for trio inconsistent">
- ##FORMAT=<ID=TRIO6,Number=0,Type=Flag,Description="0 for trio consistent, 1 for trio inconsistent">
- ##contig=<ID=chr1,length=248956422>
- ##contig=<ID=chr2,length=242193529>
- ##contig=<ID=chr3,length=198295559>
- ##contig=<ID=chr4,length=190214555>
- ##contig=<ID=chr5,length=181538259>
- ##contig=<ID=chr6,length=170805979>
- ##contig=<ID=chr7,length=159345973>
- ##contig=<ID=chr8,length=145138636>
- ##contig=<ID=chr9,length=138394717>
- ##contig=<ID=chr10,length=133797422>
- ##contig=<ID=chr11,length=135086622>
- ##contig=<ID=chr12,length=133275309>
- ##contig=<ID=chr13,length=114364328>
- ##contig=<ID=chr14,length=107043718>
- ##contig=<ID=chr15,length=101991189>
- ##contig=<ID=chr16,length=90338345>
- ##contig=<ID=chr17,length=83257441>
- ##contig=<ID=chr18,length=80373285>
- ##contig=<ID=chr19,length=58617616>
- ##contig=<ID=chr20,length=64444167>
- ##contig=<ID=chr21,length=46709983>
- ##contig=<ID=chr22,length=50818468>
- ##contig=<ID=chrX,length=156040895>
- '''
-
- # read in duplication list
- dup = pd.read_table(dup_list,header=None)
- var_dup = dup[0].tolist()
-
- # output file
- file_name = prefix + '_annotated.vcf'
- outfile = open(file_name,'w')
-
- # write VCF
- outputcolumn = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tQuartet_DNA_BGI_SEQ2000_BGI_1_20180518_LCL5\tQuartet_DNA_BGI_SEQ2000_BGI_2_20180530_LCL5\tQuartet_DNA_BGI_SEQ2000_BGI_3_20180530_LCL5\tQuartet_DNA_BGI_SEQ2000_WGE_1_20190402_LCL5\tQuartet_DNA_BGI_SEQ2000_WGE_2_20190402_LCL5\tQuartet_DNA_BGI_SEQ500_BGI_1_20180328_LCL5 \tQuartet_DNA_BGI_SEQ500_BGI_2_20180328_LCL5\tQuartet_DNA_BGI_SEQ500_BGI_3_20180328_LCL5\tQuartet_DNA_ILM_Nova_ARD_1_20181108_LCL5\tQuartet_DNA_ILM_Nova_ARD_2_20181108_LCL5\tQuartet_DNA_ILM_Nova_ARD_3_20181108_LCL5\tQuartet_DNA_ILM_Nova_ARD_4_20190111_LCL5\tQuartet_DNA_ILM_Nova_ARD_5_20190111_LCL5\tQuartet_DNA_ILM_Nova_ARD_6_20190111_LCL5\tQuartet_DNA_ILM_Nova_BRG_1_20171024_LCL5\tQuartet_DNA_ILM_Nova_BRG_1_20180930_LCL5\tQuartet_DNA_ILM_Nova_BRG_2_20180930_LCL5\tQuartet_DNA_ILM_Nova_BRG_3_20180930_LCL5\tQuartet_DNA_ILM_Nova_GAC_1_20171025_LCL5\tQuartet_DNA_ILM_Nova_NVG_1_20171024_LCL5\tQuartet_DNA_ILM_Nova_WUX_1_20171024_LCL5\tQuartet_DNA_ILM_XTen_ARD_1_20170403_LCL5\tQuartet_DNA_ILM_XTen_ARD_2_20170403_LCL5\tQuartet_DNA_ILM_XTen_ARD_3_20170403_LCL5\tQuartet_DNA_ILM_XTen_NVG_1_20170329_LCL5\tQuartet_DNA_ILM_XTen_NVG_2_20170329_LCL5\tQuartet_DNA_ILM_XTen_NVG_3_20170329_LCL5\tQuartet_DNA_ILM_XTen_WUX_1_20170216_LCL5\tQuartet_DNA_ILM_XTen_WUX_2_20170216_LCL5\tQuartet_DNA_ILM_XTen_WUX_3_20170216_LCL5\tQuartet_DNA_ILM_XTen_WUX_4_20180703_LCL5\tQuartet_DNA_ILM_XTen_WUX_5_20180703_LCL5\tQuartet_DNA_ILM_XTen_WUX_6_20180703_LCL5' +'\t'+ sample_name+'_pcr'+'\t' + sample_name+'_pcr-free'+ '\t'+ sample_name +'consensus' + '\n'
- outfile.write(vcf_header)
- outfile.write(outputcolumn)
-
- #function
- def vote_percentage(strings):
- strings = [x.replace('0/0','.') for x in strings]
- gt = [x.split(':')[0] for x in strings]
- percentage = round((33 - gt.count('.'))/33,2)
- return(str(percentage))
-
- def decide_by_rep(strings):
- consensus_rep = ''
- mendelian = [x[-5:] for x in strings]
- strings = [x.replace('.','0/0') for x in strings]
- gt = [x.split(':')[0] for x in strings]
- # mendelian consistent?
- mendelian_dict = Counter(mendelian)
- highest_mendelian = mendelian_dict.most_common(1)
- candidate_mendelian = highest_mendelian[0][0]
- freq_mendelian = highest_mendelian[0][1]
- if (candidate_mendelian == '1:1:1') and (freq_mendelian >= 2):
- gt_num_dict = Counter(gt)
- highest_gt = gt_num_dict.most_common(1)
- candidate_gt = highest_gt[0][0]
- freq_gt = highest_gt[0][1]
- if (candidate_gt != '0/0') and (freq_gt >= 2):
- consensus_rep = candidate_gt
- elif (candidate_gt == '0/0') and (freq_gt >= 2):
- consensus_rep = '0/0'
- else:
- consensus_rep = 'inconGT'
- elif (candidate_mendelian == '.') and (freq_mendelian >= 2):
- consensus_rep = 'noInfo'
- else:
- consensus_rep = 'inconMen'
- return consensus_rep
-
-
- def main():
- for line in fileinput.input(multi_sample_vcf):
- headline = re.match('^\#',line)
- if headline is not None:
- pass
- else:
- line = line.strip()
- strings = line.split('\t')
- variant_id = '_'.join([strings[0],strings[1]])
- # check if the variants location is duplicated
- if variant_id in var_dup:
- outLine = '\t'.join(strings) + '\t' + '.' +'\t' + '.' + '\t' + 'dupVar' + '\n'
- outfile.write(outLine)
- else:
- # pre-define
- pcr_consensus = ''
- pcr_free_consensus = ''
- consensus_call = ''
- # pcr
- pcr = itemgetter(*[9,10,11,12,14,15,16,23,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41])(strings)
- SEQ2000 = decide_by_rep(pcr[0:4])
- SEQ500 = decide_by_rep(pcr[4:7])
- Nova = decide_by_rep(pcr[7:11])
- XTen_ARD = decide_by_rep(pcr[11:14])
- XTen_NVG = decide_by_rep(pcr[14:17])
- XTen_WUX_1 = decide_by_rep(pcr[17:20])
- XTen_WUX_2 = decide_by_rep(pcr[20:23])
- sequence_site = [SEQ2000,SEQ500,Nova,XTen_ARD,XTen_NVG,XTen_WUX_1,XTen_WUX_2]
- sequence_dict = Counter(sequence_site)
- highest_sequence = sequence_dict.most_common(1)
- candidate_sequence = highest_sequence[0][0]
- freq_sequence = highest_sequence[0][1]
- if freq_sequence > 4:
- pcr_consensus = candidate_sequence
- else:
- pcr_consensus = 'inconSequenceSite'
- # pcr-free
- pcr_free = itemgetter(*[13,17,18,19,20,21,22,24,25,26])(strings)
- SEQ2000 = decide_by_rep(pcr_free[0])
- Nova_ARD_1 = decide_by_rep(pcr_free[1:4])
- Nova_ARD_2 = decide_by_rep(pcr_free[4:7])
- Nova_BRG = decide_by_rep(pcr_free[7:10])
- sequence_site = [SEQ2000,Nova_ARD_1,Nova_ARD_2,Nova_BRG]
- highest_sequence = sequence_dict.most_common(1)
- candidate_sequence = highest_sequence[0][0]
- freq_sequence = highest_sequence[0][1]
- if freq_sequence > 2:
- pcr_free_consensus = candidate_sequence
- else:
- pcr_free_consensus = 'inconSequenceSite'
- # pcr and pcr-free
- tag = ['inconGT','noInfo','inconMen','inconSequenceSite']
- if (pcr_consensus == pcr_free_consensus) and (pcr_consensus not in tag) and (pcr_consensus != '0/0'):
- consensus_call = pcr_consensus
- strings[6] = 'reproducible'
- elif (pcr_consensus in tag) or (pcr_free_consensus in tag):
- consensus_call = 'filtered'
- strings[6] = '.'
- elif (pcr_consensus == '0/0') and (pcr_free_consensus not in tag) and (pcr_free_consensus != '0/0'):
- consensus_call = 'pcr-free-speicifc'
- strings[6] = '.'
- elif (pcr_consensus != '0/0') and (pcr_consensus not in tag) and (pcr_free_consensus == '0/0'):
- consensus_call = 'pcr-speicifc'
- strings[6] = '.'
- elif (pcr_consensus == '0/0') and (pcr_free_consensus == '0/0'):
- consensus_call = 'confirm for parents'
- strings[6] = '.'
- else:
- consensus_call = 'filtered'
- strings[6] = '.'
- # percentage
- percentage = vote_percentage(strings[9:])
- strings[7] = 'PCT=' + percentage
- # output
- outLine = '\t'.join(strings) + '\t' + pcr_consensus +'\t' + pcr_free_consensus + '\t' + consensus_call + '\n'
- outfile.write(outLine)
-
-
- if __name__ == '__main__':
- main()
-
-
-
-
-
-
-
-
-
-
-
|