# renluyao / high_confidence_calls_manuscript


			
							# import modules
import sys, argparse, os
import fileinput
import re
import pandas as pd
from operator import itemgetter
from collections import Counter
from itertools import islice  

# command-line interface: all four options are mandatory string values
parser = argparse.ArgumentParser(
	description="this script is to count voting number",
)
parser.add_argument(
	'-vcf', '--multi_sample_vcf',
	required=True,
	type=str,
	help='The VCF file you want to count the voting number',
)
parser.add_argument(
	'-dup', '--dup_list',
	required=True,
	type=str,
	help='Duplication list',
)
parser.add_argument(
	'-sample', '--sample_name',
	required=True,
	type=str,
	help='which sample of quartet',
)
parser.add_argument(
	'-prefix', '--prefix',
	required=True,
	type=str,
	help='Prefix of output file name',
)

# parse once and expose the values as module-level names used by the rest of the script
args = parser.parse_args()
multi_sample_vcf, dup_list = args.multi_sample_vcf, args.dup_list
prefix, sample_name = args.prefix, args.sample_name

# Fixed meta-information lines written verbatim at the top of the output VCF.
# Declares the DPCT and VPCT INFO keys that main() fills in, the GT FORMAT key,
# and contig lines for chr1-chr22 and chrX only.
# NOTE(review): the spellings "intergration" and "consnesus" below are part of the
# emitted text; they are left untouched here because changing them changes the output.
vcf_header = '''##fileformat=VCFv4.2
##fileDate=20191224
##source=high_confidence_calls_intergration(choppy app)
##reference=GRCh38.d1.vd1
##INFO=<ID=DPCT,Number=1,Type=Float,Description="Percentage of detected votes">
##INFO=<ID=VPCT,Number=1,Type=Float,Description="Percentage of consnesus votes">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##contig=<ID=chr1,length=248956422>
##contig=<ID=chr2,length=242193529>
##contig=<ID=chr3,length=198295559>
##contig=<ID=chr4,length=190214555>
##contig=<ID=chr5,length=181538259>
##contig=<ID=chr6,length=170805979>
##contig=<ID=chr7,length=159345973>
##contig=<ID=chr8,length=145138636>
##contig=<ID=chr9,length=138394717>
##contig=<ID=chr10,length=133797422>
##contig=<ID=chr11,length=135086622>
##contig=<ID=chr12,length=133275309>
##contig=<ID=chr13,length=114364328>
##contig=<ID=chr14,length=107043718>
##contig=<ID=chr15,length=101991189>
##contig=<ID=chr16,length=90338345>
##contig=<ID=chr17,length=83257441>
##contig=<ID=chr18,length=80373285>
##contig=<ID=chr19,length=58617616>
##contig=<ID=chr20,length=64444167>
##contig=<ID=chr21,length=46709983>
##contig=<ID=chr22,length=50818468>
##contig=<ID=chrX,length=156040895>
'''

# read in duplication list
# The file is loaded as a header-less table and only its first column is kept.
# main() compares each record's "<field 0>_<field 1>" identifier (CHROM and POS)
# against these values.
dup = pd.read_table(dup_list,header=None)
var_dup = dup[0].tolist()

# output file
# Opened for writing at import time, which truncates an existing file of the same
# name; main() writes the annotated records to this handle.
file_name = prefix + '_annotated.vcf'
outfile = open(file_name,'w')

# write VCF
# Column header line: the nine fixed VCF columns, 27 hard-coded sample columns
# (every name ends in _LCL5, independent of --sample_name), then four summary
# columns derived from sample_name: _pcr, _pcr-free, _consensus, _consensus_alt_seq.
outputcolumn = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tQuartet_DNA_BGI_SEQ2000_BGI_1_20180518_LCL5\tQuartet_DNA_BGI_SEQ2000_BGI_2_20180530_LCL5\tQuartet_DNA_BGI_SEQ2000_BGI_3_20180530_LCL5\tQuartet_DNA_BGI_T7_WGE_1_20191105_LCL5\tQuartet_DNA_BGI_T7_WGE_2_20191105_LCL5\tQuartet_DNA_BGI_T7_WGE_3_20191105_LCL5\tQuartet_DNA_ILM_Nova_ARD_1_20181108_LCL5\tQuartet_DNA_ILM_Nova_ARD_2_20181108_LCL5\tQuartet_DNA_ILM_Nova_ARD_3_20181108_LCL5\tQuartet_DNA_ILM_Nova_ARD_4_20190111_LCL5\tQuartet_DNA_ILM_Nova_ARD_5_20190111_LCL5\tQuartet_DNA_ILM_Nova_ARD_6_20190111_LCL5\tQuartet_DNA_ILM_Nova_BRG_1_20180930_LCL5\tQuartet_DNA_ILM_Nova_BRG_2_20180930_LCL5\tQuartet_DNA_ILM_Nova_BRG_3_20180930_LCL5\tQuartet_DNA_ILM_Nova_WUX_1_20190917_LCL5\tQuartet_DNA_ILM_Nova_WUX_2_20190917_LCL5\tQuartet_DNA_ILM_Nova_WUX_3_20190917_LCL5\tQuartet_DNA_ILM_XTen_ARD_1_20170403_LCL5\tQuartet_DNA_ILM_XTen_ARD_2_20170403_LCL5\tQuartet_DNA_ILM_XTen_ARD_3_20170403_LCL5\tQuartet_DNA_ILM_XTen_NVG_1_20170329_LCL5\tQuartet_DNA_ILM_XTen_NVG_2_20170329_LCL5\tQuartet_DNA_ILM_XTen_NVG_3_20170329_LCL5\tQuartet_DNA_ILM_XTen_WUX_1_20170216_LCL5\tQuartet_DNA_ILM_XTen_WUX_2_20170216_LCL5\tQuartet_DNA_ILM_XTen_WUX_3_20170216_LCL5' +'\t'+ sample_name+'_pcr'+'\t' + sample_name+'_pcr-free'+ '\t'+ sample_name +'_consensus' + '\t' + sample_name + '_consensus_alt_seq' +'\n'
# The meta-information block goes first, then the column header line.
outfile.write(vcf_header)
outfile.write(outputcolumn)

#function

def detected_percentage(strings, total=27):
	"""Return the fraction of samples in which the variant was detected.

	A sample counts as *not* detected when its genotype (the text before the
	first ':' of its field) is '.' or '0/0'.

	Args:
		strings: per-sample VCF fields, e.g. '0/1:...' or '.'.
		total: number of samples used as the denominator. Defaults to 27,
			the number of sample columns this script writes in its header.

	Returns:
		The fraction ``(total - undetected) / total`` rounded to 4 decimals,
		formatted as a string.

	Raises:
		ZeroDivisionError: if ``total`` is 0.
	"""
	undetected = sum(
		1 for field in strings if field.split(':')[0] in ('.', '0/0')
	)
	return str(round((total - undetected) / total, 4))

def vote_percentage(strings, consensus_call, total=27):
	"""Return the fraction of samples whose genotype equals the consensus call.

	Every '.' in a field is replaced by '0/0' before the genotype (the text
	before the first ':') is taken, so a bare '.' field votes as '0/0'. Each
	genotype is then passed through gt_uniform so that e.g. '2/1' and '1/2'
	count as the same vote.

	Args:
		strings: per-sample VCF fields.
		consensus_call: genotype to count, in the allele order that
			gt_uniform returns.
		total: number of samples used as the denominator. Defaults to 27,
			the number of sample columns this script writes in its header.

	Returns:
		The fraction rounded to 4 decimals, formatted as a string.

	Raises:
		ZeroDivisionError: if ``total`` is 0.
	"""
	genotypes = [
		gt_uniform(field.replace('.', '0/0').split(':')[0]) for field in strings
	]
	return str(round(genotypes.count(consensus_call) / total, 4))

def family_vote(strings,consensus_call):
	"""Placeholder with no implementation: ignores both arguments and returns None."""
	return None

def gt_uniform(strings):
	"""Return an unphased genotype with its two alleles in ascending order.

	Only the first two '/'-separated alleles are used, and they are compared
	as integers, so '2/1' becomes '1/2' and '10/2' becomes '2/10'.

	Raises:
		IndexError: if the genotype contains no '/'.
		ValueError: if either allele is not an integer.
	"""
	alleles = strings.split('/')
	low, high = alleles[0], alleles[1]
	# swap only when strictly out of order; equal alleles are left as they are
	if int(low) > int(high):
		low, high = high, low
	return low + '/' + high

def decide_by_rep(strings):
	"""Vote across the replicate fields of one batch and return a single call.

	The last five characters of each raw field are treated as its Mendelian
	flag. The result is:

	* the most common genotype, when the most common flag is '1:1:1' with at
	  least two votes and that genotype also has at least two votes;
	* 'inconGT' when the flag condition holds but no genotype has two votes;
	* 'noInfo' when the most common flag is '.' with at least two votes;
	* 'inconMen' in every other case.

	Genotypes are read after replacing every '.' with '0/0' and are ordered
	with gt_uniform, so '2/1' and '1/2' vote together.
	"""
	# flags come from the untouched fields; genotypes from the '.'-filled ones
	flags = [field[-5:] for field in strings]
	genotypes = [
		gt_uniform(field.replace('.', '0/0').split(':')[0]) for field in strings
	]
	top_flag, flag_votes = Counter(flags).most_common(1)[0]
	if flag_votes < 2:
		return 'inconMen'
	if top_flag == '.':
		return 'noInfo'
	if top_flag != '1:1:1':
		return 'inconMen'
	# Mendelian-consistent majority: now require a genotype majority as well
	top_gt, gt_votes = Counter(genotypes).most_common(1)[0]
	if gt_votes >= 2:
		return top_gt
	return 'inconGT'


# Results that flag a failed vote instead of a genotype.
_FAILED_CALLS = ('inconGT', 'noInfo', 'inconMen', 'inconSequenceSite')

# Zero-based column indices of the three replicates of each PCR batch.
_PCR_SITE_COLUMNS = (
	(9, 10, 11),    # BGI SEQ2000
	(27, 28, 29),   # XTen ARD
	(30, 31, 32),   # XTen NVG
	(33, 34, 35),   # XTen WUX
)

# Zero-based column indices of the three replicates of each PCR-free batch.
_PCR_FREE_SITE_COLUMNS = (
	(12, 13, 14),   # T7 WGE
	(15, 16, 17),   # Nova ARD, replicates 1-3
	(18, 19, 20),   # Nova ARD, replicates 4-6
	(21, 22, 23),   # Nova BRG
	(24, 25, 26),   # Nova WUX
)


def _site_consensus(site_calls, min_agree):
	"""Return the most common call across sites, or 'inconSequenceSite'.

	The winning call is kept only when more than ``min_agree`` sites share it.
	"""
	call, votes = Counter(site_calls).most_common(1)[0]
	return call if votes > min_agree else 'inconSequenceSite'


def _is_variant_call(call):
	"""Return True when ``call`` is neither '0/0' nor one of the failure tags."""
	return call != '0/0' and call not in _FAILED_CALLS


def _classify_discordant(pcr_consensus, pcr_free_consensus):
	"""Return the FILTER label for a record that is not a reproducible variant."""
	if pcr_consensus in _FAILED_CALLS and pcr_free_consensus in _FAILED_CALLS:
		return 'filtered'
	pcr_variant = _is_variant_call(pcr_consensus)
	pcr_free_variant = _is_variant_call(pcr_free_consensus)
	if pcr_free_variant and not pcr_variant:
		return 'pcr-free-speicifc'
	# Mirror image of the branch above: a variant seen by the PCR libraries only.
	if pcr_variant and not pcr_free_variant:
		return 'pcr-speicifc'
	if pcr_consensus == '0/0' and pcr_free_consensus == '0/0':
		return 'confirm for parents'
	# Remaining cases: two different variant genotypes, or '0/0' against a failure tag.
	return 'filtered'


def _reduce_multiallelic(consensus_call, alt):
	"""Return (genotype, alt_seq) restricted to the alleles the consensus uses.

	With a single ALT allele both values are returned unchanged. With several,
	the genotype is renumbered to '0/1', '1/2' or '1/1' and alt_seq lists only
	the ALT sequence(s) that the consensus genotype refers to.
	"""
	alt_gt = alt.split(',')
	if len(alt_gt) <= 1:
		return consensus_call, alt
	alleles = consensus_call.split('/')
	allele1, allele2 = alleles[0], alleles[1]
	if allele1 == '0':
		return '0/1', alt_gt[int(allele2) - 1]
	allele1_seq = alt_gt[int(allele1) - 1]
	allele2_seq = alt_gt[int(allele2) - 1]
	if int(allele1) > int(allele2):
		return '1/2', allele2_seq + ',' + allele1_seq
	if int(allele1) < int(allele2):
		return '1/2', allele1_seq + ',' + allele2_seq
	return '1/1', allele1_seq


def _annotate_record(strings):
	"""Set FILTER (strings[6]) and INFO (strings[7]) in place; return the four summary columns.

	Raises:
		IndexError: if the record has fewer than 36 tab-separated fields.
	"""
	# PCR libraries: more than 2 of the 4 batches must agree.
	pcr_consensus = _site_consensus(
		[decide_by_rep(itemgetter(*cols)(strings)) for cols in _PCR_SITE_COLUMNS], 2)
	# PCR-free libraries: more than 3 of the 5 batches must agree. The tally is
	# built from the PCR-free batch calls themselves, not from the PCR tally.
	pcr_free_consensus = _site_consensus(
		[decide_by_rep(itemgetter(*cols)(strings)) for cols in _PCR_FREE_SITE_COLUMNS], 3)

	samples = strings[9:]
	consensus_alt_seq = ''
	if pcr_consensus == pcr_free_consensus and _is_variant_call(pcr_consensus):
		# Both library types support the same variant genotype.
		vpct = vote_percentage(samples, pcr_consensus)
		dpct = detected_percentage(samples)
		strings[7] = 'VPCT=' + vpct + ';DPCT=' + dpct
		strings[6] = 'reproducible'
		# Delete multiple alternative genotype to necessary expression
		consensus_call, consensus_alt_seq = _reduce_multiallelic(pcr_consensus, strings[4])
	else:
		consensus_call = _classify_discordant(pcr_consensus, pcr_free_consensus)
		strings[6] = consensus_call
		strings[7] = 'DPCT=' + detected_percentage(samples)
	return [pcr_consensus, pcr_free_consensus, consensus_call, consensus_alt_seq]


def main():
	"""Annotate every record of the input VCF and write it to the output file.

	Header lines (starting with '#') and blank lines are skipped. A record
	whose "<CHROM>_<POS>" identifier is in the duplication list gets FILTER
	'dupVar' and placeholder summary columns. Every other record is voted on
	per batch, per library type and across library types, and is written with
	its FILTER, INFO and four summary columns filled in.

	Reads the module-level ``multi_sample_vcf``, ``var_dup`` and ``outfile``;
	``outfile`` is closed when processing ends, whether or not an error occurred.
	"""
	# Build the lookup once: set membership instead of scanning the list per record.
	dup_ids = set(var_dup)
	try:
		with fileinput.input(multi_sample_vcf) as vcf_lines:
			for line in vcf_lines:
				if line.startswith('#'):
					continue
				line = line.strip()
				if not line:
					# a blank line carries no record
					continue
				strings = line.split('\t')
				variant_id = '_'.join([strings[0], strings[1]])
				# check if the variants location is duplicated
				if variant_id in dup_ids:
					strings[6] = 'dupVar'
					summary = ['.', '.', 'dupVar', '.']
				else:
					summary = _annotate_record(strings)
				outfile.write('\t'.join(strings + summary) + '\n')
	finally:
		# flush and release the output handle even when a record fails to parse
		outfile.close()


if __name__ == '__main__':
	main()