You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

high_confidence_call_vote.py 10KB

5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264
  1. from __future__ import division
  2. import sys, argparse, os
  3. import fileinput
  4. import re
  5. import pandas as pd
  6. from operator import itemgetter
  7. from collections import Counter
  8. from itertools import islice
  9. # input arguments
  10. parser = argparse.ArgumentParser(description="this script is to count voting number")
  11. parser.add_argument('-vcf', '--multi_sample_vcf', type=str, help='The VCF file you want to count the voting number', required=True)
  12. parser.add_argument('-dup', '--dup_list', type=str, help='Duplication list', required=True)
  13. parser.add_argument('-sample', '--sample_name', type=str, help='which sample of quartet', required=True)
  14. parser.add_argument('-prefix', '--prefix', type=str, help='Prefix of output file name', required=True)
  15. args = parser.parse_args()
  16. multi_sample_vcf = args.multi_sample_vcf
  17. dup_list = args.dup_list
  18. prefix = args.prefix
  19. sample_name = args.sample_name
  20. vcf_header = '''##fileformat=VCFv4.2
  21. ##fileDate=20191224
  22. ##source=high_confidence_calls_intergration(choppy app)
  23. ##reference=GRCh38.d1.vd1
  24. ##INFO=<ID=DPCT,Number=1,Type=Float,Description="Percentage of detected votes">
  25. ##INFO=<ID=VPCT,Number=1,Type=Float,Description="Percentage of consnesus votes">
  26. ##INFO=<ID=FPCT,Number=1,Type=Float,Description="Percentage of mendelian consisitent votes">
  27. ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
  28. ##contig=<ID=chr1,length=248956422>
  29. ##contig=<ID=chr2,length=242193529>
  30. ##contig=<ID=chr3,length=198295559>
  31. ##contig=<ID=chr4,length=190214555>
  32. ##contig=<ID=chr5,length=181538259>
  33. ##contig=<ID=chr6,length=170805979>
  34. ##contig=<ID=chr7,length=159345973>
  35. ##contig=<ID=chr8,length=145138636>
  36. ##contig=<ID=chr9,length=138394717>
  37. ##contig=<ID=chr10,length=133797422>
  38. ##contig=<ID=chr11,length=135086622>
  39. ##contig=<ID=chr12,length=133275309>
  40. ##contig=<ID=chr13,length=114364328>
  41. ##contig=<ID=chr14,length=107043718>
  42. ##contig=<ID=chr15,length=101991189>
  43. ##contig=<ID=chr16,length=90338345>
  44. ##contig=<ID=chr17,length=83257441>
  45. ##contig=<ID=chr18,length=80373285>
  46. ##contig=<ID=chr19,length=58617616>
  47. ##contig=<ID=chr20,length=64444167>
  48. ##contig=<ID=chr21,length=46709983>
  49. ##contig=<ID=chr22,length=50818468>
  50. ##contig=<ID=chrX,length=156040895>
  51. '''
  52. # read in duplication list
  53. dup = pd.read_table(dup_list,header=None)
  54. var_dup = dup[0].tolist()
  55. # output file
  56. file_name = prefix + '_annotated.vcf'
  57. outfile = open(file_name,'w')
  58. # write VCF
  59. outputcolumn = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tQuartet_DNA_BGI_SEQ2000_BGI_1_20180518_LCL5\tQuartet_DNA_BGI_SEQ2000_BGI_2_20180530_LCL5\tQuartet_DNA_BGI_SEQ2000_BGI_3_20180530_LCL5\tQuartet_DNA_BGI_T7_WGE_1_20191105_LCL5\tQuartet_DNA_BGI_T7_WGE_2_20191105_LCL5\tQuartet_DNA_BGI_T7_WGE_3_20191105_LCL5\tQuartet_DNA_ILM_Nova_ARD_1_20181108_LCL5\tQuartet_DNA_ILM_Nova_ARD_2_20181108_LCL5\tQuartet_DNA_ILM_Nova_ARD_3_20181108_LCL5\tQuartet_DNA_ILM_Nova_ARD_4_20190111_LCL5\tQuartet_DNA_ILM_Nova_ARD_5_20190111_LCL5\tQuartet_DNA_ILM_Nova_ARD_6_20190111_LCL5\tQuartet_DNA_ILM_Nova_BRG_1_20180930_LCL5\tQuartet_DNA_ILM_Nova_BRG_2_20180930_LCL5\tQuartet_DNA_ILM_Nova_BRG_3_20180930_LCL5\tQuartet_DNA_ILM_Nova_WUX_1_20190917_LCL5\tQuartet_DNA_ILM_Nova_WUX_2_20190917_LCL5\tQuartet_DNA_ILM_Nova_WUX_3_20190917_LCL5\tQuartet_DNA_ILM_XTen_ARD_1_20170403_LCL5\tQuartet_DNA_ILM_XTen_ARD_2_20170403_LCL5\tQuartet_DNA_ILM_XTen_ARD_3_20170403_LCL5\tQuartet_DNA_ILM_XTen_NVG_1_20170329_LCL5\tQuartet_DNA_ILM_XTen_NVG_2_20170329_LCL5\tQuartet_DNA_ILM_XTen_NVG_3_20170329_LCL5\tQuartet_DNA_ILM_XTen_WUX_1_20170216_LCL5\tQuartet_DNA_ILM_XTen_WUX_2_20170216_LCL5\tQuartet_DNA_ILM_XTen_WUX_3_20170216_LCL5' +'\t'+ sample_name+'_pcr'+'\t' + sample_name+'_pcr-free'+ '\t'+ sample_name +'_consensus' + '\t' + sample_name + '_consensus_alt_seq' +'\n'
  60. outfile.write(vcf_header)
  61. outfile.write(outputcolumn)
  62. #function
  63. def detected_percentage(strings):
  64. strings = [x.replace('0/0','.') for x in strings]
  65. gt = [x.split(':')[0] for x in strings]
  66. percentage = round((27 - gt.count('.'))/27,4)
  67. return(str(percentage))
  68. def vote_percentage(strings,consensus_call):
  69. strings = [x.replace('.','0/0') for x in strings]
  70. gt = [x.split(':')[0] for x in strings]
  71. gt = list(map(gt_uniform,[i for i in gt]))
  72. percentage = round(gt.count(consensus_call)/27,4)
  73. return(str(percentage))
  74. def family_vote(strings,consensus_call):
  75. strings = [x.replace('.','0/0') for x in strings]
  76. gt = [x.split(':')[0] for x in strings]
  77. gt = list(map(gt_uniform,[i for i in gt]))
  78. mendelian = [x[-5:] for x in strings]
  79. indices = [i for i, x in enumerate(gt) if x == consensus_call]
  80. matched_mendelian = itemgetter(*indices)(mendelian)
  81. percentage = round(matched_mendelian.count('1:1:1')/27,4)
  82. return(str(percentage))
  83. def gt_uniform(strings):
  84. uniformed_gt = ''
  85. allele1 = strings.split('/')[0]
  86. allele2 = strings.split('/')[1]
  87. if int(allele1) > int(allele2):
  88. uniformed_gt = allele2 + '/' + allele1
  89. else:
  90. uniformed_gt = allele1 + '/' + allele2
  91. return uniformed_gt
  92. def decide_by_rep(strings):
  93. consensus_rep = ''
  94. mendelian = [x[-5:] for x in strings]
  95. strings = [x.replace('.','0/0') for x in strings]
  96. gt = [x.split(':')[0] for x in strings]
  97. # modified gt turn 2/1 to 1/2
  98. gt = list(map(gt_uniform,[i for i in gt]))
  99. # mendelian consistent?
  100. mendelian_dict = Counter(mendelian)
  101. highest_mendelian = mendelian_dict.most_common(1)
  102. candidate_mendelian = highest_mendelian[0][0]
  103. freq_mendelian = highest_mendelian[0][1]
  104. if (candidate_mendelian == '1:1:1') and (freq_mendelian >= 2):
  105. gt_num_dict = Counter(gt)
  106. highest_gt = gt_num_dict.most_common(1)
  107. candidate_gt = highest_gt[0][0]
  108. freq_gt = highest_gt[0][1]
  109. if (candidate_gt != '0/0') and (freq_gt >= 2):
  110. consensus_rep = candidate_gt
  111. elif (candidate_gt == '0/0') and (freq_gt >= 2):
  112. consensus_rep = '0/0'
  113. else:
  114. consensus_rep = 'inconGT'
  115. elif (candidate_mendelian == '.') and (freq_mendelian >= 2):
  116. consensus_rep = 'noInfo'
  117. else:
  118. consensus_rep = 'inconMen'
  119. return consensus_rep
  120. def main():
  121. for line in fileinput.input(multi_sample_vcf):
  122. headline = re.match('^\#',line)
  123. if headline is not None:
  124. pass
  125. else:
  126. line = line.strip()
  127. strings = line.split('\t')
  128. variant_id = '_'.join([strings[0],strings[1]])
  129. # check if the variants location is duplicated
  130. if variant_id in var_dup:
  131. strings[6] = 'dupVar'
  132. outLine = '\t'.join(strings) + '\t' + '.' +'\t' + '.' + '\t' + 'dupVar' + '\t' + '.' +'\n'
  133. outfile.write(outLine)
  134. else:
  135. # pre-define
  136. pcr_consensus = '.'
  137. pcr_free_consensus = '.'
  138. consensus_call = '.'
  139. consensus_alt_seq = '.'
  140. # pcr
  141. pcr = itemgetter(*[9,10,11,27,28,29,30,31,32,33,34,35])(strings)
  142. SEQ2000 = decide_by_rep(pcr[0:3])
  143. XTen_ARD = decide_by_rep(pcr[3:6])
  144. XTen_NVG = decide_by_rep(pcr[6:9])
  145. XTen_WUX = decide_by_rep(pcr[9:12])
  146. sequence_site = [SEQ2000,XTen_ARD,XTen_NVG,XTen_WUX]
  147. sequence_dict = Counter(sequence_site)
  148. highest_sequence = sequence_dict.most_common(1)
  149. candidate_sequence = highest_sequence[0][0]
  150. freq_sequence = highest_sequence[0][1]
  151. if freq_sequence > 2:
  152. pcr_consensus = candidate_sequence
  153. else:
  154. pcr_consensus = 'inconSequenceSite'
  155. # pcr-free
  156. pcr_free = itemgetter(*[12,13,14,15,16,17,18,19,20,21,22,23,24,25,26])(strings)
  157. #SEQ2000 = decide_by_rep(pcr_free[0])
  158. T7_WGE = decide_by_rep(pcr_free[0:3])
  159. Nova_ARD_1 = decide_by_rep(pcr_free[3:6])
  160. Nova_ARD_2 = decide_by_rep(pcr_free[6:9])
  161. Nova_BRG = decide_by_rep(pcr_free[9:12])
  162. Nova_WUX = decide_by_rep(pcr_free[12:15])
  163. sequence_site = [T7_WGE,Nova_ARD_1,Nova_ARD_2,Nova_BRG,Nova_WUX]
  164. highest_sequence = sequence_dict.most_common(1)
  165. candidate_sequence = highest_sequence[0][0]
  166. freq_sequence = highest_sequence[0][1]
  167. if freq_sequence > 3:
  168. pcr_free_consensus = candidate_sequence
  169. else:
  170. pcr_free_consensus = 'inconSequenceSite'
  171. # pcr and pcr-free
  172. tag = ['inconGT','noInfo','inconMen','inconSequenceSite']
  173. if (pcr_consensus == pcr_free_consensus) and (pcr_consensus not in tag) and (pcr_consensus != '0/0'):
  174. consensus_call = pcr_consensus
  175. VPCT = vote_percentage(strings[9:],consensus_call)
  176. strings[7] = 'VPCT=' + VPCT
  177. DPCT = detected_percentage(strings[9:])
  178. strings[7] = strings[7] + ';DPCT=' + DPCT
  179. FPCT = family_vote(strings[9:],consensus_call)
  180. strings[7] = strings[7] + ';FPCT=' + FPCT
  181. # Delete multiple alternative genotype to necessary expression
  182. strings[6] = 'reproducible'
  183. alt = strings[4]
  184. alt_gt = alt.split(',')
  185. if len(alt_gt) > 1:
  186. allele1 = consensus_call.split('/')[0]
  187. allele2 = consensus_call.split('/')[1]
  188. if allele1 == '0':
  189. allele2_seq = alt_gt[int(allele2) - 1]
  190. consensus_alt_seq = allele2_seq
  191. consensus_call = '0/1'
  192. else:
  193. allele1_seq = alt_gt[int(allele1) - 1]
  194. allele2_seq = alt_gt[int(allele2) - 1]
  195. if int(allele1) > int(allele2):
  196. consensus_alt_seq = allele2_seq + ',' + allele1_seq
  197. consensus_call = '1/2'
  198. elif int(allele1) < int(allele2):
  199. consensus_alt_seq = allele1_seq + ',' + allele2_seq
  200. consensus_call = '1/2'
  201. else:
  202. consensus_alt_seq = allele1_seq
  203. consensus_call = '1/1'
  204. else:
  205. consensus_alt_seq = alt
  206. elif (pcr_consensus in tag) and (pcr_free_consensus in tag):
  207. consensus_call = 'filtered'
  208. strings[6] = 'filtered'
  209. DPCT = detected_percentage(strings[9:])
  210. strings[7] = 'DPCT=' + DPCT
  211. elif ((pcr_consensus == '0/0') or (pcr_consensus in tag)) and ((pcr_free_consensus not in tag) and (pcr_free_consensus != '0/0')):
  212. consensus_call = 'pcr-free-speicifc'
  213. strings[6] = 'pcr-free-speicifc'
  214. DPCT = detected_percentage(strings[9:])
  215. strings[7] = 'DPCT=' + DPCT
  216. elif ((pcr_consensus != '0/0') or (pcr_consensus not in tag)) and ((pcr_free_consensus in tag) and (pcr_free_consensus == '0/0')):
  217. consensus_call = 'pcr-speicifc'
  218. strings[6] = 'pcr-speicifc'
  219. DPCT = detected_percentage(strings[9:])
  220. strings[7] = 'DPCT=' + DPCT
  221. elif (pcr_consensus == '0/0') and (pcr_free_consensus == '0/0'):
  222. consensus_call = 'confirm for parents'
  223. strings[6] = 'confirm for parents'
  224. DPCT = detected_percentage(strings[9:])
  225. strings[7] = 'DPCT=' + DPCT
  226. else:
  227. consensus_call = 'filtered'
  228. strings[6] = 'filtered'
  229. DPCT = detected_percentage(strings[9:])
  230. strings[7] = 'DPCT=' + DPCT
  231. # output
  232. outLine = '\t'.join(strings) + '\t' + pcr_consensus +'\t' + pcr_free_consensus + '\t' + consensus_call + '\t' + consensus_alt_seq + '\n'
  233. outfile.write(outLine)
  234. if __name__ == '__main__':
  235. main()