You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

256 lines
9.8KB

  1. # import modules
  2. import sys, argparse, os
  3. import fileinput
  4. import re
  5. import pandas as pd
  6. from operator import itemgetter
  7. from collections import Counter
  8. from itertools import islice
  9. from __future__ import division
  10. # input arguments
  11. parser = argparse.ArgumentParser(description="this script is to count voting number")
  12. parser.add_argument('-vcf', '--multi_sample_vcf', type=str, help='The VCF file you want to count the voting number', required=True)
  13. parser.add_argument('-dup', '--dup_list', type=str, help='Duplication list', required=True)
  14. parser.add_argument('-sample', '--sample_name', type=str, help='which sample of quartet', required=True)
  15. parser.add_argument('-prefix', '--prefix', type=str, help='Prefix of output file name', required=True)
  16. args = parser.parse_args()
  17. multi_sample_vcf = args.multi_sample_vcf
  18. dup_list = args.dup_list
  19. prefix = args.prefix
  20. sample_name = args.sample_name
  21. vcf_header = '''##fileformat=VCFv4.2
  22. ##fileDate=20191224
  23. ##source=high_confidence_calls_intergration(choppy app)
  24. ##reference=GRCh38.d1.vd1
  25. ##INFO=<ID=DPCT,Number=1,Type=Float,Description="Percentage of detected votes">
  26. ##INFO=<ID=VPCT,Number=1,Type=Float,Description="Percentage of consnesus votes">
  27. ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
  28. ##contig=<ID=chr1,length=248956422>
  29. ##contig=<ID=chr2,length=242193529>
  30. ##contig=<ID=chr3,length=198295559>
  31. ##contig=<ID=chr4,length=190214555>
  32. ##contig=<ID=chr5,length=181538259>
  33. ##contig=<ID=chr6,length=170805979>
  34. ##contig=<ID=chr7,length=159345973>
  35. ##contig=<ID=chr8,length=145138636>
  36. ##contig=<ID=chr9,length=138394717>
  37. ##contig=<ID=chr10,length=133797422>
  38. ##contig=<ID=chr11,length=135086622>
  39. ##contig=<ID=chr12,length=133275309>
  40. ##contig=<ID=chr13,length=114364328>
  41. ##contig=<ID=chr14,length=107043718>
  42. ##contig=<ID=chr15,length=101991189>
  43. ##contig=<ID=chr16,length=90338345>
  44. ##contig=<ID=chr17,length=83257441>
  45. ##contig=<ID=chr18,length=80373285>
  46. ##contig=<ID=chr19,length=58617616>
  47. ##contig=<ID=chr20,length=64444167>
  48. ##contig=<ID=chr21,length=46709983>
  49. ##contig=<ID=chr22,length=50818468>
  50. ##contig=<ID=chrX,length=156040895>
  51. '''
  52. # read in duplication list
  53. dup = pd.read_table(dup_list,header=None)
  54. var_dup = dup[0].tolist()
  55. # output file
  56. file_name = prefix + '_annotated.vcf'
  57. outfile = open(file_name,'w')
  58. # write VCF
  59. outputcolumn = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tQuartet_DNA_BGI_SEQ2000_BGI_1_20180518_LCL5\tQuartet_DNA_BGI_SEQ2000_BGI_2_20180530_LCL5\tQuartet_DNA_BGI_SEQ2000_BGI_3_20180530_LCL5\tQuartet_DNA_BGI_T7_WGE_1_20191105_LCL5\tQuartet_DNA_BGI_T7_WGE_2_20191105_LCL5\tQuartet_DNA_BGI_T7_WGE_3_20191105_LCL5\tQuartet_DNA_ILM_Nova_ARD_1_20181108_LCL5\tQuartet_DNA_ILM_Nova_ARD_2_20181108_LCL5\tQuartet_DNA_ILM_Nova_ARD_3_20181108_LCL5\tQuartet_DNA_ILM_Nova_ARD_4_20190111_LCL5\tQuartet_DNA_ILM_Nova_ARD_5_20190111_LCL5\tQuartet_DNA_ILM_Nova_ARD_6_20190111_LCL5\tQuartet_DNA_ILM_Nova_BRG_1_20180930_LCL5\tQuartet_DNA_ILM_Nova_BRG_2_20180930_LCL5\tQuartet_DNA_ILM_Nova_BRG_3_20180930_LCL5\tQuartet_DNA_ILM_Nova_WUX_1_20190917_LCL5\tQuartet_DNA_ILM_Nova_WUX_2_20190917_LCL5\tQuartet_DNA_ILM_Nova_WUX_3_20190917_LCL5\tQuartet_DNA_ILM_XTen_ARD_1_20170403_LCL5\tQuartet_DNA_ILM_XTen_ARD_2_20170403_LCL5\tQuartet_DNA_ILM_XTen_ARD_3_20170403_LCL5\tQuartet_DNA_ILM_XTen_NVG_1_20170329_LCL5\tQuartet_DNA_ILM_XTen_NVG_2_20170329_LCL5\tQuartet_DNA_ILM_XTen_NVG_3_20170329_LCL5\tQuartet_DNA_ILM_XTen_WUX_1_20170216_LCL5\tQuartet_DNA_ILM_XTen_WUX_2_20170216_LCL5\tQuartet_DNA_ILM_XTen_WUX_3_20170216_LCL5' +'\t'+ sample_name+'_pcr'+'\t' + sample_name+'_pcr-free'+ '\t'+ sample_name +'_consensus' + '\t' + sample_name + '_consensus_alt_seq' +'\n'
  60. outfile.write(vcf_header)
  61. outfile.write(outputcolumn)
  62. #function
  63. def detected_percentage(strings):
  64. strings = [x.replace('0/0','.') for x in strings]
  65. gt = [x.split(':')[0] for x in strings]
  66. percentage = round((27 - gt.count('.'))/27,4)
  67. return(str(percentage))
  68. def vote_percentage(strings,consensus_call):
  69. strings = [x.replace('.','0/0') for x in strings]
  70. gt = [x.split(':')[0] for x in strings]
  71. gt = list(map(gt_uniform,[i for i in gt]))
  72. percentage = round(gt.count(consensus_call)/27,4)
  73. return(str(percentage))
  74. def family_vote(strings,consensus_call):
  75. pass
  76. def gt_uniform(strings):
  77. uniformed_gt = ''
  78. allele1 = strings.split('/')[0]
  79. allele2 = strings.split('/')[1]
  80. if int(allele1) > int(allele2):
  81. uniformed_gt = allele2 + '/' + allele1
  82. else:
  83. uniformed_gt = allele1 + '/' + allele2
  84. return uniformed_gt
  85. def decide_by_rep(strings):
  86. consensus_rep = ''
  87. mendelian = [x[-5:] for x in strings]
  88. strings = [x.replace('.','0/0') for x in strings]
  89. gt = [x.split(':')[0] for x in strings]
  90. # modified gt turn 2/1 to 1/2
  91. gt = list(map(gt_uniform,[i for i in gt]))
  92. # mendelian consistent?
  93. mendelian_dict = Counter(mendelian)
  94. highest_mendelian = mendelian_dict.most_common(1)
  95. candidate_mendelian = highest_mendelian[0][0]
  96. freq_mendelian = highest_mendelian[0][1]
  97. if (candidate_mendelian == '1:1:1') and (freq_mendelian >= 2):
  98. gt_num_dict = Counter(gt)
  99. highest_gt = gt_num_dict.most_common(1)
  100. candidate_gt = highest_gt[0][0]
  101. freq_gt = highest_gt[0][1]
  102. if (candidate_gt != '0/0') and (freq_gt >= 2):
  103. consensus_rep = candidate_gt
  104. elif (candidate_gt == '0/0') and (freq_gt >= 2):
  105. consensus_rep = '0/0'
  106. else:
  107. consensus_rep = 'inconGT'
  108. elif (candidate_mendelian == '.') and (freq_mendelian >= 2):
  109. consensus_rep = 'noInfo'
  110. else:
  111. consensus_rep = 'inconMen'
  112. return consensus_rep
  113. def main():
  114. for line in fileinput.input(multi_sample_vcf):
  115. headline = re.match('^\#',line)
  116. if headline is not None:
  117. pass
  118. else:
  119. line = line.strip()
  120. strings = line.split('\t')
  121. variant_id = '_'.join([strings[0],strings[1]])
  122. # check if the variants location is duplicated
  123. if variant_id in var_dup:
  124. strings[6] = 'dupVar'
  125. outLine = '\t'.join(strings) + '\t' + '.' +'\t' + '.' + '\t' + 'dupVar' + '\t' + '.' +'\n'
  126. outfile.write(outLine)
  127. else:
  128. # pre-define
  129. pcr_consensus = ''
  130. pcr_free_consensus = ''
  131. consensus_call = ''
  132. consensus_alt_seq = ''
  133. # pcr
  134. pcr = itemgetter(*[9,10,11,27,28,29,30,31,32,33,34,35])(strings)
  135. SEQ2000 = decide_by_rep(pcr[0:3])
  136. XTen_ARD = decide_by_rep(pcr[3:6])
  137. XTen_NVG = decide_by_rep(pcr[6:9])
  138. XTen_WUX = decide_by_rep(pcr[9:12])
  139. sequence_site = [SEQ2000,XTen_ARD,XTen_NVG,XTen_WUX]
  140. sequence_dict = Counter(sequence_site)
  141. highest_sequence = sequence_dict.most_common(1)
  142. candidate_sequence = highest_sequence[0][0]
  143. freq_sequence = highest_sequence[0][1]
  144. if freq_sequence > 2:
  145. pcr_consensus = candidate_sequence
  146. else:
  147. pcr_consensus = 'inconSequenceSite'
  148. # pcr-free
  149. pcr_free = itemgetter(*[12,13,14,15,16,17,18,19,20,21,22,23,24,25,26])(strings)
  150. #SEQ2000 = decide_by_rep(pcr_free[0])
  151. T7_WGE = decide_by_rep(pcr_free[0:3])
  152. Nova_ARD_1 = decide_by_rep(pcr_free[3:6])
  153. Nova_ARD_2 = decide_by_rep(pcr_free[6:9])
  154. Nova_BRG = decide_by_rep(pcr_free[9:12])
  155. Nova_WUX = decide_by_rep(pcr_free[12:15])
  156. sequence_site = [T7_WGE,Nova_ARD_1,Nova_ARD_2,Nova_BRG,Nova_WUX]
  157. highest_sequence = sequence_dict.most_common(1)
  158. candidate_sequence = highest_sequence[0][0]
  159. freq_sequence = highest_sequence[0][1]
  160. if freq_sequence > 3:
  161. pcr_free_consensus = candidate_sequence
  162. else:
  163. pcr_free_consensus = 'inconSequenceSite'
  164. # pcr and pcr-free
  165. tag = ['inconGT','noInfo','inconMen','inconSequenceSite']
  166. if (pcr_consensus == pcr_free_consensus) and (pcr_consensus not in tag) and (pcr_consensus != '0/0'):
  167. consensus_call = pcr_consensus
  168. VPCT = vote_percentage(strings[9:],consensus_call)
  169. strings[7] = 'VPCT=' + VPCT
  170. DPCT = detected_percentage(strings[9:])
  171. strings[7] = strings[7] + ';DPCT=' + DPCT
  172. # Delete multiple alternative genotype to necessary expression
  173. strings[6] = 'reproducible'
  174. alt = strings[4]
  175. alt_gt = alt.split(',')
  176. if len(alt_gt) > 1:
  177. allele1 = consensus_call.split('/')[0]
  178. allele2 = consensus_call.split('/')[1]
  179. if allele1 == '0':
  180. allele2_seq = alt_gt[int(allele2) - 1]
  181. consensus_alt_seq = allele2_seq
  182. consensus_call = '0/1'
  183. else:
  184. allele1_seq = alt_gt[int(allele1) - 1]
  185. allele2_seq = alt_gt[int(allele2) - 1]
  186. if int(allele1) > int(allele2):
  187. consensus_alt_seq = allele2_seq + ',' + allele1_seq
  188. consensus_call = '1/2'
  189. elif int(allele1) < int(allele2):
  190. consensus_alt_seq = allele1_seq + ',' + allele2_seq
  191. consensus_call = '1/2'
  192. else:
  193. consensus_alt_seq = allele1_seq
  194. consensus_call = '1/1'
  195. else:
  196. consensus_alt_seq = alt
  197. elif (pcr_consensus in tag) and (pcr_free_consensus in tag):
  198. consensus_call = 'filtered'
  199. strings[6] = 'filtered'
  200. DPCT = detected_percentage(strings[9:])
  201. strings[7] = 'DPCT=' + DPCT
  202. elif ((pcr_consensus == '0/0') or (pcr_consensus in tag)) and ((pcr_free_consensus not in tag) and (pcr_free_consensus != '0/0')):
  203. consensus_call = 'pcr-free-speicifc'
  204. strings[6] = 'pcr-free-speicifc'
  205. DPCT = detected_percentage(strings[9:])
  206. strings[7] = 'DPCT=' + DPCT
  207. elif ((pcr_consensus != '0/0') or (pcr_consensus not in tag)) and ((pcr_free_consensus in tag) and (pcr_free_consensus == '0/0')):
  208. consensus_call = 'pcr-speicifc'
  209. strings[6] = 'pcr-speicifc'
  210. DPCT = detected_percentage(strings[9:])
  211. strings[7] = 'DPCT=' + DPCT
  212. elif (pcr_consensus == '0/0') and (pcr_free_consensus == '0/0'):
  213. consensus_call = 'confirm for parents'
  214. strings[6] = 'confirm for parents'
  215. DPCT = detected_percentage(strings[9:])
  216. strings[7] = 'DPCT=' + DPCT
  217. else:
  218. consensus_call = 'filtered'
  219. strings[6] = 'filtered'
  220. DPCT = detected_percentage(strings[9:])
  221. strings[7] = 'DPCT=' + DPCT
  222. # output
  223. outLine = '\t'.join(strings) + '\t' + pcr_consensus +'\t' + pcr_free_consensus + '\t' + consensus_call + '\t' + consensus_alt_seq + '\n'
  224. outfile.write(outLine)
  225. if __name__ == '__main__':
  226. main()