You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

255 lines
9.8KB

  1. # import modules
  2. import sys, argparse, os
  3. import fileinput
  4. import re
  5. import pandas as pd
  6. from operator import itemgetter
  7. from collections import Counter
  8. from itertools import islice
  9. # input arguments
  10. parser = argparse.ArgumentParser(description="this script is to count voting number")
  11. parser.add_argument('-vcf', '--multi_sample_vcf', type=str, help='The VCF file you want to count the voting number', required=True)
  12. parser.add_argument('-dup', '--dup_list', type=str, help='Duplication list', required=True)
  13. parser.add_argument('-sample', '--sample_name', type=str, help='which sample of quartet', required=True)
  14. parser.add_argument('-prefix', '--prefix', type=str, help='Prefix of output file name', required=True)
  15. args = parser.parse_args()
  16. multi_sample_vcf = args.multi_sample_vcf
  17. dup_list = args.dup_list
  18. prefix = args.prefix
  19. sample_name = args.sample_name
  20. vcf_header = '''##fileformat=VCFv4.2
  21. ##fileDate=20191224
  22. ##source=high_confidence_calls_intergration(choppy app)
  23. ##reference=GRCh38.d1.vd1
  24. ##INFO=<ID=DPCT,Number=1,Type=Float,Description="Percentage of detected votes">
  25. ##INFO=<ID=VPCT,Number=1,Type=Float,Description="Percentage of consnesus votes">
  26. ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
  27. ##contig=<ID=chr1,length=248956422>
  28. ##contig=<ID=chr2,length=242193529>
  29. ##contig=<ID=chr3,length=198295559>
  30. ##contig=<ID=chr4,length=190214555>
  31. ##contig=<ID=chr5,length=181538259>
  32. ##contig=<ID=chr6,length=170805979>
  33. ##contig=<ID=chr7,length=159345973>
  34. ##contig=<ID=chr8,length=145138636>
  35. ##contig=<ID=chr9,length=138394717>
  36. ##contig=<ID=chr10,length=133797422>
  37. ##contig=<ID=chr11,length=135086622>
  38. ##contig=<ID=chr12,length=133275309>
  39. ##contig=<ID=chr13,length=114364328>
  40. ##contig=<ID=chr14,length=107043718>
  41. ##contig=<ID=chr15,length=101991189>
  42. ##contig=<ID=chr16,length=90338345>
  43. ##contig=<ID=chr17,length=83257441>
  44. ##contig=<ID=chr18,length=80373285>
  45. ##contig=<ID=chr19,length=58617616>
  46. ##contig=<ID=chr20,length=64444167>
  47. ##contig=<ID=chr21,length=46709983>
  48. ##contig=<ID=chr22,length=50818468>
  49. ##contig=<ID=chrX,length=156040895>
  50. '''
  51. # read in duplication list
  52. dup = pd.read_table(dup_list,header=None)
  53. var_dup = dup[0].tolist()
  54. # output file
  55. file_name = prefix + '_annotated.vcf'
  56. outfile = open(file_name,'w')
  57. # write VCF
  58. outputcolumn = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tQuartet_DNA_BGI_SEQ2000_BGI_1_20180518_LCL5\tQuartet_DNA_BGI_SEQ2000_BGI_2_20180530_LCL5\tQuartet_DNA_BGI_SEQ2000_BGI_3_20180530_LCL5\tQuartet_DNA_BGI_T7_WGE_1_20191105_LCL5\tQuartet_DNA_BGI_T7_WGE_2_20191105_LCL5\tQuartet_DNA_BGI_T7_WGE_3_20191105_LCL5\tQuartet_DNA_ILM_Nova_ARD_1_20181108_LCL5\tQuartet_DNA_ILM_Nova_ARD_2_20181108_LCL5\tQuartet_DNA_ILM_Nova_ARD_3_20181108_LCL5\tQuartet_DNA_ILM_Nova_ARD_4_20190111_LCL5\tQuartet_DNA_ILM_Nova_ARD_5_20190111_LCL5\tQuartet_DNA_ILM_Nova_ARD_6_20190111_LCL5\tQuartet_DNA_ILM_Nova_BRG_1_20180930_LCL5\tQuartet_DNA_ILM_Nova_BRG_2_20180930_LCL5\tQuartet_DNA_ILM_Nova_BRG_3_20180930_LCL5\tQuartet_DNA_ILM_Nova_WUX_1_20190917_LCL5\tQuartet_DNA_ILM_Nova_WUX_2_20190917_LCL5\tQuartet_DNA_ILM_Nova_WUX_3_20190917_LCL5\tQuartet_DNA_ILM_XTen_ARD_1_20170403_LCL5\tQuartet_DNA_ILM_XTen_ARD_2_20170403_LCL5\tQuartet_DNA_ILM_XTen_ARD_3_20170403_LCL5\tQuartet_DNA_ILM_XTen_NVG_1_20170329_LCL5\tQuartet_DNA_ILM_XTen_NVG_2_20170329_LCL5\tQuartet_DNA_ILM_XTen_NVG_3_20170329_LCL5\tQuartet_DNA_ILM_XTen_WUX_1_20170216_LCL5\tQuartet_DNA_ILM_XTen_WUX_2_20170216_LCL5\tQuartet_DNA_ILM_XTen_WUX_3_20170216_LCL5' +'\t'+ sample_name+'_pcr'+'\t' + sample_name+'_pcr-free'+ '\t'+ sample_name +'_consensus' + '\t' + sample_name + '_consensus_alt_seq' +'\n'
  59. outfile.write(vcf_header)
  60. outfile.write(outputcolumn)
  61. #function
  62. def detected_percentage(strings):
  63. strings = [x.replace('0/0','.') for x in strings]
  64. gt = [x.split(':')[0] for x in strings]
  65. percentage = round((27 - gt.count('.'))/27,4)
  66. return(str(percentage))
  67. def vote_percentage(strings,consensus_call):
  68. strings = [x.replace('.','0/0') for x in strings]
  69. gt = [x.split(':')[0] for x in strings]
  70. gt = list(map(gt_uniform,[i for i in gt]))
  71. percentage = round(gt.count(consensus_call)/27,4)
  72. return(str(percentage))
  73. def family_vote(strings,consensus_call):
  74. pass
  75. def gt_uniform(strings):
  76. uniformed_gt = ''
  77. allele1 = strings.split('/')[0]
  78. allele2 = strings.split('/')[1]
  79. if int(allele1) > int(allele2):
  80. uniformed_gt = allele2 + '/' + allele1
  81. else:
  82. uniformed_gt = allele1 + '/' + allele2
  83. return uniformed_gt
  84. def decide_by_rep(strings):
  85. consensus_rep = ''
  86. mendelian = [x[-5:] for x in strings]
  87. strings = [x.replace('.','0/0') for x in strings]
  88. gt = [x.split(':')[0] for x in strings]
  89. # modified gt turn 2/1 to 1/2
  90. gt = list(map(gt_uniform,[i for i in gt]))
  91. # mendelian consistent?
  92. mendelian_dict = Counter(mendelian)
  93. highest_mendelian = mendelian_dict.most_common(1)
  94. candidate_mendelian = highest_mendelian[0][0]
  95. freq_mendelian = highest_mendelian[0][1]
  96. if (candidate_mendelian == '1:1:1') and (freq_mendelian >= 2):
  97. gt_num_dict = Counter(gt)
  98. highest_gt = gt_num_dict.most_common(1)
  99. candidate_gt = highest_gt[0][0]
  100. freq_gt = highest_gt[0][1]
  101. if (candidate_gt != '0/0') and (freq_gt >= 2):
  102. consensus_rep = candidate_gt
  103. elif (candidate_gt == '0/0') and (freq_gt >= 2):
  104. consensus_rep = '0/0'
  105. else:
  106. consensus_rep = 'inconGT'
  107. elif (candidate_mendelian == '.') and (freq_mendelian >= 2):
  108. consensus_rep = 'noInfo'
  109. else:
  110. consensus_rep = 'inconMen'
  111. return consensus_rep
  112. def main():
  113. for line in fileinput.input(multi_sample_vcf):
  114. headline = re.match('^\#',line)
  115. if headline is not None:
  116. pass
  117. else:
  118. line = line.strip()
  119. strings = line.split('\t')
  120. variant_id = '_'.join([strings[0],strings[1]])
  121. # check if the variants location is duplicated
  122. if variant_id in var_dup:
  123. strings[6] = 'dupVar'
  124. outLine = '\t'.join(strings) + '\t' + '.' +'\t' + '.' + '\t' + 'dupVar' + '\t' + '.' +'\n'
  125. outfile.write(outLine)
  126. else:
  127. # pre-define
  128. pcr_consensus = ''
  129. pcr_free_consensus = ''
  130. consensus_call = ''
  131. consensus_alt_seq = ''
  132. # pcr
  133. pcr = itemgetter(*[9,10,11,27,28,29,30,31,32,33,34,35])(strings)
  134. SEQ2000 = decide_by_rep(pcr[0:3])
  135. XTen_ARD = decide_by_rep(pcr[3:6])
  136. XTen_NVG = decide_by_rep(pcr[6:9])
  137. XTen_WUX = decide_by_rep(pcr[9:12])
  138. sequence_site = [SEQ2000,XTen_ARD,XTen_NVG,XTen_WUX]
  139. sequence_dict = Counter(sequence_site)
  140. highest_sequence = sequence_dict.most_common(1)
  141. candidate_sequence = highest_sequence[0][0]
  142. freq_sequence = highest_sequence[0][1]
  143. if freq_sequence > 2:
  144. pcr_consensus = candidate_sequence
  145. else:
  146. pcr_consensus = 'inconSequenceSite'
  147. # pcr-free
  148. pcr_free = itemgetter(*[12,13,14,15,16,17,18,19,20,21,22,23,24,25,26])(strings)
  149. #SEQ2000 = decide_by_rep(pcr_free[0])
  150. T7_WGE = decide_by_rep(pcr_free[0:3])
  151. Nova_ARD_1 = decide_by_rep(pcr_free[3:6])
  152. Nova_ARD_2 = decide_by_rep(pcr_free[6:9])
  153. Nova_BRG = decide_by_rep(pcr_free[9:12])
  154. Nova_WUX = decide_by_rep(pcr_free[12:15])
  155. sequence_site = [T7_WGE,Nova_ARD_1,Nova_ARD_2,Nova_BRG,Nova_WUX]
  156. highest_sequence = sequence_dict.most_common(1)
  157. candidate_sequence = highest_sequence[0][0]
  158. freq_sequence = highest_sequence[0][1]
  159. if freq_sequence > 3:
  160. pcr_free_consensus = candidate_sequence
  161. else:
  162. pcr_free_consensus = 'inconSequenceSite'
  163. # pcr and pcr-free
  164. tag = ['inconGT','noInfo','inconMen','inconSequenceSite']
  165. if (pcr_consensus == pcr_free_consensus) and (pcr_consensus not in tag) and (pcr_consensus != '0/0'):
  166. consensus_call = pcr_consensus
  167. VPCT = vote_percentage(strings[9:],consensus_call)
  168. strings[7] = 'VPCT=' + VPCT
  169. DPCT = detected_percentage(strings[9:])
  170. strings[7] = strings[7] + ';DPCT=' + DPCT
  171. # Delete multiple alternative genotype to necessary expression
  172. strings[6] = 'reproducible'
  173. alt = strings[4]
  174. alt_gt = alt.split(',')
  175. if len(alt_gt) > 1:
  176. allele1 = consensus_call.split('/')[0]
  177. allele2 = consensus_call.split('/')[1]
  178. if allele1 == '0':
  179. allele2_seq = alt_gt[int(allele2) - 1]
  180. consensus_alt_seq = allele2_seq
  181. consensus_call = '0/1'
  182. else:
  183. allele1_seq = alt_gt[int(allele1) - 1]
  184. allele2_seq = alt_gt[int(allele2) - 1]
  185. if int(allele1) > int(allele2):
  186. consensus_alt_seq = allele2_seq + ',' + allele1_seq
  187. consensus_call = '1/2'
  188. elif int(allele1) < int(allele2):
  189. consensus_alt_seq = allele1_seq + ',' + allele2_seq
  190. consensus_call = '1/2'
  191. else:
  192. consensus_alt_seq = allele1_seq
  193. consensus_call = '1/1'
  194. else:
  195. consensus_alt_seq = alt
  196. elif (pcr_consensus in tag) and (pcr_free_consensus in tag):
  197. consensus_call = 'filtered'
  198. strings[6] = 'filtered'
  199. DPCT = detected_percentage(strings[9:])
  200. strings[7] = 'DPCT=' + DPCT
  201. elif ((pcr_consensus == '0/0') or (pcr_consensus in tag)) and ((pcr_free_consensus not in tag) and (pcr_free_consensus != '0/0')):
  202. consensus_call = 'pcr-free-speicifc'
  203. strings[6] = 'pcr-free-speicifc'
  204. DPCT = detected_percentage(strings[9:])
  205. strings[7] = 'DPCT=' + DPCT
  206. elif ((pcr_consensus != '0/0') or (pcr_consensus not in tag)) and ((pcr_free_consensus in tag) and (pcr_free_consensus == '0/0')):
  207. consensus_call = 'pcr-speicifc'
  208. strings[6] = 'pcr-speicifc'
  209. DPCT = detected_percentage(strings[9:])
  210. strings[7] = 'DPCT=' + DPCT
  211. elif (pcr_consensus == '0/0') and (pcr_free_consensus == '0/0'):
  212. consensus_call = 'confirm for parents'
  213. strings[6] = 'confirm for parents'
  214. DPCT = detected_percentage(strings[9:])
  215. strings[7] = 'DPCT=' + DPCT
  216. else:
  217. consensus_call = 'filtered'
  218. strings[6] = 'filtered'
  219. DPCT = detected_percentage(strings[9:])
  220. strings[7] = 'DPCT=' + DPCT
  221. # output
  222. outLine = '\t'.join(strings) + '\t' + pcr_consensus +'\t' + pcr_free_consensus + '\t' + consensus_call + '\t' + consensus_alt_seq + '\n'
  223. outfile.write(outLine)
  224. if __name__ == '__main__':
  225. main()