Nelze vybrat více než 25 témat Téma musí začínat písmenem nebo číslem, může obsahovat pomlčky („-“) a může být dlouhé až 35 znaků.

256 lines
10KB

  1. # import modules
  2. import sys, argparse, os
  3. import fileinput
  4. import re
  5. import pandas as pd
  6. from operator import itemgetter
  7. from collections import Counter
  8. from itertools import islice
  9. # input arguments
  10. parser = argparse.ArgumentParser(description="this script is to count voting number")
  11. parser.add_argument('-vcf', '--multi_sample_vcf', type=str, help='The VCF file you want to count the voting number', required=True)
  12. parser.add_argument('-dup', '--dup_list', type=str, help='Duplication list', required=True)
  13. parser.add_argument('-sample', '--sample_name', type=str, help='which sample of quartet', required=True)
  14. parser.add_argument('-prefix', '--prefix', type=str, help='Prefix of output file name', required=True)
  15. args = parser.parse_args()
  16. multi_sample_vcf = args.multi_sample_vcf
  17. dup_list = args.dup_list
  18. prefix = args.prefix
  19. sample_name = args.sample_name
  20. vcf_header = '''##fileformat=VCFv4.2
  21. ##fileDate=20191224
  22. ##source=high_confidence_calls_intergration(choppy app)
  23. ##reference=GRCh38.d1.vd1
  24. ##INFO=<ID=DPCT,Number=1,Type=Float,Description="Percentage of detected votes">
  25. ##INFO=<ID=VPCT,Number=1,Type=Float,Description="Percentage of consnesus votes">
  26. ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
  27. ##contig=<ID=chr1,length=248956422>
  28. ##contig=<ID=chr2,length=242193529>
  29. ##contig=<ID=chr3,length=198295559>
  30. ##contig=<ID=chr4,length=190214555>
  31. ##contig=<ID=chr5,length=181538259>
  32. ##contig=<ID=chr6,length=170805979>
  33. ##contig=<ID=chr7,length=159345973>
  34. ##contig=<ID=chr8,length=145138636>
  35. ##contig=<ID=chr9,length=138394717>
  36. ##contig=<ID=chr10,length=133797422>
  37. ##contig=<ID=chr11,length=135086622>
  38. ##contig=<ID=chr12,length=133275309>
  39. ##contig=<ID=chr13,length=114364328>
  40. ##contig=<ID=chr14,length=107043718>
  41. ##contig=<ID=chr15,length=101991189>
  42. ##contig=<ID=chr16,length=90338345>
  43. ##contig=<ID=chr17,length=83257441>
  44. ##contig=<ID=chr18,length=80373285>
  45. ##contig=<ID=chr19,length=58617616>
  46. ##contig=<ID=chr20,length=64444167>
  47. ##contig=<ID=chr21,length=46709983>
  48. ##contig=<ID=chr22,length=50818468>
  49. ##contig=<ID=chrX,length=156040895>
  50. '''
  51. # read in duplication list
  52. dup = pd.read_table(dup_list,header=None)
  53. var_dup = dup[0].tolist()
  54. # output file
  55. file_name = prefix + '_annotated.vcf'
  56. outfile = open(file_name,'w')
  57. # write VCF
  58. outputcolumn = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tQuartet_DNA_BGI_SEQ2000_BGI_1_20180518_LCL5\tQuartet_DNA_BGI_SEQ2000_BGI_2_20180530_LCL5\tQuartet_DNA_BGI_SEQ2000_BGI_3_20180530_LCL5\tQuartet_DNA_BGI_SEQ2000_WGE_1_20190402_LCL5\tQuartet_DNA_BGI_SEQ2000_WGE_2_20190402_LCL5\tQuartet_DNA_BGI_SEQ500_BGI_1_20180328_LCL5 \tQuartet_DNA_BGI_SEQ500_BGI_2_20180328_LCL5\tQuartet_DNA_BGI_SEQ500_BGI_3_20180328_LCL5\tQuartet_DNA_ILM_Nova_ARD_1_20181108_LCL5\tQuartet_DNA_ILM_Nova_ARD_2_20181108_LCL5\tQuartet_DNA_ILM_Nova_ARD_3_20181108_LCL5\tQuartet_DNA_ILM_Nova_ARD_4_20190111_LCL5\tQuartet_DNA_ILM_Nova_ARD_5_20190111_LCL5\tQuartet_DNA_ILM_Nova_ARD_6_20190111_LCL5\tQuartet_DNA_ILM_Nova_BRG_1_20171024_LCL5\tQuartet_DNA_ILM_Nova_BRG_1_20180930_LCL5\tQuartet_DNA_ILM_Nova_BRG_2_20180930_LCL5\tQuartet_DNA_ILM_Nova_BRG_3_20180930_LCL5\tQuartet_DNA_ILM_Nova_GAC_1_20171025_LCL5\tQuartet_DNA_ILM_Nova_NVG_1_20171024_LCL5\tQuartet_DNA_ILM_Nova_WUX_1_20171024_LCL5\tQuartet_DNA_ILM_XTen_ARD_1_20170403_LCL5\tQuartet_DNA_ILM_XTen_ARD_2_20170403_LCL5\tQuartet_DNA_ILM_XTen_ARD_3_20170403_LCL5\tQuartet_DNA_ILM_XTen_NVG_1_20170329_LCL5\tQuartet_DNA_ILM_XTen_NVG_2_20170329_LCL5\tQuartet_DNA_ILM_XTen_NVG_3_20170329_LCL5\tQuartet_DNA_ILM_XTen_WUX_1_20170216_LCL5\tQuartet_DNA_ILM_XTen_WUX_2_20170216_LCL5\tQuartet_DNA_ILM_XTen_WUX_3_20170216_LCL5\tQuartet_DNA_ILM_XTen_WUX_4_20180703_LCL5\tQuartet_DNA_ILM_XTen_WUX_5_20180703_LCL5\tQuartet_DNA_ILM_XTen_WUX_6_20180703_LCL5' +'\t'+ sample_name+'_pcr'+'\t' + sample_name+'_pcr-free'+ '\t'+ sample_name +'_consensus' + '\t' + sample_name + '_consensus_alt_seq' +'\n'
  59. outfile.write(vcf_header)
  60. outfile.write(outputcolumn)
  61. #function
  62. def detected_percentage(strings):
  63. strings = [x.replace('0/0','.') for x in strings]
  64. gt = [x.split(':')[0] for x in strings]
  65. percentage = round((33 - gt.count('.'))/33,4)
  66. return(str(percentage))
  67. def vote_percentage(strings,consensus_call):
  68. strings = [x.replace('.','0/0') for x in strings]
  69. gt = [x.split(':')[0] for x in strings]
  70. gt = list(map(gt_uniform,[i for i in gt]))
  71. percentage = round(gt.count(consensus_call)/33,4)
  72. return(str(percentage))
  73. def family_vote(strings,consensus_call):
  74. pass
  75. def gt_uniform(strings):
  76. uniformed_gt = ''
  77. allele1 = strings.split('/')[0]
  78. allele2 = strings.split('/')[1]
  79. if int(allele1) > int(allele2):
  80. uniformed_gt = allele2 + '/' + allele1
  81. else:
  82. uniformed_gt = allele1 + '/' + allele2
  83. return uniformed_gt
  84. def decide_by_rep(strings):
  85. consensus_rep = ''
  86. mendelian = [x[-5:] for x in strings]
  87. strings = [x.replace('.','0/0') for x in strings]
  88. gt = [x.split(':')[0] for x in strings]
  89. # modified gt turn 2/1 to 1/2
  90. gt = list(map(gt_uniform,[i for i in gt]))
  91. # mendelian consistent?
  92. mendelian_dict = Counter(mendelian)
  93. highest_mendelian = mendelian_dict.most_common(1)
  94. candidate_mendelian = highest_mendelian[0][0]
  95. freq_mendelian = highest_mendelian[0][1]
  96. if (candidate_mendelian == '1:1:1') and (freq_mendelian >= 2):
  97. gt_num_dict = Counter(gt)
  98. highest_gt = gt_num_dict.most_common(1)
  99. candidate_gt = highest_gt[0][0]
  100. freq_gt = highest_gt[0][1]
  101. if (candidate_gt != '0/0') and (freq_gt >= 2):
  102. consensus_rep = candidate_gt
  103. elif (candidate_gt == '0/0') and (freq_gt >= 2):
  104. consensus_rep = '0/0'
  105. else:
  106. consensus_rep = 'inconGT'
  107. elif (candidate_mendelian == '.') and (freq_mendelian >= 2):
  108. consensus_rep = 'noInfo'
  109. else:
  110. consensus_rep = 'inconMen'
  111. return consensus_rep
  112. def main():
  113. for line in fileinput.input(multi_sample_vcf):
  114. headline = re.match('^\#',line)
  115. if headline is not None:
  116. pass
  117. else:
  118. line = line.strip()
  119. strings = line.split('\t')
  120. variant_id = '_'.join([strings[0],strings[1]])
  121. # check if the variants location is duplicated
  122. if variant_id in var_dup:
  123. strings[6] = 'dupVar'
  124. outLine = '\t'.join(strings) + '\t' + '.' +'\t' + '.' + '\t' + 'dupVar' + '\t' + '.' +'\n'
  125. outfile.write(outLine)
  126. else:
  127. # pre-define
  128. pcr_consensus = ''
  129. pcr_free_consensus = ''
  130. consensus_call = ''
  131. consensus_alt_seq = '.'
  132. # pcr
  133. pcr = itemgetter(*[9,10,11,12,14,15,16,23,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41])(strings)
  134. SEQ2000 = decide_by_rep(pcr[0:3])
  135. SEQ500 = decide_by_rep(pcr[4:7])
  136. Nova = decide_by_rep(pcr[7:11])
  137. XTen_ARD = decide_by_rep(pcr[11:14])
  138. XTen_NVG = decide_by_rep(pcr[14:17])
  139. XTen_WUX_1 = decide_by_rep(pcr[17:20])
  140. XTen_WUX_2 = decide_by_rep(pcr[20:23])
  141. sequence_site = [SEQ2000,SEQ500,Nova,XTen_ARD,XTen_NVG,XTen_WUX_1,XTen_WUX_2]
  142. sequence_dict = Counter(sequence_site)
  143. highest_sequence = sequence_dict.most_common(1)
  144. candidate_sequence = highest_sequence[0][0]
  145. freq_sequence = highest_sequence[0][1]
  146. if freq_sequence > 4:
  147. pcr_consensus = candidate_sequence
  148. else:
  149. pcr_consensus = 'inconSequenceSite'
  150. # pcr-free
  151. pcr_free = itemgetter(*[13,17,18,19,20,21,22,24,25,26])(strings)
  152. #SEQ2000 = decide_by_rep(pcr_free[0])
  153. Nova_ARD_1 = decide_by_rep(pcr_free[1:4])
  154. Nova_ARD_2 = decide_by_rep(pcr_free[4:7])
  155. Nova_BRG = decide_by_rep(pcr_free[7:10])
  156. sequence_site = [SEQ2000,Nova_ARD_1,Nova_ARD_2,Nova_BRG]
  157. highest_sequence = sequence_dict.most_common(1)
  158. candidate_sequence = highest_sequence[0][0]
  159. freq_sequence = highest_sequence[0][1]
  160. if freq_sequence > 2:
  161. pcr_free_consensus = candidate_sequence
  162. else:
  163. pcr_free_consensus = 'inconSequenceSite'
  164. # pcr and pcr-free
  165. tag = ['inconGT','noInfo','inconMen','inconSequenceSite']
  166. if (pcr_consensus == pcr_free_consensus) and (pcr_consensus not in tag) and (pcr_consensus != '0/0'):
  167. consensus_call = pcr_consensus
  168. VPCT = vote_percentage(strings[9:],consensus_call)
  169. strings[7] = 'VPCT=' + VPCT
  170. DPCT = detected_percentage(strings[9:])
  171. strings[7] = strings[7] + ';DPCT=' + DPCT
  172. # Delete multiple alternative genotype to necessary expression
  173. strings[6] = 'reproducible'
  174. alt = strings[4]
  175. alt_gt = alt.split(',')
  176. if len(alt_gt) > 1:
  177. allele1 = consensus_call.split('/')[0]
  178. allele2 = consensus_call.split('/')[1]
  179. if allele1 == '0':
  180. allele2_seq = alt_gt[int(allele2) - 1]
  181. consensus_alt_seq = allele2_seq
  182. consensus_call = '0/1'
  183. else:
  184. allele1_seq = alt_gt[int(allele1) - 1]
  185. allele2_seq = alt_gt[int(allele2) - 1]
  186. if int(allele1) > int(allele2):
  187. consensus_alt_seq = allele2_seq + ',' + allele1_seq
  188. consensus_call = '1/2'
  189. elif int(allele1) < int(allele2):
  190. consensus_alt_seq = allele1_seq + ',' + allele2_seq
  191. consensus_call = '1/2'
  192. else:
  193. consensus_alt_seq = allele1_seq
  194. consensus_call = '1/1'
  195. else:
  196. consensus_alt_seq = alt
  197. elif (pcr_consensus in tag) and (pcr_free_consensus in tag):
  198. consensus_call = 'filtered'
  199. strings[6] = 'filtered'
  200. DPCT = detected_percentage(strings[9:])
  201. strings[7] = 'DPCT=' + DPCT
  202. elif ((pcr_consensus == '0/0') or (pcr_consensus in tag)) and ((pcr_free_consensus not in tag) and (pcr_free_consensus != '0/0')):
  203. consensus_call = 'pcr-free-speicifc'
  204. strings[6] = 'pcr-free-speicifc'
  205. DPCT = detected_percentage(strings[9:])
  206. strings[7] = 'DPCT=' + DPCT
  207. elif ((pcr_consensus != '0/0') or (pcr_consensus not in tag)) and ((pcr_free_consensus in tag) and (pcr_free_consensus == '0/0')):
  208. consensus_call = 'pcr-speicifc'
  209. strings[6] = 'pcr-speicifc'
  210. DPCT = detected_percentage(strings[9:])
  211. strings[7] = 'DPCT=' + DPCT
  212. elif (pcr_consensus == '0/0') and (pcr_free_consensus == '0/0'):
  213. consensus_call = 'confirm for parents'
  214. strings[6] = 'confirm for parents'
  215. DPCT = detected_percentage(strings[9:])
  216. strings[7] = 'DPCT=' + DPCT
  217. else:
  218. consensus_call = 'filtered'
  219. strings[6] = 'filtered'
  220. DPCT = detected_percentage(strings[9:])
  221. strings[7] = 'DPCT=' + DPCT
  222. # output
  223. outLine = '\t'.join(strings) + '\t' + pcr_consensus +'\t' + pcr_free_consensus + '\t' + consensus_call + '\t' + consensus_alt_seq + '\n'
  224. outfile.write(outLine)
  225. if __name__ == '__main__':
  226. main()