You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

198 line
8.2KB

  1. # import modules
  2. import sys, argparse, os
  3. import fileinput
  4. import re
  5. import pandas as pd
  6. from operator import itemgetter
  7. from collections import Counter
  8. from itertools import islice
  9. # input arguments
  10. parser = argparse.ArgumentParser(description="this script is to count voting number")
  11. parser.add_argument('-vcf', '--multi_sample_vcf', type=str, help='The VCF file you want to count the voting number', required=True)
  12. parser.add_argument('-dup', '--dup_list', type=str, help='Duplication list', required=True)
  13. parser.add_argument('-sample', '--sample_name', type=str, help='which sample of quartet', required=True)
  14. parser.add_argument('-prefix', '--prefix', type=str, help='Prefix of output file name', required=True)
  15. args = parser.parse_args()
  16. multi_sample_vcf = args.multi_sample_vcf
  17. dup_list = args.dup_list
  18. prefix = args.prefix
  19. sample_name = args.sample_name
  20. vcf_header = '''##fileformat=VCFv4.2
  21. ##fileDate=20191224
  22. ##source=high_confidence_calls_intergration(choppy app)
  23. ##reference=GRCh38.d1.vd1
  24. ##INFO=<ID=PCT,Number=1,Type=Float,Description="Percentage of votes">
  25. ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
  26. ##FORMAT=<ID=TWINS,Number=0,Type=Flag,Description="0 for sister consistent, 1 for sister inconsistent">
  27. ##FORMAT=<ID=TRIO5,Number=0,Type=Flag,Description="0 for trio consistent, 1 for trio inconsistent">
  28. ##FORMAT=<ID=TRIO6,Number=0,Type=Flag,Description="0 for trio consistent, 1 for trio inconsistent">
  29. ##contig=<ID=chr1,length=248956422>
  30. ##contig=<ID=chr2,length=242193529>
  31. ##contig=<ID=chr3,length=198295559>
  32. ##contig=<ID=chr4,length=190214555>
  33. ##contig=<ID=chr5,length=181538259>
  34. ##contig=<ID=chr6,length=170805979>
  35. ##contig=<ID=chr7,length=159345973>
  36. ##contig=<ID=chr8,length=145138636>
  37. ##contig=<ID=chr9,length=138394717>
  38. ##contig=<ID=chr10,length=133797422>
  39. ##contig=<ID=chr11,length=135086622>
  40. ##contig=<ID=chr12,length=133275309>
  41. ##contig=<ID=chr13,length=114364328>
  42. ##contig=<ID=chr14,length=107043718>
  43. ##contig=<ID=chr15,length=101991189>
  44. ##contig=<ID=chr16,length=90338345>
  45. ##contig=<ID=chr17,length=83257441>
  46. ##contig=<ID=chr18,length=80373285>
  47. ##contig=<ID=chr19,length=58617616>
  48. ##contig=<ID=chr20,length=64444167>
  49. ##contig=<ID=chr21,length=46709983>
  50. ##contig=<ID=chr22,length=50818468>
  51. ##contig=<ID=chrX,length=156040895>
  52. '''
  53. # read in duplication list
  54. dup = pd.read_table(dup_list,header=None)
  55. var_dup = dup[0].tolist()
  56. # output file
  57. file_name = prefix + '_annotated.vcf'
  58. outfile = open(file_name,'w')
  59. # write VCF
  60. outputcolumn = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tQuartet_DNA_BGI_SEQ2000_BGI_1_20180518_LCL5\tQuartet_DNA_BGI_SEQ2000_BGI_2_20180530_LCL5\tQuartet_DNA_BGI_SEQ2000_BGI_3_20180530_LCL5\tQuartet_DNA_BGI_SEQ2000_WGE_1_20190402_LCL5\tQuartet_DNA_BGI_SEQ2000_WGE_2_20190402_LCL5\tQuartet_DNA_BGI_SEQ500_BGI_1_20180328_LCL5 \tQuartet_DNA_BGI_SEQ500_BGI_2_20180328_LCL5\tQuartet_DNA_BGI_SEQ500_BGI_3_20180328_LCL5\tQuartet_DNA_ILM_Nova_ARD_1_20181108_LCL5\tQuartet_DNA_ILM_Nova_ARD_2_20181108_LCL5\tQuartet_DNA_ILM_Nova_ARD_3_20181108_LCL5\tQuartet_DNA_ILM_Nova_ARD_4_20190111_LCL5\tQuartet_DNA_ILM_Nova_ARD_5_20190111_LCL5\tQuartet_DNA_ILM_Nova_ARD_6_20190111_LCL5\tQuartet_DNA_ILM_Nova_BRG_1_20171024_LCL5\tQuartet_DNA_ILM_Nova_BRG_1_20180930_LCL5\tQuartet_DNA_ILM_Nova_BRG_2_20180930_LCL5\tQuartet_DNA_ILM_Nova_BRG_3_20180930_LCL5\tQuartet_DNA_ILM_Nova_GAC_1_20171025_LCL5\tQuartet_DNA_ILM_Nova_NVG_1_20171024_LCL5\tQuartet_DNA_ILM_Nova_WUX_1_20171024_LCL5\tQuartet_DNA_ILM_XTen_ARD_1_20170403_LCL5\tQuartet_DNA_ILM_XTen_ARD_2_20170403_LCL5\tQuartet_DNA_ILM_XTen_ARD_3_20170403_LCL5\tQuartet_DNA_ILM_XTen_NVG_1_20170329_LCL5\tQuartet_DNA_ILM_XTen_NVG_2_20170329_LCL5\tQuartet_DNA_ILM_XTen_NVG_3_20170329_LCL5\tQuartet_DNA_ILM_XTen_WUX_1_20170216_LCL5\tQuartet_DNA_ILM_XTen_WUX_2_20170216_LCL5\tQuartet_DNA_ILM_XTen_WUX_3_20170216_LCL5\tQuartet_DNA_ILM_XTen_WUX_4_20180703_LCL5\tQuartet_DNA_ILM_XTen_WUX_5_20180703_LCL5\tQuartet_DNA_ILM_XTen_WUX_6_20180703_LCL5' +'\t'+ sample_name+'_pcr'+'\t' + sample_name+'_pcr-free'+ '\t'+ sample_name +'consensus' + '\n'
  61. outfile.write(vcf_header)
  62. outfile.write(outputcolumn)
  63. #function
  64. def vote_percentage(strings):
  65. strings = [x.replace('0/0','.') for x in strings]
  66. gt = [x.split(':')[0] for x in strings]
  67. percentage = round((33 - gt.count('.'))/33,2)
  68. return(str(percentage))
  69. def decide_by_rep(strings):
  70. consensus_rep = ''
  71. mendelian = [x[-5:] for x in strings]
  72. strings = [x.replace('.','0/0') for x in strings]
  73. gt = [x.split(':')[0] for x in strings]
  74. # mendelian consistent?
  75. mendelian_dict = Counter(mendelian)
  76. highest_mendelian = mendelian_dict.most_common(1)
  77. candidate_mendelian = highest_mendelian[0][0]
  78. freq_mendelian = highest_mendelian[0][1]
  79. if (candidate_mendelian == '1:1:1') and (freq_mendelian >= 2):
  80. gt_num_dict = Counter(gt)
  81. highest_gt = gt_num_dict.most_common(1)
  82. candidate_gt = highest_gt[0][0]
  83. freq_gt = highest_gt[0][1]
  84. if (candidate_gt != '0/0') and (freq_gt >= 2):
  85. consensus_rep = candidate_gt
  86. elif (candidate_gt == '0/0') and (freq_gt >= 2):
  87. consensus_rep = '0/0'
  88. else:
  89. consensus_rep = 'inconGT'
  90. elif (candidate_mendelian == '.') and (freq_mendelian >= 2):
  91. consensus_rep = 'noInfo'
  92. else:
  93. consensus_rep = 'inconMen'
  94. return consensus_rep
  95. def main():
  96. for line in fileinput.input(multi_sample_vcf):
  97. headline = re.match('^\#',line)
  98. if headline is not None:
  99. pass
  100. else:
  101. line = line.strip()
  102. strings = line.split('\t')
  103. variant_id = '_'.join([strings[0],strings[1]])
  104. # check if the variants location is duplicated
  105. if variant_id in var_dup:
  106. outLine = '\t'.join(strings) + '\t' + '.' +'\t' + '.' + '\t' + 'dupVar' + '\n'
  107. outfile.write(outLine)
  108. else:
  109. # pre-define
  110. pcr_consensus = ''
  111. pcr_free_consensus = ''
  112. consensus_call = ''
  113. # pcr
  114. pcr = itemgetter(*[9,10,11,12,14,15,16,23,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41])(strings)
  115. SEQ2000 = decide_by_rep(pcr[0:4])
  116. SEQ500 = decide_by_rep(pcr[4:7])
  117. Nova = decide_by_rep(pcr[7:11])
  118. XTen_ARD = decide_by_rep(pcr[11:14])
  119. XTen_NVG = decide_by_rep(pcr[14:17])
  120. XTen_WUX_1 = decide_by_rep(pcr[17:20])
  121. XTen_WUX_2 = decide_by_rep(pcr[20:23])
  122. sequence_site = [SEQ2000,SEQ500,Nova,XTen_ARD,XTen_NVG,XTen_WUX_1,XTen_WUX_2]
  123. sequence_dict = Counter(sequence_site)
  124. highest_sequence = sequence_dict.most_common(1)
  125. candidate_sequence = highest_sequence[0][0]
  126. freq_sequence = highest_sequence[0][1]
  127. if freq_sequence > 4:
  128. pcr_consensus = candidate_sequence
  129. else:
  130. pcr_consensus = 'inconSequenceSite'
  131. # pcr-free
  132. pcr_free = itemgetter(*[13,17,18,19,20,21,22,24,25,26])(strings)
  133. SEQ2000 = decide_by_rep(pcr_free[0])
  134. Nova_ARD_1 = decide_by_rep(pcr_free[1:4])
  135. Nova_ARD_2 = decide_by_rep(pcr_free[4:7])
  136. Nova_BRG = decide_by_rep(pcr_free[7:10])
  137. sequence_site = [SEQ2000,Nova_ARD_1,Nova_ARD_2,Nova_BRG]
  138. highest_sequence = sequence_dict.most_common(1)
  139. candidate_sequence = highest_sequence[0][0]
  140. freq_sequence = highest_sequence[0][1]
  141. if freq_sequence > 2:
  142. pcr_free_consensus = candidate_sequence
  143. else:
  144. pcr_free_consensus = 'inconSequenceSite'
  145. # pcr and pcr-free
  146. tag = ['inconGT','noInfo','inconMen','inconSequenceSite']
  147. if (pcr_consensus == pcr_free_consensus) and (pcr_consensus not in tag) and (pcr_consensus != '0/0'):
  148. consensus_call = pcr_consensus
  149. strings[6] = 'reproducible'
  150. elif (pcr_consensus in tag) or (pcr_free_consensus in tag):
  151. consensus_call = 'filtered'
  152. strings[6] = '.'
  153. elif (pcr_consensus == '0/0') and (pcr_free_consensus not in tag) and (pcr_free_consensus != '0/0'):
  154. consensus_call = 'pcr-free-speicifc'
  155. strings[6] = '.'
  156. elif (pcr_consensus != '0/0') and (pcr_consensus not in tag) and (pcr_free_consensus == '0/0'):
  157. consensus_call = 'pcr-speicifc'
  158. strings[6] = '.'
  159. elif (pcr_consensus == '0/0') and (pcr_free_consensus == '0/0'):
  160. consensus_call = 'confirm for parents'
  161. strings[6] = '.'
  162. else:
  163. consensus_call = 'filtered'
  164. strings[6] = '.'
  165. # percentage
  166. percentage = vote_percentage(strings[9:])
  167. strings[7] = 'PCT=' + percentage
  168. # output
  169. outLine = '\t'.join(strings) + '\t' + pcr_consensus +'\t' + pcr_free_consensus + '\t' + consensus_call + '\n'
  170. outfile.write(outLine)
  171. if __name__ == '__main__':
  172. main()