You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

228 lines
12KB

  1. from __future__ import division
  2. from glob import glob
  3. import sys, argparse, os
  4. import fileinput
  5. import re
  6. import pandas as pd
  7. from operator import itemgetter
  8. from collections import Counter
  9. from itertools import islice
  10. from numpy import *
  11. import statistics
  12. # input arguments
  13. parser = argparse.ArgumentParser(description="this script is to merge mendelian and vcfinfo, and extract high_confidence_calls")
  14. parser.add_argument('-prefix', '--prefix', type=str, help='prefix of output file', required=True)
  15. parser.add_argument('-vcf', '--vcf', type=str, help='merged multiple sample vcf', required=True)
  16. args = parser.parse_args()
  17. prefix = args.prefix
  18. vcf = args.vcf
  19. # input files
  20. vcf_dat = pd.read_table(vcf)
  21. # all info
  22. all_file_name = prefix + "_all_summary.txt"
  23. all_sample_outfile = open(all_file_name,'w')
  24. all_info_col = 'CHROM\tPOS\tREF\tALT\tLCL5_consensus_calls\tLCL5_detect_number\tLCL5_same_diff\tLCL6_consensus_calls\tLCL6_detect_number\tLCl6_same_diff\tLCL7_consensus_calls\tLCL7_detect_number\tLCL7_same_diff\tLCL8_consensus_calls\tLCL8_detect_number\tLCL8_same_diff\n'
  25. all_sample_outfile.write(all_info_col)
  26. # filtered info
  27. vcf_header = '''##fileformat=VCFv4.2
  28. ##fileDate=20200501
  29. ##source=high_confidence_calls_intergration(choppy app)
  30. ##reference=GRCh38.d1.vd1
  31. ##INFO=<ID=VOTED,Number=1,Type=Integer,Description="Number mendelian consisitent votes">
  32. ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
  33. ##contig=<ID=chr1,length=248956422>
  34. ##contig=<ID=chr2,length=242193529>
  35. ##contig=<ID=chr3,length=198295559>
  36. ##contig=<ID=chr4,length=190214555>
  37. ##contig=<ID=chr5,length=181538259>
  38. ##contig=<ID=chr6,length=170805979>
  39. ##contig=<ID=chr7,length=159345973>
  40. ##contig=<ID=chr8,length=145138636>
  41. ##contig=<ID=chr9,length=138394717>
  42. ##contig=<ID=chr10,length=133797422>
  43. ##contig=<ID=chr11,length=135086622>
  44. ##contig=<ID=chr12,length=133275309>
  45. ##contig=<ID=chr13,length=114364328>
  46. ##contig=<ID=chr14,length=107043718>
  47. ##contig=<ID=chr15,length=101991189>
  48. ##contig=<ID=chr16,length=90338345>
  49. ##contig=<ID=chr17,length=83257441>
  50. ##contig=<ID=chr18,length=80373285>
  51. ##contig=<ID=chr19,length=58617616>
  52. ##contig=<ID=chr20,length=64444167>
  53. ##contig=<ID=chr21,length=46709983>
  54. ##contig=<ID=chr22,length=50818468>
  55. ##contig=<ID=chrX,length=156040895>
  56. '''
  57. consensus_file_name = prefix + "_consensus.vcf"
  58. consensus_outfile = open(consensus_file_name,'w')
  59. consensus_col = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tLCL5_consensus_call\tLCL6_consensus_call\tLCL7_consensus_call\tLCL8_consensus_call\n'
  60. consensus_outfile.write(vcf_header)
  61. consensus_outfile.write(consensus_col)
  62. # function
  63. def decide_by_rep(vcf_list):
  64. consensus_rep = ''
  65. gt = [x.split(':')[0] for x in vcf_list]
  66. gt_num_dict = Counter(gt)
  67. highest_gt = gt_num_dict.most_common(1)
  68. candidate_gt = highest_gt[0][0]
  69. freq_gt = highest_gt[0][1]
  70. if freq_gt >= 2:
  71. consensus_rep = candidate_gt
  72. else:
  73. consensus_rep = 'inconGT'
  74. return consensus_rep
  75. def consensus_call(vcf_info_list):
  76. consensus_call = '.'
  77. detect_number = '.'
  78. same_diff = '.'
  79. # pcr
  80. SEQ2000 = decide_by_rep(vcf_info_list[0:3])
  81. XTen_ARD = decide_by_rep(vcf_info_list[18:21])
  82. XTen_NVG = decide_by_rep(vcf_info_list[21:24])
  83. XTen_WUX = decide_by_rep(vcf_info_list[24:27])
  84. Nova_WUX = decide_by_rep(vcf_info_list[15:18])
  85. pcr_sequence_site = [SEQ2000,XTen_ARD,XTen_NVG,XTen_WUX,Nova_WUX]
  86. pcr_sequence_dict = Counter(pcr_sequence_site)
  87. pcr_highest_sequence = pcr_sequence_dict.most_common(1)
  88. pcr_candidate_sequence = pcr_highest_sequence[0][0]
  89. pcr_freq_sequence = pcr_highest_sequence[0][1]
  90. if pcr_freq_sequence > 3:
  91. pcr_consensus = pcr_candidate_sequence
  92. else:
  93. pcr_consensus = 'inconSequenceSite'
  94. # pcr-free
  95. T7_WGE = decide_by_rep(vcf_info_list[3:6])
  96. Nova_ARD_1 = decide_by_rep(vcf_info_list[6:9])
  97. Nova_ARD_2 = decide_by_rep(vcf_info_list[9:12])
  98. Nova_BRG = decide_by_rep(vcf_info_list[12:15])
  99. sequence_site = [T7_WGE,Nova_ARD_1,Nova_ARD_2,Nova_BRG]
  100. sequence_dict = Counter(sequence_site)
  101. highest_sequence = sequence_dict.most_common(1)
  102. candidate_sequence = highest_sequence[0][0]
  103. freq_sequence = highest_sequence[0][1]
  104. if freq_sequence > 2:
  105. pcr_free_consensus = candidate_sequence
  106. else:
  107. pcr_free_consensus = 'inconSequenceSite'
  108. gt = [x.split(':')[0] for x in vcf_info_list]
  109. gt = [x.replace('./.','.') for x in gt]
  110. detected_num = 27 - gt.count('.')
  111. gt_remain = [e for e in gt if e not in {'.'}]
  112. gt_set = set(gt_remain)
  113. if len(gt_set) == 1:
  114. same_diff = 'same'
  115. else:
  116. same_diff = 'diff'
  117. tag = ['inconGT','inconSequenceSite']
  118. if (pcr_consensus == pcr_free_consensus) and (pcr_consensus not in tag):
  119. consensus_call = pcr_consensus
  120. elif (pcr_consensus in tag) and (pcr_free_consensus in tag):
  121. consensus_call = 'notAgree'
  122. else:
  123. consensus_call = 'notConsensus'
  124. return consensus_call, detected_num, same_diff
  125. elif (pcr_consensus in tag) and (pcr_free_consensus in tag):
  126. consensus_call = 'filtered'
  127. elif ((pcr_consensus == './.') or (pcr_consensus in tag)) and ((pcr_free_consensus not in tag) and (pcr_free_consensus != './.')):
  128. consensus_call = 'pcr-free-speicifc'
  129. elif ((pcr_consensus != './.') or (pcr_consensus not in tag)) and ((pcr_free_consensus in tag) and (pcr_free_consensus == './.')):
  130. consensus_call = 'pcr-speicifc'
  131. elif (pcr_consensus == '0/0') and (pcr_free_consensus == '0/0'):
  132. consensus_call = '0/0'
  133. else:
  134. consensus_call = 'filtered'
  135. for row in vcf_dat.itertuples():
  136. # length
  137. #alt
  138. if ',' in row.ALT:
  139. alt = row.ALT.split(',')
  140. alt_len = [len(i) for i in alt]
  141. alt_max = max(alt_len)
  142. else:
  143. alt_max = len(row.ALT)
  144. #ref
  145. ref_len = len(row.REF)
  146. if (alt_max > 50) or (ref_len > 50):
  147. pass
  148. else:
  149. # consensus
  150. lcl5_list = [row.Quartet_DNA_BGI_SEQ2000_BGI_LCL5_1_20180518,row.Quartet_DNA_BGI_SEQ2000_BGI_LCL5_2_20180530,row.Quartet_DNA_BGI_SEQ2000_BGI_LCL5_3_20180530, \
  151. row.Quartet_DNA_BGI_T7_WGE_LCL5_1_20191105,row.Quartet_DNA_BGI_T7_WGE_LCL5_2_20191105,row.Quartet_DNA_BGI_T7_WGE_LCL5_3_20191105, \
  152. row.Quartet_DNA_ILM_Nova_ARD_LCL5_1_20181108,row.Quartet_DNA_ILM_Nova_ARD_LCL5_2_20181108,row.Quartet_DNA_ILM_Nova_ARD_LCL5_3_20181108, \
  153. row.Quartet_DNA_ILM_Nova_ARD_LCL5_4_20190111,row.Quartet_DNA_ILM_Nova_ARD_LCL5_5_20190111,row.Quartet_DNA_ILM_Nova_ARD_LCL5_6_20190111, \
  154. row.Quartet_DNA_ILM_Nova_BRG_LCL5_1_20180930,row.Quartet_DNA_ILM_Nova_BRG_LCL5_2_20180930,row.Quartet_DNA_ILM_Nova_BRG_LCL5_3_20180930, \
  155. row.Quartet_DNA_ILM_Nova_WUX_LCL5_1_20190917,row.Quartet_DNA_ILM_Nova_WUX_LCL5_2_20190917,row.Quartet_DNA_ILM_Nova_WUX_LCL5_3_20190917, \
  156. row.Quartet_DNA_ILM_XTen_ARD_LCL5_1_20170403,row.Quartet_DNA_ILM_XTen_ARD_LCL5_2_20170403,row.Quartet_DNA_ILM_XTen_ARD_LCL5_3_20170403, \
  157. row.Quartet_DNA_ILM_XTen_NVG_LCL5_1_20170329,row.Quartet_DNA_ILM_XTen_NVG_LCL5_2_20170329,row.Quartet_DNA_ILM_XTen_NVG_LCL5_3_20170329, \
  158. row.Quartet_DNA_ILM_XTen_WUX_LCL5_1_20170216,row.Quartet_DNA_ILM_XTen_WUX_LCL5_2_20170216,row.Quartet_DNA_ILM_XTen_WUX_LCL5_3_20170216]
  159. lcl6_list = [row.Quartet_DNA_BGI_SEQ2000_BGI_LCL6_1_20180518,row.Quartet_DNA_BGI_SEQ2000_BGI_LCL6_2_20180530,row.Quartet_DNA_BGI_SEQ2000_BGI_LCL6_3_20180530, \
  160. row.Quartet_DNA_BGI_T7_WGE_LCL6_1_20191105,row.Quartet_DNA_BGI_T7_WGE_LCL6_2_20191105,row.Quartet_DNA_BGI_T7_WGE_LCL6_3_20191105, \
  161. row.Quartet_DNA_ILM_Nova_ARD_LCL6_1_20181108,row.Quartet_DNA_ILM_Nova_ARD_LCL6_2_20181108,row.Quartet_DNA_ILM_Nova_ARD_LCL6_3_20181108, \
  162. row.Quartet_DNA_ILM_Nova_ARD_LCL6_4_20190111,row.Quartet_DNA_ILM_Nova_ARD_LCL6_5_20190111,row.Quartet_DNA_ILM_Nova_ARD_LCL6_6_20190111, \
  163. row.Quartet_DNA_ILM_Nova_BRG_LCL6_1_20180930,row.Quartet_DNA_ILM_Nova_BRG_LCL6_2_20180930,row.Quartet_DNA_ILM_Nova_BRG_LCL6_3_20180930, \
  164. row.Quartet_DNA_ILM_Nova_WUX_LCL6_1_20190917,row.Quartet_DNA_ILM_Nova_WUX_LCL6_2_20190917,row.Quartet_DNA_ILM_Nova_WUX_LCL6_3_20190917, \
  165. row.Quartet_DNA_ILM_XTen_ARD_LCL6_1_20170403,row.Quartet_DNA_ILM_XTen_ARD_LCL6_2_20170403,row.Quartet_DNA_ILM_XTen_ARD_LCL6_3_20170403, \
  166. row.Quartet_DNA_ILM_XTen_NVG_LCL6_1_20170329,row.Quartet_DNA_ILM_XTen_NVG_LCL6_2_20170329,row.Quartet_DNA_ILM_XTen_NVG_LCL6_3_20170329, \
  167. row.Quartet_DNA_ILM_XTen_WUX_LCL6_1_20170216,row.Quartet_DNA_ILM_XTen_WUX_LCL6_2_20170216,row.Quartet_DNA_ILM_XTen_WUX_LCL6_3_20170216]
  168. lcl7_list = [row.Quartet_DNA_BGI_SEQ2000_BGI_LCL7_1_20180518,row.Quartet_DNA_BGI_SEQ2000_BGI_LCL7_2_20180530,row.Quartet_DNA_BGI_SEQ2000_BGI_LCL7_3_20180530, \
  169. row.Quartet_DNA_BGI_T7_WGE_LCL7_1_20191105,row.Quartet_DNA_BGI_T7_WGE_LCL7_2_20191105,row.Quartet_DNA_BGI_T7_WGE_LCL7_3_20191105, \
  170. row.Quartet_DNA_ILM_Nova_ARD_LCL7_1_20181108,row.Quartet_DNA_ILM_Nova_ARD_LCL7_2_20181108,row.Quartet_DNA_ILM_Nova_ARD_LCL7_3_20181108, \
  171. row.Quartet_DNA_ILM_Nova_ARD_LCL7_4_20190111,row.Quartet_DNA_ILM_Nova_ARD_LCL7_5_20190111,row.Quartet_DNA_ILM_Nova_ARD_LCL7_6_20190111, \
  172. row.Quartet_DNA_ILM_Nova_BRG_LCL7_1_20180930,row.Quartet_DNA_ILM_Nova_BRG_LCL7_2_20180930,row.Quartet_DNA_ILM_Nova_BRG_LCL7_3_20180930, \
  173. row.Quartet_DNA_ILM_Nova_WUX_LCL7_1_20190917,row.Quartet_DNA_ILM_Nova_WUX_LCL7_2_20190917,row.Quartet_DNA_ILM_Nova_WUX_LCL7_3_20190917, \
  174. row.Quartet_DNA_ILM_XTen_ARD_LCL7_1_20170403,row.Quartet_DNA_ILM_XTen_ARD_LCL7_2_20170403,row.Quartet_DNA_ILM_XTen_ARD_LCL7_3_20170403, \
  175. row.Quartet_DNA_ILM_XTen_NVG_LCL7_1_20170329,row.Quartet_DNA_ILM_XTen_NVG_LCL7_2_20170329,row.Quartet_DNA_ILM_XTen_NVG_LCL7_3_20170329, \
  176. row.Quartet_DNA_ILM_XTen_WUX_LCL7_1_20170216,row.Quartet_DNA_ILM_XTen_WUX_LCL7_2_20170216,row.Quartet_DNA_ILM_XTen_WUX_LCL7_3_20170216]
  177. lcl8_list = [row.Quartet_DNA_BGI_SEQ2000_BGI_LCL8_1_20180518,row.Quartet_DNA_BGI_SEQ2000_BGI_LCL8_2_20180530,row.Quartet_DNA_BGI_SEQ2000_BGI_LCL8_3_20180530, \
  178. row.Quartet_DNA_BGI_T7_WGE_LCL8_1_20191105,row.Quartet_DNA_BGI_T7_WGE_LCL8_2_20191105,row.Quartet_DNA_BGI_T7_WGE_LCL8_3_20191105, \
  179. row.Quartet_DNA_ILM_Nova_ARD_LCL8_1_20181108,row.Quartet_DNA_ILM_Nova_ARD_LCL8_2_20181108,row.Quartet_DNA_ILM_Nova_ARD_LCL8_3_20181108, \
  180. row.Quartet_DNA_ILM_Nova_ARD_LCL8_4_20190111,row.Quartet_DNA_ILM_Nova_ARD_LCL8_5_20190111,row.Quartet_DNA_ILM_Nova_ARD_LCL8_6_20190111, \
  181. row.Quartet_DNA_ILM_Nova_BRG_LCL8_1_20180930,row.Quartet_DNA_ILM_Nova_BRG_LCL8_2_20180930,row.Quartet_DNA_ILM_Nova_BRG_LCL8_3_20180930, \
  182. row.Quartet_DNA_ILM_Nova_WUX_LCL8_1_20190917,row.Quartet_DNA_ILM_Nova_WUX_LCL8_2_20190917,row.Quartet_DNA_ILM_Nova_WUX_LCL8_3_20190917, \
  183. row.Quartet_DNA_ILM_XTen_ARD_LCL8_1_20170403,row.Quartet_DNA_ILM_XTen_ARD_LCL8_2_20170403,row.Quartet_DNA_ILM_XTen_ARD_LCL8_3_20170403, \
  184. row.Quartet_DNA_ILM_XTen_NVG_LCL8_1_20170329,row.Quartet_DNA_ILM_XTen_NVG_LCL8_2_20170329,row.Quartet_DNA_ILM_XTen_NVG_LCL8_3_20170329, \
  185. row.Quartet_DNA_ILM_XTen_WUX_LCL8_1_20170216,row.Quartet_DNA_ILM_XTen_WUX_LCL8_2_20170216,row.Quartet_DNA_ILM_XTen_WUX_LCL8_3_20170216]
  186. # LCL5
  187. LCL5_consensus_call, LCL5_detected_num, LCL5_same_diff = consensus_call(lcl5_list)
  188. # LCL6
  189. LCL6_consensus_call, LCL6_detected_num, LCL6_same_diff = consensus_call(lcl6_list)
  190. # LCL7
  191. LCL7_consensus_call, LCL7_detected_num, LCL7_same_diff = consensus_call(lcl7_list)
  192. # LCL8
  193. LCL8_consensus_call, LCL8_detected_num, LCL8_same_diff = consensus_call(lcl8_list)
  194. # all data
  195. all_output = row._1 + '\t' + str(row.POS) + '\t' + row.REF + '\t' + row.ALT + '\t' + LCL5_consensus_call + '\t' + str(LCL5_detected_num) + '\t' + LCL5_same_diff + '\t' +\
  196. LCL6_consensus_call + '\t' + str(LCL6_detected_num) + '\t' + LCL6_same_diff + '\t' +\
  197. LCL7_consensus_call + '\t' + str(LCL7_detected_num) + '\t' + LCL7_same_diff + '\t' +\
  198. LCL8_consensus_call + '\t' + str(LCL8_detected_num) + '\t' + LCL8_same_diff + '\n'
  199. all_sample_outfile.write(all_output)
  200. #consensus vcf
  201. one_position = [LCL5_consensus_call,LCL6_consensus_call,LCL7_consensus_call,LCL8_consensus_call]
  202. if ('notConsensus' in one_position) or (((len(set(one_position)) == 1) and ('./.' in set(one_position))) or ((len(set(one_position)) == 1) and ('0/0' in set(one_position))) or ((len(set(one_position)) == 2) and ('0/0' in set(one_position) and ('./.' in set(one_position))))):
  203. pass
  204. else:
  205. consensus_output = row._1 + '\t' + str(row.POS) + '\t' + '.' + '\t' + row.REF + '\t' + row.ALT + '\t' + '.' + '\t' + '.' + '\t' +'.' + '\t' + 'GT' + '\t' + LCL5_consensus_call + '\t' + LCL6_consensus_call + '\t' + LCL7_consensus_call + '\t' + LCL8_consensus_call +'\n'
  206. consensus_outfile.write(consensus_output)