Você não pode selecionar mais de 25 tópicos Os tópicos devem começar com uma letra ou um número, podem incluir traços ('-') e podem ter até 35 caracteres.

replicates_consensus.py 12KB

5 anos atrás
5 anos atrás
5 anos atrás
5 anos atrás
5 anos atrás
5 anos atrás
5 anos atrás
5 anos atrás
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227
  1. from __future__ import division
  2. from glob import glob
  3. import sys, argparse, os
  4. import fileinput
  5. import re
  6. import pandas as pd
  7. from operator import itemgetter
  8. from collections import Counter
  9. from itertools import islice
  10. from numpy import *
  11. import statistics
  12. # input arguments
  13. parser = argparse.ArgumentParser(description="this script is to merge mendelian and vcfinfo, and extract high_confidence_calls")
  14. parser.add_argument('-prefix', '--prefix', type=str, help='prefix of output file', required=True)
  15. parser.add_argument('-vcf', '--vcf', type=str, help='merged multiple sample vcf', required=True)
  16. args = parser.parse_args()
  17. prefix = args.prefix
  18. vcf = args.vcf
  19. # input files
  20. vcf_dat = pd.read_table(vcf)
  21. # all info
  22. all_file_name = prefix + "_all_summary.txt"
  23. all_sample_outfile = open(all_file_name,'w')
  24. all_info_col = 'CHROM\tPOS\tREF\tALT\tLCL5_consensus_calls\tLCL5_detect_number\tLCL5_same_diff\tLCL6_consensus_calls\tLCL6_detect_number\tLCl6_same_diff\tLCL7_consensus_calls\tLCL7_detect_number\tLCL7_same_diff\tLCL8_consensus_calls\tLCL8_detect_number\tLCL8_same_diff\n'
  25. all_sample_outfile.write(all_info_col)
  26. # filtered info
  27. vcf_header = '''##fileformat=VCFv4.2
  28. ##fileDate=20200501
  29. ##source=high_confidence_calls_intergration(choppy app)
  30. ##reference=GRCh38.d1.vd1
  31. ##INFO=<ID=VOTED,Number=1,Type=Integer,Description="Number mendelian consisitent votes">
  32. ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
  33. ##contig=<ID=chr1,length=248956422>
  34. ##contig=<ID=chr2,length=242193529>
  35. ##contig=<ID=chr3,length=198295559>
  36. ##contig=<ID=chr4,length=190214555>
  37. ##contig=<ID=chr5,length=181538259>
  38. ##contig=<ID=chr6,length=170805979>
  39. ##contig=<ID=chr7,length=159345973>
  40. ##contig=<ID=chr8,length=145138636>
  41. ##contig=<ID=chr9,length=138394717>
  42. ##contig=<ID=chr10,length=133797422>
  43. ##contig=<ID=chr11,length=135086622>
  44. ##contig=<ID=chr12,length=133275309>
  45. ##contig=<ID=chr13,length=114364328>
  46. ##contig=<ID=chr14,length=107043718>
  47. ##contig=<ID=chr15,length=101991189>
  48. ##contig=<ID=chr16,length=90338345>
  49. ##contig=<ID=chr17,length=83257441>
  50. ##contig=<ID=chr18,length=80373285>
  51. ##contig=<ID=chr19,length=58617616>
  52. ##contig=<ID=chr20,length=64444167>
  53. ##contig=<ID=chr21,length=46709983>
  54. ##contig=<ID=chr22,length=50818468>
  55. ##contig=<ID=chrX,length=156040895>
  56. '''
  57. consensus_file_name = prefix + "_consensus.vcf"
  58. consensus_outfile = open(consensus_file_name,'w')
  59. consensus_col = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tLCL5_consensus_call\tLCL6_consensus_call\tLCL7_consensus_call\tLCL8_consensus_call\n'
  60. consensus_outfile.write(vcf_header)
  61. consensus_outfile.write(consensus_col)
  62. # function
  63. def decide_by_rep(vcf_list):
  64. consensus_rep = ''
  65. gt = [x.split(':')[0] for x in vcf_list]
  66. gt_num_dict = Counter(gt)
  67. highest_gt = gt_num_dict.most_common(1)
  68. candidate_gt = highest_gt[0][0]
  69. freq_gt = highest_gt[0][1]
  70. if freq_gt >= 2:
  71. consensus_rep = candidate_gt
  72. else:
  73. consensus_rep = 'inconGT'
  74. return consensus_rep
  75. def consensus_call(vcf_info_list):
  76. consensus_call = '.'
  77. detect_number = '.'
  78. same_diff = '.'
  79. # pcr
  80. SEQ2000 = decide_by_rep(vcf_info_list[0:3])
  81. XTen_ARD = decide_by_rep(vcf_info_list[18:21])
  82. XTen_NVG = decide_by_rep(vcf_info_list[21:24])
  83. XTen_WUX = decide_by_rep(vcf_info_list[24:27])
  84. Nova_WUX = decide_by_rep(vcf_info_list[15:18])
  85. pcr_sequence_site = [SEQ2000,XTen_ARD,XTen_NVG,XTen_WUX,Nova_WUX]
  86. pcr_sequence_dict = Counter(pcr_sequence_site)
  87. pcr_highest_sequence = pcr_sequence_dict.most_common(1)
  88. pcr_candidate_sequence = pcr_highest_sequence[0][0]
  89. pcr_freq_sequence = pcr_highest_sequence[0][1]
  90. if pcr_freq_sequence > 3:
  91. pcr_consensus = pcr_candidate_sequence
  92. else:
  93. pcr_consensus = 'inconSequenceSite'
  94. # pcr-free
  95. T7_WGE = decide_by_rep(vcf_info_list[3:6])
  96. Nova_ARD_1 = decide_by_rep(vcf_info_list[6:9])
  97. Nova_ARD_2 = decide_by_rep(vcf_info_list[9:12])
  98. Nova_BRG = decide_by_rep(vcf_info_list[12:15])
  99. sequence_site = [T7_WGE,Nova_ARD_1,Nova_ARD_2,Nova_BRG]
  100. sequence_dict = Counter(sequence_site)
  101. highest_sequence = sequence_dict.most_common(1)
  102. candidate_sequence = highest_sequence[0][0]
  103. freq_sequence = highest_sequence[0][1]
  104. if freq_sequence > 2:
  105. pcr_free_consensus = candidate_sequence
  106. else:
  107. pcr_free_consensus = 'inconSequenceSite'
  108. gt = [x.split(':')[0] for x in vcf_info_list]
  109. gt = [x.replace('./.','.') for x in gt]
  110. detected_num = 27 - gt.count('.')
  111. gt_remain = [e for e in gt if e not in {'.'}]
  112. gt_set = set(gt_remain)
  113. if len(gt_set) == 1:
  114. same_diff = 'same'
  115. else:
  116. same_diff = 'diff'
  117. tag = ['inconGT','inconSequenceSite']
  118. if (pcr_consensus == pcr_free_consensus) and (pcr_consensus not in tag):
  119. consensus_call = pcr_consensus
  120. elif (pcr_consensus in tag) and (pcr_free_consensus in tag):
  121. consensus_call = 'notAgree'
  122. else:
  123. consensus_call = 'notConsensus'
  124. return consensus_call, detected_num, same_diff
  125. elif (pcr_consensus in tag) and (pcr_free_consensus in tag):
  126. consensus_call = 'filtered'
  127. elif ((pcr_consensus == './.') or (pcr_consensus in tag)) and ((pcr_free_consensus not in tag) and (pcr_free_consensus != './.')):
  128. consensus_call = 'pcr-free-speicifc'
  129. elif ((pcr_consensus != './.') or (pcr_consensus not in tag)) and ((pcr_free_consensus in tag) and (pcr_free_consensus == './.')):
  130. consensus_call = 'pcr-speicifc'
  131. elif (pcr_consensus == '0/0') and (pcr_free_consensus == '0/0'):
  132. consensus_call = '0/0'
  133. else:
  134. consensus_call = 'filtered'
  135. for row in vcf_dat.itertuples():
  136. # length
  137. #alt
  138. if ',' in row.ALT:
  139. alt = row.ALT.split(',')
  140. alt_len = [len(i) for i in alt]
  141. alt_max = max(alt_len)
  142. else:
  143. alt_max = len(row.ALT)
  144. #ref
  145. ref_len = len(row.REF)
  146. if (alt_max > 50) or (ref_len > 50):
  147. pass
  148. else:
  149. # consensus
  150. lcl5_list = [row.Quartet_DNA_BGI_SEQ2000_BGI_LCL5_1_20180518,row.Quartet_DNA_BGI_SEQ2000_BGI_LCL5_2_20180530,row.Quartet_DNA_BGI_SEQ2000_BGI_LCL5_3_20180530, \
  151. row.Quartet_DNA_BGI_T7_WGE_LCL5_1_20191105,row.Quartet_DNA_BGI_T7_WGE_LCL5_2_20191105,row.Quartet_DNA_BGI_T7_WGE_LCL5_3_20191105, \
  152. row.Quartet_DNA_ILM_Nova_ARD_LCL5_1_20181108,row.Quartet_DNA_ILM_Nova_ARD_LCL5_2_20181108,row.Quartet_DNA_ILM_Nova_ARD_LCL5_3_20181108, \
  153. row.Quartet_DNA_ILM_Nova_ARD_LCL5_4_20190111,row.Quartet_DNA_ILM_Nova_ARD_LCL5_5_20190111,row.Quartet_DNA_ILM_Nova_ARD_LCL5_6_20190111, \
  154. row.Quartet_DNA_ILM_Nova_BRG_LCL5_1_20180930,row.Quartet_DNA_ILM_Nova_BRG_LCL5_2_20180930,row.Quartet_DNA_ILM_Nova_BRG_LCL5_3_20180930, \
  155. row.Quartet_DNA_ILM_Nova_WUX_LCL5_1_20190917,row.Quartet_DNA_ILM_Nova_WUX_LCL5_2_20190917,row.Quartet_DNA_ILM_Nova_WUX_LCL5_3_20190917, \
  156. row.Quartet_DNA_ILM_XTen_ARD_LCL5_1_20170403,row.Quartet_DNA_ILM_XTen_ARD_LCL5_2_20170403,row.Quartet_DNA_ILM_XTen_ARD_LCL5_3_20170403, \
  157. row.Quartet_DNA_ILM_XTen_NVG_LCL5_1_20170329,row.Quartet_DNA_ILM_XTen_NVG_LCL5_2_20170329,row.Quartet_DNA_ILM_XTen_NVG_LCL5_3_20170329, \
  158. row.Quartet_DNA_ILM_XTen_WUX_LCL5_1_20170216,row.Quartet_DNA_ILM_XTen_WUX_LCL5_2_20170216,row.Quartet_DNA_ILM_XTen_WUX_LCL5_3_20170216]
  159. lcl6_list = [row.Quartet_DNA_BGI_SEQ2000_BGI_LCL6_1_20180518,row.Quartet_DNA_BGI_SEQ2000_BGI_LCL6_2_20180530,row.Quartet_DNA_BGI_SEQ2000_BGI_LCL6_3_20180530, \
  160. row.Quartet_DNA_BGI_T7_WGE_LCL6_1_20191105,row.Quartet_DNA_BGI_T7_WGE_LCL6_2_20191105,row.Quartet_DNA_BGI_T7_WGE_LCL6_3_20191105, \
  161. row.Quartet_DNA_ILM_Nova_ARD_LCL6_1_20181108,row.Quartet_DNA_ILM_Nova_ARD_LCL6_2_20181108,row.Quartet_DNA_ILM_Nova_ARD_LCL6_3_20181108, \
  162. row.Quartet_DNA_ILM_Nova_ARD_LCL6_4_20190111,row.Quartet_DNA_ILM_Nova_ARD_LCL6_5_20190111,row.Quartet_DNA_ILM_Nova_ARD_LCL6_6_20190111, \
  163. row.Quartet_DNA_ILM_Nova_BRG_LCL6_1_20180930,row.Quartet_DNA_ILM_Nova_BRG_LCL6_2_20180930,row.Quartet_DNA_ILM_Nova_BRG_LCL6_3_20180930, \
  164. row.Quartet_DNA_ILM_Nova_WUX_LCL6_1_20190917,row.Quartet_DNA_ILM_Nova_WUX_LCL6_2_20190917,row.Quartet_DNA_ILM_Nova_WUX_LCL6_3_20190917, \
  165. row.Quartet_DNA_ILM_XTen_ARD_LCL6_1_20170403,row.Quartet_DNA_ILM_XTen_ARD_LCL6_2_20170403,row.Quartet_DNA_ILM_XTen_ARD_LCL6_3_20170403, \
  166. row.Quartet_DNA_ILM_XTen_NVG_LCL6_1_20170329,row.Quartet_DNA_ILM_XTen_NVG_LCL6_2_20170329,row.Quartet_DNA_ILM_XTen_NVG_LCL6_3_20170329, \
  167. row.Quartet_DNA_ILM_XTen_WUX_LCL6_1_20170216,row.Quartet_DNA_ILM_XTen_WUX_LCL6_2_20170216,row.Quartet_DNA_ILM_XTen_WUX_LCL6_3_20170216]
  168. lcl7_list = [row.Quartet_DNA_BGI_SEQ2000_BGI_LCL7_1_20180518,row.Quartet_DNA_BGI_SEQ2000_BGI_LCL7_2_20180530,row.Quartet_DNA_BGI_SEQ2000_BGI_LCL7_3_20180530, \
  169. row.Quartet_DNA_BGI_T7_WGE_LCL7_1_20191105,row.Quartet_DNA_BGI_T7_WGE_LCL7_2_20191105,row.Quartet_DNA_BGI_T7_WGE_LCL7_3_20191105, \
  170. row.Quartet_DNA_ILM_Nova_ARD_LCL7_1_20181108,row.Quartet_DNA_ILM_Nova_ARD_LCL7_2_20181108,row.Quartet_DNA_ILM_Nova_ARD_LCL7_3_20181108, \
  171. row.Quartet_DNA_ILM_Nova_ARD_LCL7_4_20190111,row.Quartet_DNA_ILM_Nova_ARD_LCL7_5_20190111,row.Quartet_DNA_ILM_Nova_ARD_LCL7_6_20190111, \
  172. row.Quartet_DNA_ILM_Nova_BRG_LCL7_1_20180930,row.Quartet_DNA_ILM_Nova_BRG_LCL7_2_20180930,row.Quartet_DNA_ILM_Nova_BRG_LCL7_3_20180930, \
  173. row.Quartet_DNA_ILM_Nova_WUX_LCL7_1_20190917,row.Quartet_DNA_ILM_Nova_WUX_LCL7_2_20190917,row.Quartet_DNA_ILM_Nova_WUX_LCL7_3_20190917, \
  174. row.Quartet_DNA_ILM_XTen_ARD_LCL7_1_20170403,row.Quartet_DNA_ILM_XTen_ARD_LCL7_2_20170403,row.Quartet_DNA_ILM_XTen_ARD_LCL7_3_20170403, \
  175. row.Quartet_DNA_ILM_XTen_NVG_LCL7_1_20170329,row.Quartet_DNA_ILM_XTen_NVG_LCL7_2_20170329,row.Quartet_DNA_ILM_XTen_NVG_LCL7_3_20170329, \
  176. row.Quartet_DNA_ILM_XTen_WUX_LCL7_1_20170216,row.Quartet_DNA_ILM_XTen_WUX_LCL7_2_20170216,row.Quartet_DNA_ILM_XTen_WUX_LCL7_3_20170216]
  177. lcl8_list = [row.Quartet_DNA_BGI_SEQ2000_BGI_LCL8_1_20180518,row.Quartet_DNA_BGI_SEQ2000_BGI_LCL8_2_20180530,row.Quartet_DNA_BGI_SEQ2000_BGI_LCL8_3_20180530, \
  178. row.Quartet_DNA_BGI_T7_WGE_LCL8_1_20191105,row.Quartet_DNA_BGI_T7_WGE_LCL8_2_20191105,row.Quartet_DNA_BGI_T7_WGE_LCL8_3_20191105, \
  179. row.Quartet_DNA_ILM_Nova_ARD_LCL8_1_20181108,row.Quartet_DNA_ILM_Nova_ARD_LCL8_2_20181108,row.Quartet_DNA_ILM_Nova_ARD_LCL8_3_20181108, \
  180. row.Quartet_DNA_ILM_Nova_ARD_LCL8_4_20190111,row.Quartet_DNA_ILM_Nova_ARD_LCL8_5_20190111,row.Quartet_DNA_ILM_Nova_ARD_LCL8_6_20190111, \
  181. row.Quartet_DNA_ILM_Nova_BRG_LCL8_1_20180930,row.Quartet_DNA_ILM_Nova_BRG_LCL8_2_20180930,row.Quartet_DNA_ILM_Nova_BRG_LCL8_3_20180930, \
  182. row.Quartet_DNA_ILM_Nova_WUX_LCL8_1_20190917,row.Quartet_DNA_ILM_Nova_WUX_LCL8_2_20190917,row.Quartet_DNA_ILM_Nova_WUX_LCL8_3_20190917, \
  183. row.Quartet_DNA_ILM_XTen_ARD_LCL8_1_20170403,row.Quartet_DNA_ILM_XTen_ARD_LCL8_2_20170403,row.Quartet_DNA_ILM_XTen_ARD_LCL8_3_20170403, \
  184. row.Quartet_DNA_ILM_XTen_NVG_LCL8_1_20170329,row.Quartet_DNA_ILM_XTen_NVG_LCL8_2_20170329,row.Quartet_DNA_ILM_XTen_NVG_LCL8_3_20170329, \
  185. row.Quartet_DNA_ILM_XTen_WUX_LCL8_1_20170216,row.Quartet_DNA_ILM_XTen_WUX_LCL8_2_20170216,row.Quartet_DNA_ILM_XTen_WUX_LCL8_3_20170216]
  186. # LCL5
  187. LCL5_consensus_call, LCL5_detected_num, LCL5_same_diff = consensus_call(lcl5_list)
  188. # LCL6
  189. LCL6_consensus_call, LCL6_detected_num, LCL6_same_diff = consensus_call(lcl6_list)
  190. # LCL7
  191. LCL7_consensus_call, LCL7_detected_num, LCL7_same_diff = consensus_call(lcl7_list)
  192. # LCL8
  193. LCL8_consensus_call, LCL8_detected_num, LCL8_same_diff = consensus_call(lcl8_list)
  194. # all data
  195. all_output = row._1 + '\t' + str(row.POS) + '\t' + row.REF + '\t' + row.ALT + '\t' + LCL5_consensus_call + '\t' + str(LCL5_detected_num) + '\t' + LCL5_same_diff + '\t' +\
  196. LCL6_consensus_call + '\t' + str(LCL6_detected_num) + '\t' + LCL6_same_diff + '\t' +\
  197. LCL7_consensus_call + '\t' + str(LCL7_detected_num) + '\t' + LCL7_same_diff + '\t' +\
  198. LCL8_consensus_call + '\t' + str(LCL8_detected_num) + '\t' + LCL8_same_diff + '\n'
  199. all_sample_outfile.write(all_output)
  200. #consensus vcf
  201. one_position = [LCL5_consensus_call,LCL6_consensus_call,LCL7_consensus_call,LCL8_consensus_call]
  202. if ('notConsensus' in one_position) or (((len(set(one_position)) == 1) and ('./.' in set(one_position))) or ((len(set(one_position)) == 1) and ('0/0' in set(one_position))) or ((len(set(one_position)) == 2) and ('0/0' in set(one_position) and ('./.' in set(one_position))))):
  203. pass
  204. else:
  205. consensus_output = row._1 + '\t' + str(row.POS) + '\t' + '.' + '\t' + row.REF + '\t' + row.ALT + '\t' + '.' + '\t' + '.' + '\t' +'.' + '\t' + 'GT' + '\t' + LCL5_consensus_call + '\t' + LCL6_consensus_call + '\t' + LCL7_consensus_call + '\t' + LCL8_consensus_call +'\n'
  206. consensus_outfile.write(consensus_output)