You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

73 lines
1.9KB

  1. import pandas as pd
  2. import sys, argparse, os
  3. mut = mut = pd.read_table('/mnt/pgx_src_data_pool_4/home/renluyao/manuscript/benchmark_calls/vcf/mutation_type',header=None)
  4. vote = pd.read_table('/mnt/pgx_src_data_pool_4/home/renluyao/manuscript/benchmark_calls/all_info/benchmark.vote.mendelian.txt',header=None)
  5. merged_df = pd.merge(vote, mut, how='inner', left_on=[0,1], right_on = [0,1])
  6. outFile = open(sys.argv[1],'w')
  7. outIndel = open(sys.argv[2],'w')
  8. for row in merged_df.itertuples():
  9. #d5
  10. if ',' in row._7:
  11. d5 = row._7.split(',')
  12. d5_len = [len(i) for i in d5]
  13. d5_alt = max(d5_len)
  14. else:
  15. d5_alt = len(row._7)
  16. #d6
  17. if ',' in row._15:
  18. d6 = row._15.split(',')
  19. d6_len = [len(i) for i in d6]
  20. d6_alt = max(d6_len)
  21. else:
  22. d6_alt = len(row._15)
  23. #f7
  24. if ',' in row._23:
  25. f7 = row._23.split(',')
  26. f7_len = [len(i) for i in f7]
  27. f7_alt = max(f7_len)
  28. else:
  29. f7_alt = len(row._23)
  30. #m8
  31. if ',' in row._31:
  32. m8 = row._31.split(',')
  33. m8_len = [len(i) for i in m8]
  34. m8_alt = max(m8_len)
  35. else:
  36. m8_alt = len(row._31)
  37. all_length = [d5_alt,d6_alt,f7_alt,m8_alt]
  38. alt = max(all_length)
  39. ref = row._35
  40. pos = int(row._2)
  41. if len(ref) == 1 and alt == 1:
  42. StartPos = int(pos) -1
  43. EndPos = int(pos)
  44. cate = 'SNV'
  45. elif len(ref) > alt:
  46. StartPos = int(pos) - 1
  47. EndPos = int(pos) + (len(ref) - 1)
  48. cate = 'INDEL'
  49. outline_indel = row._1 + '\t' + str(StartPos) + '\t' + str(EndPos) + '\n'
  50. outIndel.write(outline_indel)
  51. elif alt > len(ref):
  52. StartPos = int(pos) - 1
  53. EndPos = int(pos) + (alt - 1)
  54. cate = 'INDEL'
  55. outline_indel = row._1 + '\t' + str(StartPos) + '\t' + str(EndPos) + '\n'
  56. outIndel.write(outline_indel)
  57. elif len(ref) == alt:
  58. StartPos = int(pos) - 1
  59. EndPos = int(pos) + (alt - 1)
  60. cate = 'INDEL'
  61. outline_indel = row._1 + '\t' + str(StartPos) + '\t' + str(EndPos) + '\n'
  62. outIndel.write(outline_indel)
  63. outline = row._1 + '\t' + str(StartPos) + '\t' + str(EndPos) + '\t' + str(row._2) + '\t' + cate + '\n'
  64. outFile.write(outline)