You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

97 line
2.0KB

  1. import sys,getopt
  2. import os
  3. import re
  4. import fileinput
  5. import pandas as pd
  6. def usage():
  7. print(
  8. """
  9. Usage: python extract_vcf_information.py -i input_merged_vcf_file -o parsed_file
  10. This script will extract SNVs and Indels information from the vcf files and output a tab-delimited files.
  11. Input:
  12. -i the selected vcf file
  13. Output:
  14. -o tab-delimited parsed file
  15. """)
  16. # select supported small variants
  17. def process(oneLine):
  18. line = oneLine.rstrip()
  19. strings = line.strip().split('\t')
  20. infoParsed = parse_INFO(strings[7])
  21. formatKeys = strings[8].split(':')
  22. formatValues = strings[9].split(':')
  23. for i in range(0,len(formatKeys) -1) :
  24. if formatKeys[i] == 'AD':
  25. ra = formatValues[i].split(',')
  26. infoParsed['RefDP'] = ra[0]
  27. infoParsed['AltDP'] = ra[1]
  28. if (int(ra[1]) + int(ra[0])) != 0:
  29. infoParsed['af'] = float(int(ra[1])/(int(ra[1]) + int(ra[0])))
  30. else:
  31. pass
  32. else:
  33. infoParsed[formatKeys[i]] = formatValues[i]
  34. infoParsed['chromo'] = strings[0]
  35. infoParsed['pos'] = strings[1]
  36. infoParsed['id'] = strings[2]
  37. infoParsed['ref'] = strings[3]
  38. infoParsed['alt'] = strings[4]
  39. infoParsed['qual'] = strings[5]
  40. return infoParsed
  41. def parse_INFO(info):
  42. strings = info.strip().split(';')
  43. keys = []
  44. values = []
  45. for i in strings:
  46. kv = i.split('=')
  47. if kv[0] == 'DB':
  48. keys.append('DB')
  49. values.append('1')
  50. elif kv[0] == 'AF':
  51. pass
  52. elif kv[0] == 'POSITIVE_TRAIN_SITE':
  53. pass
  54. elif kv[0] == 'NEGATIVE_TRAIN_SITE':
  55. pass
  56. else:
  57. keys.append(kv[0])
  58. values.append(kv[1])
  59. infoDict = dict(zip(keys, values))
  60. return infoDict
  61. opts,args = getopt.getopt(sys.argv[1:],"hi:o:")
  62. for op,value in opts:
  63. if op == "-i":
  64. inputFile=value
  65. elif op == "-o":
  66. outputFile=value
  67. elif op == "-h":
  68. usage()
  69. sys.exit()
  70. if len(sys.argv[1:]) < 3:
  71. usage()
  72. sys.exit()
  73. allDict = []
  74. for line in fileinput.input(inputFile):
  75. m = re.match('^\#',line)
  76. if m is not None:
  77. pass
  78. else:
  79. oneDict = process(line)
  80. allDict.append(oneDict)
  81. allTable = pd.DataFrame(allDict)
  82. allTable.to_csv(outputFile,sep='\t',index=False)