You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

93 lines
1.9KB

  1. import sys,getopt
  2. import os
  3. import re
  4. import fileinput
  5. import pandas as pd
  6. def usage():
  7. print(
  8. """
  9. Usage: python extract_vcf_information.py -i input_merged_vcf_file -o parsed_file
  10. This script will extract SNVs and Indels information from the vcf files and output a tab-delimited files.
  11. Input:
  12. -i the selected vcf file
  13. Output:
  14. -o tab-delimited parsed file
  15. """)
  16. # select supported small variants
  17. def process(oneLine):
  18. line = oneLine.rstrip()
  19. strings = line.strip().split('\t')
  20. infoParsed = parse_INFO(strings[7])
  21. formatKeys = strings[8].split(':')
  22. formatValues = strings[9].split(':')
  23. for i in range(0,len(formatKeys) -1) :
  24. if formatKeys[i] == 'AD':
  25. ra = formatValues[i].split(',')
  26. infoParsed['RefDP'] = ra[0]
  27. infoParsed['AltDP'] = ra[1]
  28. if (int(ra[1]) + int(ra[0])) != 0:
  29. infoParsed['af'] = float(int(ra[1])/(int(ra[1]) + int(ra[0])))
  30. else:
  31. pass
  32. else:
  33. infoParsed[formatKeys[i]] = formatValues[i]
  34. infoParsed['chromo'] = strings[0]
  35. infoParsed['pos'] = strings[1]
  36. infoParsed['id'] = strings[2]
  37. infoParsed['ref'] = strings[3]
  38. infoParsed['alt'] = strings[4]
  39. infoParsed['qual'] = strings[5]
  40. return infoParsed
  41. def parse_INFO(info):
  42. strings = info.strip().split(';')
  43. keys = []
  44. values = []
  45. for i in strings:
  46. kv = i.split('=')
  47. if kv[0] == 'DB':
  48. keys.append('DB')
  49. values.append('1')
  50. elif kv[0] == 'AF':
  51. pass
  52. else:
  53. keys.append(kv[0])
  54. values.append(kv[1])
  55. infoDict = dict(zip(keys, values))
  56. return infoDict
  57. opts,args = getopt.getopt(sys.argv[1:],"hi:o:")
  58. for op,value in opts:
  59. if op == "-i":
  60. inputFile=value
  61. elif op == "-o":
  62. outputFile=value
  63. elif op == "-h":
  64. usage()
  65. sys.exit()
  66. if len(sys.argv[1:]) < 3:
  67. usage()
  68. sys.exit()
  69. allDict = []
  70. for line in fileinput.input(inputFile):
  71. m = re.match('^\#',line)
  72. if m is not None:
  73. pass
  74. else:
  75. oneDict = process(line)
  76. allDict.append(oneDict)
  77. allTable = pd.DataFrame(allDict)
  78. allTable.to_csv(outputFile,sep='\t',index=False)