Nie możesz wybrać więcej, niż 25 tematów Tematy muszą się zaczynać od litery lub cyfry, mogą zawierać myślniki ('-') i mogą mieć do 35 znaków.

93 lines
1.9KB

  1. import sys,getopt
  2. import os
  3. import re
  4. import fileinput
  5. import pandas as pd
  6. def usage():
  7. print(
  8. """
  9. Usage: python extract_vcf_information.py -i input_merged_vcf_file -o parsed_file
  10. This script will extract SNVs and Indels information from the vcf files and output a tab-delimited files.
  11. Input:
  12. -i the selected vcf file
  13. Output:
  14. -o tab-delimited parsed file
  15. """)
  16. # select supported small variants
  17. def process(oneLine):
  18. line = oneLine.rstrip()
  19. strings = line.strip().split('\t')
  20. infoParsed = parse_INFO(strings[7])
  21. formatKeys = strings[8].split(':')
  22. formatValues = strings[9].split(':')
  23. for i in range(0,len(formatKeys) -1) :
  24. if formatKeys[i] == 'AD':
  25. ra = formatValues[i].split(',')
  26. infoParsed['RefDP'] = ra[0]
  27. infoParsed['AltDP'] = ra[1]
  28. if (int(ra[1]) + int(ra[0])) != 0:
  29. infoParsed['af'] = float(int(ra[1])/(int(ra[1]) + int(ra[0])))
  30. else:
  31. pass
  32. else:
  33. infoParsed[formatKeys[i]] = formatValues[i]
  34. infoParsed['chromo'] = strings[0]
  35. infoParsed['pos'] = strings[1]
  36. infoParsed['id'] = strings[2]
  37. infoParsed['ref'] = strings[3]
  38. infoParsed['alt'] = strings[4]
  39. infoParsed['qual'] = strings[5]
  40. return infoParsed
  41. def parse_INFO(info):
  42. strings = info.strip().split(';')
  43. keys = []
  44. values = []
  45. for i in strings:
  46. kv = i.split('=')
  47. if kv[0] == 'DB':
  48. keys.append('DB')
  49. values.append('1')
  50. elif kv[0] == 'AF':
  51. pass
  52. else:
  53. keys.append(kv[0])
  54. values.append(kv[1])
  55. infoDict = dict(zip(keys, values))
  56. return infoDict
  57. opts,args = getopt.getopt(sys.argv[1:],"hi:o:")
  58. for op,value in opts:
  59. if op == "-i":
  60. inputFile=value
  61. elif op == "-o":
  62. outputFile=value
  63. elif op == "-h":
  64. usage()
  65. sys.exit()
  66. if len(sys.argv[1:]) < 3:
  67. usage()
  68. sys.exit()
  69. allDict = []
  70. for line in fileinput.input(inputFile):
  71. m = re.match('^\#',line)
  72. if m is not None:
  73. pass
  74. else:
  75. oneDict = process(line)
  76. allDict.append(oneDict)
  77. allTable = pd.DataFrame(allDict)
  78. allTable.to_csv(outputFile,sep='\t',index=False)