|
1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192 |
- import sys,getopt
- import os
- import re
- import fileinput
- import pandas as pd
-
- def usage():
- print(
- """
- Usage: python extract_vcf_information.py -i input_merged_vcf_file -o parsed_file
-
- This script will extract SNVs and Indels information from the vcf files and output a tab-delimited files.
-
- Input:
- -i the selected vcf file
-
- Output:
- -o tab-delimited parsed file
- """)
-
- # select supported small variants
- def process(oneLine):
- line = oneLine.rstrip()
- strings = line.strip().split('\t')
- infoParsed = parse_INFO(strings[7])
- formatKeys = strings[8].split(':')
- formatValues = strings[9].split(':')
- for i in range(0,len(formatKeys) -1) :
- if formatKeys[i] == 'AD':
- ra = formatValues[i].split(',')
- infoParsed['RefDP'] = ra[0]
- infoParsed['AltDP'] = ra[1]
- if (int(ra[1]) + int(ra[0])) != 0:
- infoParsed['af'] = float(int(ra[1])/(int(ra[1]) + int(ra[0])))
- else:
- pass
- else:
- infoParsed[formatKeys[i]] = formatValues[i]
- infoParsed['chromo'] = strings[0]
- infoParsed['pos'] = strings[1]
- infoParsed['id'] = strings[2]
- infoParsed['ref'] = strings[3]
- infoParsed['alt'] = strings[4]
- infoParsed['qual'] = strings[5]
- return infoParsed
-
-
- def parse_INFO(info):
- strings = info.strip().split(';')
- keys = []
- values = []
- for i in strings:
- kv = i.split('=')
- if kv[0] == 'DB':
- keys.append('DB')
- values.append('1')
- elif kv[0] == 'AF':
- pass
- else:
- keys.append(kv[0])
- values.append(kv[1])
- infoDict = dict(zip(keys, values))
- return infoDict
-
-
- opts,args = getopt.getopt(sys.argv[1:],"hi:o:")
- for op,value in opts:
- if op == "-i":
- inputFile=value
- elif op == "-o":
- outputFile=value
- elif op == "-h":
- usage()
- sys.exit()
-
- if len(sys.argv[1:]) < 3:
- usage()
- sys.exit()
-
- allDict = []
- for line in fileinput.input(inputFile):
- m = re.match('^\#',line)
- if m is not None:
- pass
- else:
- oneDict = process(line)
- allDict.append(oneDict)
-
- allTable = pd.DataFrame(allDict)
-
- allTable.to_csv(outputFile,sep='\t',index=False)
-
|