"""Split a merged family VCF into per-sample VCFs and a combined family VCF.

For every non-header record of the input, one line is written to each of
``<name>.LCL5.vcf``, ``<name>.LCL6.vcf``, ``<name>.LCL7.vcf``,
``<name>.LCL8.vcf`` and ``<name>.vcf``.  Each genotype is annotated with three
0/1 flags (FORMAT ``GT:TWINS:TRIO5:TRIO6``).
"""
# import modules
import sys
import argparse
import os
import fileinput
import re
from contextlib import ExitStack

# Sample names, in the order their columns appear in the family VCF.
SAMPLES = ('LCL5', 'LCL6', 'LCL7', 'LCL8')
# Key used for the combined (all four samples) output.
FAMILY = 'family'
# Order in which the outputs are opened and written.
OUTPUT_KEYS = SAMPLES + (FAMILY,)

# The nine fixed VCF columns, ending with a tab so sample names can be appended.
FIXED_COLUMNS = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t'
# FORMAT field written for every record.
FORMAT_KEYS = 'GT:TWINS:TRIO5:TRIO6'
# Fewest tab-separated fields a record needs (the highest index read is 15).
MIN_FIELDS = 16

# Meta-information header written at the top of every output file.
# NOTE(review): every meta-line below has an empty value after '='; the
# '<ID=...>' payloads look like they were lost from this copy of the script.
# Confirm against the intended header (and the number of contig lines)
# before feeding the outputs to a strict VCF parser.
vcfheader = '''##fileformat=VCFv4.2
##FILTER=
##FORMAT=
##FORMAT=
##FORMAT=
##FORMAT=
##contig=
##contig=
##contig=
##contig=
##contig=
##contig=
##contig=
##contig=
##contig=
##contig=
##contig=
##contig=
##contig=
##contig=
##contig=
##contig=
##contig=
##contig=
##contig=
##contig=
##contig=
##contig=
##contig=
'''

# Writable handles keyed by OUTPUT_KEYS.  main() fills this in while it runs so
# that process(line) can be called with a single argument.
OUTPUTS = {}


def reform_record(oneLine):
    """Rewrite one input record into the five output lines.

    Fields are read by position from the tab-separated record:

    * 0-4 are copied through unchanged (CHROM, POS, ID, REF, ALT).
    * 9, 10, 11 and 14 are emitted as the genotypes of LCL8, LCL7, LCL6 and
      LCL5 respectively; 12 and 13 are used as fallbacks for 9 and 10.
    * 7 and 15 are compared with ``'MD=1'`` (presumably the INFO columns of
      the two merged trio VCFs -- confirm against the upstream merge step).

    QUAL and INFO are always written as ``'.'``; FILTER is ``'PASS'`` only
    when all three flags are 1, otherwise ``'.'``.

    Args:
        oneLine: A non-blank record line; surrounding whitespace is stripped.

    Returns:
        A dict mapping ``'LCL5'``, ``'LCL6'``, ``'LCL7'``, ``'LCL8'`` and
        ``'family'`` to the newline-terminated line for that output.

    Raises:
        IndexError: If the record has fewer than 16 tab-separated fields.
    """
    strings = oneLine.strip().split('\t')
    if len(strings) < MIN_FIELDS:
        raise IndexError(
            'expected at least {} tab-separated fields, got {}: {!r}'.format(
                MIN_FIELDS, len(strings), oneLine))

    # LCL6 genotype missing: call it 0/0 and take columns 9/10 from 12/13.
    if strings[11] == '.':
        strings[11] = '0/0'
        strings[9] = strings[12]
        strings[10] = strings[13]

    # LCL5 genotype missing: call it 0/0 and mirror 9/10 into 12/13
    # (12/13 are not read again below; kept so both sets stay in sync).
    if strings[14] == '.':
        strings[14] = '0/0'
        strings[12] = strings[9]
        strings[13] = strings[10]

    twins_agree = strings[11] == strings[14]
    trio5_ok = strings[15] == 'MD=1'
    trio6_ok = strings[7] == 'MD=1'

    # Flags are appended in FORMAT order: TWINS, TRIO5, TRIO6.
    add_format = ''.join(
        ':1' if flag else ':0' for flag in (twins_agree, trio5_ok, trio6_ok))

    strings[6] = 'PASS' if (twins_agree and trio5_ok and trio6_ok) else '.'

    prefix = '\t'.join(strings[0:5] + ['.', strings[6], '.', FORMAT_KEYS])
    genotypes = {
        'LCL5': strings[14] + add_format,
        'LCL6': strings[11] + add_format,
        'LCL7': strings[10] + add_format,
        'LCL8': strings[9] + add_format,
    }

    rendered = {
        sample: prefix + '\t' + genotypes[sample] + '\n' for sample in SAMPLES
    }
    rendered[FAMILY] = (
        prefix + '\t' + '\t'.join(genotypes[sample] for sample in SAMPLES) + '\n')
    return rendered


# reform VCF
def process(oneLine, outputs=None):
    """Rewrite one record and write it to every output file.

    Args:
        oneLine: A record line from the family VCF.  Blank (whitespace-only)
            lines are ignored instead of raising.
        outputs: Optional mapping from ``'LCL5'``, ``'LCL6'``, ``'LCL7'``,
            ``'LCL8'`` and ``'family'`` to objects with a ``write`` method.
            Defaults to the module-level ``OUTPUTS`` populated by ``main``.

    Raises:
        IndexError: If a non-blank record has fewer than 16 fields.
        KeyError: If a required output handle is missing from the mapping.
    """
    if not oneLine.strip():
        return
    sinks = OUTPUTS if outputs is None else outputs
    rendered = reform_record(oneLine)
    for key in OUTPUT_KEYS:
        sinks[key].write(rendered[key])


def main(argv=None):
    """Parse the command line, then split the family VCF.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(description="This script is to split samples in VCF files and rewrite to the right style")
    parser.add_argument('-vcf', '--familyVCF', type=str, help='VCF with sister and mendelian infomation', required=True)
    parser.add_argument('-name', '--familyName', type=str, help='Family name of the VCF file', required=True)
    args = parser.parse_args(argv)

    # Rename input:
    inputFile = args.familyVCF
    family_name = args.familyName

    # output filenames and the '#CHROM ...' column line of each file
    paths = {sample: family_name + '.' + sample + '.vcf' for sample in SAMPLES}
    paths[FAMILY] = family_name + '.vcf'
    colnames = {
        sample: FIXED_COLUMNS + family_name + '_' + sample + '\n'
        for sample in SAMPLES
    }
    colnames[FAMILY] = FIXED_COLUMNS + '\t'.join(SAMPLES) + '\n'

    with ExitStack() as stack:
        # Registered first, so it runs last: forget the handles once closed.
        stack.callback(OUTPUTS.clear)
        for key in OUTPUT_KEYS:
            OUTPUTS[key] = stack.enter_context(open(paths[key], 'w'))

        # write VCF header
        for key in OUTPUT_KEYS:
            OUTPUTS[key].write(vcfheader)
            OUTPUTS[key].write(colnames[key])

        with fileinput.input(inputFile) as records:
            for line in records:
                # Header/meta lines of the input are dropped; each output
                # gets the fixed header written above instead.
                if line.startswith('#'):
                    continue
                process(line)


if __name__ == '__main__':
    main()