chenhaonan
/
quartet_dna_quality_control_wes_big_pipeline
forked from renluyao/quartet_dna_quality_control_wes_big_pipeline


			
							# import modules
import sys, argparse, os
import fileinput
import re

parser = argparse.ArgumentParser(description="This script is to split samples in VCF files and rewrite to the right style")

parser.add_argument('-vcf', '--familyVCF', type=str, help='VCF with sister and mendelian infomation',  required=True)
parser.add_argument('-name', '--familyName', type=str, help='Family name of the VCF file',  required=True)

args = parser.parse_args()

# Rename input:
inputFile = args.familyVCF
family_name = args.familyName

# output filename
LCL5_name = family_name + '.LCL5.vcf'
LCL5file = open(LCL5_name,'w')
LCL6_name = family_name + '.LCL6.vcf'
LCL6file = open(LCL6_name,'w')
LCL7_name = family_name + '.LCL7.vcf'
LCL7file = open(LCL7_name,'w')
LCL8_name = family_name + '.LCL8.vcf'
LCL8file = open(LCL8_name,'w')
family_filename = family_name + '.vcf'
familyfile = open(family_filename,'w')

# default columns, which will be included in the included in the calssifier
vcfheader = '''##fileformat=VCFv4.2
##FILTER=<ID=PASS,Description="the same genotype between twin sister and mendelian consistent in 578 and 678">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=TWINS,Number=0,Type=Flag,Description="0 for sister consistent, 1 for sister inconsistent">
##FORMAT=<ID=TRIO5,Number=0,Type=Flag,Description="0 for trio consistent, 1 for trio inconsistent">
##FORMAT=<ID=TRIO6,Number=0,Type=Flag,Description="0 for trio consistent, 1 for trio inconsistent">
##contig=<ID=chr1,length=248956422>
##contig=<ID=chr2,length=242193529>
##contig=<ID=chr3,length=198295559>
##contig=<ID=chr4,length=190214555>
##contig=<ID=chr5,length=181538259>
##contig=<ID=chr6,length=170805979>
##contig=<ID=chr7,length=159345973>
##contig=<ID=chr8,length=145138636>
##contig=<ID=chr9,length=138394717>
##contig=<ID=chr10,length=133797422>
##contig=<ID=chr11,length=135086622>
##contig=<ID=chr12,length=133275309>
##contig=<ID=chr13,length=114364328>
##contig=<ID=chr14,length=107043718>
##contig=<ID=chr15,length=101991189>
##contig=<ID=chr16,length=90338345>
##contig=<ID=chr17,length=83257441>
##contig=<ID=chr18,length=80373285>
##contig=<ID=chr19,length=58617616>
##contig=<ID=chr20,length=64444167>
##contig=<ID=chr21,length=46709983>
##contig=<ID=chr22,length=50818468>
##contig=<ID=chrX,length=156040895>
'''
# write VCF
LCL5colname = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t'+family_name+'_LCL5'+'\n'
LCL5file.write(vcfheader)
LCL5file.write(LCL5colname)

LCL6colname = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t'+family_name+'_LCL6'+'\n'
LCL6file.write(vcfheader)
LCL6file.write(LCL6colname)

LCL7colname = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t'+family_name+'_LCL7'+'\n'
LCL7file.write(vcfheader)
LCL7file.write(LCL7colname)

LCL8colname = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t'+family_name+'_LCL8'+'\n'
LCL8file.write(vcfheader)
LCL8file.write(LCL8colname)

familycolname = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t'+'LCL5\t'+'LCL6\t'+'LCL7\t'+'LCL8'+'\n'
familyfile.write(vcfheader)
familyfile.write(familycolname)

# reform VCF
def process(oneLine):
	line = oneLine.rstrip()
	strings = line.strip().split('\t')
	# replace .
	# LCL6 uniq
	if strings[11] == '.':
		strings[11] = '0/0'
		strings[9] = strings[12]
		strings[10] = strings[13]
	else:
		pass
	# LCL5 uniq
	if strings[14] == '.':
		strings[14] = '0/0'
		strings[12] = strings[9]
		strings[13] = strings[10]
	else:
		pass
	# sister
	if strings[11] == strings[14]:
		add_format = ":1"
	else:
		add_format = ":0"
	# trioLCL5
	if strings[15] == 'MD=1':
		add_format = add_format + ":1"
	else:
		add_format = add_format + ":0"
	# trioLCL6
	if strings[7] == 'MD=1':
		add_format = add_format + ":1"
	else:
		add_format = add_format + ":0"
	# filter
	if (strings[11] == strings[14]) and (strings[15] == 'MD=1') and (strings[7] == 'MD=1'):
		strings[6] = 'PASS'
	else:
		strings[6] = '.'
	# output LCL5
	LCL5outLine = strings[0]+'\t'+strings[1]+'\t'+strings[2]+'\t'+strings[3]+'\t'+strings[4]+'\t'+'.'+'\t'+strings[6]+'\t'+ '.' +'\t'+ 'GT:TWINS:TRIO5:TRIO6' + '\t' + strings[14] + add_format + '\n'
	LCL5file.write(LCL5outLine)
	# output LCL6
	LCL6outLine = strings[0]+'\t'+strings[1]+'\t'+strings[2]+'\t'+strings[3]+'\t'+strings[4]+'\t'+'.'+'\t'+strings[6]+'\t'+ '.' +'\t'+ 'GT:TWINS:TRIO5:TRIO6' + '\t' + strings[11] + add_format + '\n'
	LCL6file.write(LCL6outLine)
	# output LCL7
	LCL7outLine = strings[0]+'\t'+strings[1]+'\t'+strings[2]+'\t'+strings[3]+'\t'+strings[4]+'\t'+'.'+'\t'+strings[6]+'\t'+ '.' +'\t'+ 'GT:TWINS:TRIO5:TRIO6' + '\t' + strings[10] + add_format + '\n'
	LCL7file.write(LCL7outLine)
	# output LCL8
	LCL8outLine = strings[0]+'\t'+strings[1]+'\t'+strings[2]+'\t'+strings[3]+'\t'+strings[4]+'\t'+'.'+'\t'+strings[6]+'\t'+ '.' +'\t'+ 'GT:TWINS:TRIO5:TRIO6' + '\t' + strings[9] + add_format + '\n'
	LCL8file.write(LCL8outLine)
	# output family
	familyoutLine = strings[0]+'\t'+strings[1]+'\t'+strings[2]+'\t'+strings[3]+'\t'+strings[4]+'\t'+ '.'+'\t'+strings[6]+'\t'+ '.' +'\t'+ 'GT:TWINS:TRIO5:TRIO6' + '\t' + strings[14] + add_format +'\t' + strings[11] + add_format + '\t' + strings[10] + add_format +'\t' + strings[9] + add_format + '\n'
	familyfile.write(familyoutLine)


for line in fileinput.input(inputFile):
	m = re.match('^\#',line)
	if m is not None:
		pass
	else:
		process(line)