You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

145 lines
5.2KB

  1. # import modules
  2. import sys, argparse, os
  3. import fileinput
  4. import re
  5. parser = argparse.ArgumentParser(description="This script is to split samples in VCF files and rewrite to the right style")
  6. parser.add_argument('-vcf', '--familyVCF', type=str, help='VCF with sister and mendelian infomation', required=True)
  7. parser.add_argument('-name', '--familyName', type=str, help='Family name of the VCF file', required=True)
  8. args = parser.parse_args()
  9. # Rename input:
  10. inputFile = args.familyVCF
  11. family_name = args.familyName
  12. # output filename
  13. LCL5_name = family_name + '.LCL5.vcf'
  14. LCL5file = open(LCL5_name,'w')
  15. LCL6_name = family_name + '.LCL6.vcf'
  16. LCL6file = open(LCL6_name,'w')
  17. LCL7_name = family_name + '.LCL7.vcf'
  18. LCL7file = open(LCL7_name,'w')
  19. LCL8_name = family_name + '.LCL8.vcf'
  20. LCL8file = open(LCL8_name,'w')
  21. family_filename = family_name + '.vcf'
  22. familyfile = open(family_filename,'w')
  23. # default columns, which will be included in the included in the calssifier
  24. vcfheader = '''##fileformat=VCFv4.2
  25. ##FILTER=<ID=PASS,Description="the same genotype between twin sister and mendelian consistent in 578 and 678">
  26. ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
  27. ##FORMAT=<ID=TWINS,Number=0,Type=Flag,Description="0 for sister consistent, 1 for sister inconsistent">
  28. ##FORMAT=<ID=TRIO5,Number=0,Type=Flag,Description="0 for trio consistent, 1 for trio inconsistent">
  29. ##FORMAT=<ID=TRIO6,Number=0,Type=Flag,Description="0 for trio consistent, 1 for trio inconsistent">
  30. ##contig=<ID=chr1,length=248956422>
  31. ##contig=<ID=chr2,length=242193529>
  32. ##contig=<ID=chr3,length=198295559>
  33. ##contig=<ID=chr4,length=190214555>
  34. ##contig=<ID=chr5,length=181538259>
  35. ##contig=<ID=chr6,length=170805979>
  36. ##contig=<ID=chr7,length=159345973>
  37. ##contig=<ID=chr8,length=145138636>
  38. ##contig=<ID=chr9,length=138394717>
  39. ##contig=<ID=chr10,length=133797422>
  40. ##contig=<ID=chr11,length=135086622>
  41. ##contig=<ID=chr12,length=133275309>
  42. ##contig=<ID=chr13,length=114364328>
  43. ##contig=<ID=chr14,length=107043718>
  44. ##contig=<ID=chr15,length=101991189>
  45. ##contig=<ID=chr16,length=90338345>
  46. ##contig=<ID=chr17,length=83257441>
  47. ##contig=<ID=chr18,length=80373285>
  48. ##contig=<ID=chr19,length=58617616>
  49. ##contig=<ID=chr20,length=64444167>
  50. ##contig=<ID=chr21,length=46709983>
  51. ##contig=<ID=chr22,length=50818468>
  52. ##contig=<ID=chrX,length=156040895>
  53. '''
  54. # write VCF
  55. LCL5colname = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t'+family_name+'_LCL5'+'\n'
  56. LCL5file.write(vcfheader)
  57. LCL5file.write(LCL5colname)
  58. LCL6colname = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t'+family_name+'_LCL6'+'\n'
  59. LCL6file.write(vcfheader)
  60. LCL6file.write(LCL6colname)
  61. LCL7colname = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t'+family_name+'_LCL7'+'\n'
  62. LCL7file.write(vcfheader)
  63. LCL7file.write(LCL7colname)
  64. LCL8colname = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t'+family_name+'_LCL8'+'\n'
  65. LCL8file.write(vcfheader)
  66. LCL8file.write(LCL8colname)
  67. familycolname = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t'+'LCL5\t'+'LCL6\t'+'LCL7\t'+'LCL8'+'\n'
  68. familyfile.write(vcfheader)
  69. familyfile.write(familycolname)
  70. # reform VCF
  71. def process(oneLine):
  72. line = oneLine.rstrip()
  73. strings = line.strip().split('\t')
  74. # replace .
  75. # LCL6 uniq
  76. if strings[11] == '.':
  77. strings[11] = '0/0'
  78. strings[9] = strings[12]
  79. strings[10] = strings[13]
  80. else:
  81. pass
  82. # LCL5 uniq
  83. if strings[14] == '.':
  84. strings[14] = '0/0'
  85. strings[12] = strings[9]
  86. strings[13] = strings[10]
  87. else:
  88. pass
  89. # sister
  90. if strings[11] == strings[14]:
  91. add_format = ":1"
  92. else:
  93. add_format = ":0"
  94. # trioLCL5
  95. if strings[15] == 'MD=1':
  96. add_format = add_format + ":1"
  97. else:
  98. add_format = add_format + ":0"
  99. # trioLCL6
  100. if strings[7] == 'MD=1':
  101. add_format = add_format + ":1"
  102. else:
  103. add_format = add_format + ":0"
  104. # filter
  105. if (strings[11] == strings[14]) and (strings[15] == 'MD=1') and (strings[7] == 'MD=1'):
  106. strings[6] = 'PASS'
  107. else:
  108. strings[6] = '.'
  109. # output LCL5
  110. LCL5outLine = strings[0]+'\t'+strings[1]+'\t'+strings[2]+'\t'+strings[3]+'\t'+strings[4]+'\t'+'.'+'\t'+strings[6]+'\t'+ '.' +'\t'+ 'GT:TWINS:TRIO5:TRIO6' + '\t' + strings[14] + add_format + '\n'
  111. LCL5file.write(LCL5outLine)
  112. # output LCL6
  113. LCL6outLine = strings[0]+'\t'+strings[1]+'\t'+strings[2]+'\t'+strings[3]+'\t'+strings[4]+'\t'+'.'+'\t'+strings[6]+'\t'+ '.' +'\t'+ 'GT:TWINS:TRIO5:TRIO6' + '\t' + strings[11] + add_format + '\n'
  114. LCL6file.write(LCL6outLine)
  115. # output LCL7
  116. LCL7outLine = strings[0]+'\t'+strings[1]+'\t'+strings[2]+'\t'+strings[3]+'\t'+strings[4]+'\t'+'.'+'\t'+strings[6]+'\t'+ '.' +'\t'+ 'GT:TWINS:TRIO5:TRIO6' + '\t' + strings[10] + add_format + '\n'
  117. LCL7file.write(LCL7outLine)
  118. # output LCL8
  119. LCL8outLine = strings[0]+'\t'+strings[1]+'\t'+strings[2]+'\t'+strings[3]+'\t'+strings[4]+'\t'+'.'+'\t'+strings[6]+'\t'+ '.' +'\t'+ 'GT:TWINS:TRIO5:TRIO6' + '\t' + strings[9] + add_format + '\n'
  120. LCL8file.write(LCL8outLine)
  121. # output family
  122. familyoutLine = strings[0]+'\t'+strings[1]+'\t'+strings[2]+'\t'+strings[3]+'\t'+strings[4]+'\t'+ '.'+'\t'+strings[6]+'\t'+ '.' +'\t'+ 'GT:TWINS:TRIO5:TRIO6' + '\t' + strings[14] + add_format +'\t' + strings[11] + add_format + '\t' + strings[10] + add_format +'\t' + strings[9] + add_format + '\n'
  123. familyfile.write(familyoutLine)
  124. for line in fileinput.input(inputFile):
  125. m = re.match('^\#',line)
  126. if m is not None:
  127. pass
  128. else:
  129. process(line)