You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

128 lines
5.0KB

  1. from __future__ import division
  2. import pandas as pd
  3. import sys, argparse, os
  4. import fileinput
  5. import re
  6. # input arguments
  7. parser = argparse.ArgumentParser(description="this script is to get final high confidence calls and information of all replicates")
  8. parser.add_argument('-vcfInfo', '--vcfInfo', type=str, help='The txt file of variants information, this file is named as prefix__variant_quality_location.txt', required=True)
  9. parser.add_argument('-mendelianInfo', '--mendelianInfo', type=str, help='The merged mendelian information of all samples', required=True)
  10. parser.add_argument('-sample', '--sample_name', type=str, help='which sample of quartet', required=True)
  11. args = parser.parse_args()
  12. vcfInfo = args.vcfInfo
  13. mendelianInfo = args.mendelianInfo
  14. sample_name = args.sample_name
  15. #GT:TWINS:TRIO5:TRIO6:DP:AF:GQ:QD:MQ:FS:QUAL
  16. vcf_header = '''##fileformat=VCFv4.2
  17. ##fileDate=20200331
  18. ##source=high_confidence_calls_intergration(choppy app)
  19. ##reference=GRCh38.d1.vd1
  20. #FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
  21. #FORMAT=<ID=TWINS,Number=1,Type=String,Description="1 for sister consistent, 0 for sister different">
  22. #FORMAT=<ID=TRIO5,Number=1,Type=String,Description="1 for LCL7, LCL8 and LCL5 mendelian consistent, 0 for family violation">
  23. #FORMAT=<ID=TRIO6,Number=1,Type=String,Description="1 for LCL7, LCL8 and LCL6 mendelian consistent, 0 for family violation">
  24. ##FORMAT=<ID=DP,Number=1,Type=Int,Description="Depth">
  25. ##FORMAT=<ID=AF,Number=1,Type=Float,Description="Allele frequency">
  26. ##FORMAT=<ID=GQ,Number=1,Type=Float,Description="Genotype quality">
  27. ##FORMAT=<ID=QD,Number=1,Type=Float,Description="Variant Confidence/Quality by Depth">
  28. ##FORMAT=<ID=MQ,Number=1,Type=Float,Description="Mapping quality">
  29. ##FORMAT=<ID=FS,Number=1,Type=Float,Description="Phred-scaled p-value using Fisher's exact test to detect strand bia">
  30. ##FORMAT=<ID=QUAL,Number=1,Type=Float,Description="variant quality">
  31. ##contig=<ID=chr1,length=248956422>
  32. ##contig=<ID=chr2,length=242193529>
  33. ##contig=<ID=chr3,length=198295559>
  34. ##contig=<ID=chr4,length=190214555>
  35. ##contig=<ID=chr5,length=181538259>
  36. ##contig=<ID=chr6,length=170805979>
  37. ##contig=<ID=chr7,length=159345973>
  38. ##contig=<ID=chr8,length=145138636>
  39. ##contig=<ID=chr9,length=138394717>
  40. ##contig=<ID=chr10,length=133797422>
  41. ##contig=<ID=chr11,length=135086622>
  42. ##contig=<ID=chr12,length=133275309>
  43. ##contig=<ID=chr13,length=114364328>
  44. ##contig=<ID=chr14,length=107043718>
  45. ##contig=<ID=chr15,length=101991189>
  46. ##contig=<ID=chr16,length=90338345>
  47. ##contig=<ID=chr17,length=83257441>
  48. ##contig=<ID=chr18,length=80373285>
  49. ##contig=<ID=chr19,length=58617616>
  50. ##contig=<ID=chr20,length=64444167>
  51. ##contig=<ID=chr21,length=46709983>
  52. ##contig=<ID=chr22,length=50818468>
  53. ##contig=<ID=chrX,length=156040895>
  54. '''
  55. # output file
  56. file_name = sample_name + '_mendelian_vcfInfo.vcf'
  57. outfile = open(file_name,'w')
  58. outputcolumn = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t' + sample_name + '\n'
  59. outfile.write(vcf_header)
  60. outfile.write(outputcolumn)
  61. # input files
  62. vcf_info = pd.read_table(vcfInfo)
  63. mendelian_info = pd.read_table(mendelianInfo)
  64. merged_df = pd.merge(vcf_info, mendelian_info, how='outer', left_on=['#CHROM','POS'], right_on = ['#CHROM','POS'])
  65. merged_df = merged_df.fillna('.')
  66. #
  67. def parse_INFO(info):
  68. strings = info.strip().split(';')
  69. keys = []
  70. values = []
  71. for i in strings:
  72. kv = i.split('=')
  73. if kv[0] == 'DB':
  74. keys.append('DB')
  75. values.append('1')
  76. else:
  77. keys.append(kv[0])
  78. values.append(kv[1])
  79. infoDict = dict(zip(keys, values))
  80. return infoDict
  81. #
  82. for row in merged_df.itertuples():
  83. if row.Quartet_DNA_BGI_SEQ2000_BGI_1_20180518_LCL5 != '.':
  84. # format
  85. # GT:TWINS:TRIO5:TRIO6:DP:AF:GQ:QD:MQ:FS:QUAL
  86. FORMAT_x = row.Quartet_DNA_BGI_SEQ2000_BGI_LCL5_1_20180518.split(':')
  87. ALT = int(FORMAT_x[1].split(',')[1])
  88. if int(FORMAT_x[2]) != 0:
  89. AF = round(ALT/int(FORMAT_x[2]),2)
  90. else:
  91. AF = '.'
  92. INFO_x = parse_INFO(row.INFO_x)
  93. if FORMAT_x[2] == '0':
  94. INFO_x['QD'] = '.'
  95. else:
  96. pass
  97. FORMAT = row.Quartet_DNA_BGI_SEQ2000_BGI_1_20180518_LCL5 + ':' + FORMAT_x[2] + ':' + str(AF) + ':' + FORMAT_x[3] + ':' + INFO_x['QD'] + ':' + INFO_x['MQ'] + ':' + INFO_x['FS'] + ':' + str(row.QUAL_x)
  98. # outline
  99. outline = row._1 + '\t' + str(row.POS) + '\t' + row.ID_x + '\t' + row.REF_y + '\t' + row.ALT_y + '\t' + '.' + '\t' + '.' + '\t' + '.' + '\t' + 'GT:TWINS:TRIO5:TRIO6:DP:AF:GQ:QD:MQ:FS:QUAL' + '\t' + FORMAT + '\n'
  100. else:
  101. FORMAT_x = row.Quartet_DNA_BGI_SEQ2000_BGI_LCL5_1_20180518.split(':')
  102. ALT = int(FORMAT_x[1].split(',')[1])
  103. if int(FORMAT_x[2]) != 0:
  104. AF = round(ALT/int(FORMAT_x[2]),2)
  105. else:
  106. AF = '.'
  107. INFO_x = parse_INFO(row.INFO_x)
  108. if FORMAT_x[2] == '0':
  109. INFO_x['QD'] = '.'
  110. else:
  111. pass
  112. FORMAT = '.:.:.:.' + ':' + FORMAT_x[2] + ':' + str(AF) + ':' + FORMAT_x[3] + ':' + INFO_x['QD'] + ':' + INFO_x['MQ'] + ':' + INFO_x['FS'] + ':' + str(row.QUAL_x)
  113. # outline
  114. outline = row._1 + '\t' + str(row.POS) + '\t' + row.ID_x + '\t' + row.REF_y + '\t' + row.ALT_y + '\t' + '.' + '\t' + '.' + '\t' + '.' + '\t' + 'GT:TWINS:TRIO5:TRIO6:DP:AF:GQ:QD:MQ:FS:QUAL' + '\t' + FORMAT + '\n'
  115. outfile.write(outline)