Вы не можете выбрать более 25 тем. Темы должны начинаться с буквы или цифры, могут содержать дефисы (-) и должны содержать не более 35 символов.

43 lines
2.2KB

  1. from __future__ import division
  2. import sys, argparse, os
  3. import pandas as pd
  4. from collections import Counter
  5. # input arguments
  6. parser = argparse.ArgumentParser(description="this script is to merge mendelian and vcfinfo, and extract high_confidence_calls")
  7. parser.add_argument('-vcf', '--vcf', type=str, help='merged multiple sample vcf', required=True)
  8. args = parser.parse_args()
  9. vcf = args.vcf
  10. lcl5_outfile = open('LCL5_all_variants.txt','w')
  11. filtered_outfile = open('LCL5_filtered_variants.txt','w')
  12. vcf_dat = pd.read_table(vcf)
  13. for row in vcf_dat.itertuples():
  14. lcl5_list = [row.Quartet_DNA_BGI_SEQ2000_BGI_LCL5_1_20180518,row.Quartet_DNA_BGI_SEQ2000_BGI_LCL5_2_20180530,row.Quartet_DNA_BGI_SEQ2000_BGI_LCL5_3_20180530, \
  15. row.Quartet_DNA_BGI_T7_WGE_LCL5_1_20191105,row.Quartet_DNA_BGI_T7_WGE_LCL5_2_20191105,row.Quartet_DNA_BGI_T7_WGE_LCL5_3_20191105, \
  16. row.Quartet_DNA_ILM_Nova_ARD_LCL5_1_20181108,row.Quartet_DNA_ILM_Nova_ARD_LCL5_2_20181108,row.Quartet_DNA_ILM_Nova_ARD_LCL5_3_20181108, \
  17. row.Quartet_DNA_ILM_Nova_ARD_LCL5_4_20190111,row.Quartet_DNA_ILM_Nova_ARD_LCL5_5_20190111,row.Quartet_DNA_ILM_Nova_ARD_LCL5_6_20190111, \
  18. row.Quartet_DNA_ILM_Nova_BRG_LCL5_1_20180930,row.Quartet_DNA_ILM_Nova_BRG_LCL5_2_20180930,row.Quartet_DNA_ILM_Nova_BRG_LCL5_3_20180930, \
  19. row.Quartet_DNA_ILM_Nova_WUX_LCL5_1_20190917,row.Quartet_DNA_ILM_Nova_WUX_LCL5_2_20190917,row.Quartet_DNA_ILM_Nova_WUX_LCL5_3_20190917, \
  20. row.Quartet_DNA_ILM_XTen_ARD_LCL5_1_20170403,row.Quartet_DNA_ILM_XTen_ARD_LCL5_2_20170403,row.Quartet_DNA_ILM_XTen_ARD_LCL5_3_20170403, \
  21. row.Quartet_DNA_ILM_XTen_NVG_LCL5_1_20170329,row.Quartet_DNA_ILM_XTen_NVG_LCL5_2_20170329,row.Quartet_DNA_ILM_XTen_NVG_LCL5_3_20170329, \
  22. row.Quartet_DNA_ILM_XTen_WUX_LCL5_1_20170216,row.Quartet_DNA_ILM_XTen_WUX_LCL5_2_20170216,row.Quartet_DNA_ILM_XTen_WUX_LCL5_3_20170216]
  23. lcl5_vcf_gt = [x.split(':')[0] for x in lcl5_list]
  24. lcl5_gt=[item.replace('./.', '0/0') for item in lcl5_vcf_gt]
  25. gt_dict = Counter(lcl5_gt)
  26. highest_gt = gt_dict.most_common(1)
  27. candidate_gt = highest_gt[0][0]
  28. freq_gt = highest_gt[0][1]
  29. output = row._1 + '\t' + str(row.POS) + '\t' + '\t'.join(lcl5_gt) + '\n'
  30. if (candidate_gt == '0/0') and (freq_gt == 27):
  31. filtered_outfile.write(output)
  32. else:
  33. lcl5_outfile.write(output)