You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

merge_two_family.py 1.9KB

4 yıl önce
1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071
  1. from __future__ import division
  2. import pandas as pd
  3. import sys, argparse, os
  4. import fileinput
  5. import re
  6. # input arguments
  7. parser = argparse.ArgumentParser(description="this script is to extract mendelian concordance information")
  8. parser.add_argument('-LCL5', '--LCL5', type=str, help='LCL5 family info', required=True)
  9. parser.add_argument('-LCL6', '--LCL6', type=str, help='LCL6 family info', required=True)
  10. parser.add_argument('-family', '--family', type=str, help='family name', required=True)
  11. args = parser.parse_args()
  12. lcl5 = args.LCL5
  13. lcl6 = args.LCL6
  14. family = args.family
  15. # output file
  16. family_name = family + '.txt'
  17. family_file = open(family_name,'w')
  18. # input files
  19. lcl5_dat = pd.read_table(lcl5)
  20. lcl6_dat = pd.read_table(lcl6)
  21. merged_df = pd.merge(lcl5_dat, lcl6_dat, how='outer', left_on=['#CHROM','POS'], right_on = ['#CHROM','POS'])
  22. def alt_seq(alt, genotype):
  23. if genotype == './.':
  24. seq = './.'
  25. elif genotype == '0/0':
  26. seq = '0/0'
  27. else:
  28. alt = alt.split(',')
  29. genotype = genotype.split('/')
  30. if genotype[0] == '0':
  31. allele2 = alt[int(genotype[1]) - 1]
  32. seq = '0/' + allele2
  33. else:
  34. allele1 = alt[int(genotype[0]) - 1]
  35. allele2 = alt[int(genotype[1]) - 1]
  36. seq = allele1 + '/' + allele2
  37. return seq
  38. for row in merged_df.itertuples():
  39. # correction of multiallele
  40. if pd.isnull(row.INFO_x) == True or pd.isnull(row.INFO_y) == True:
  41. mendelian = '.'
  42. else:
  43. lcl5_seq = alt_seq(row.ALT_x, row.CHILD_x)
  44. lcl6_seq = alt_seq(row.ALT_y, row.CHILD_y)
  45. if lcl5_seq == lcl6_seq:
  46. mendelian = '1'
  47. else:
  48. mendelian = '0'
  49. if pd.isnull(row.INFO_x) == True:
  50. mendelian = mendelian + ':.'
  51. else:
  52. mendelian = mendelian + ':' + row.INFO_x.split('=')[1]
  53. if pd.isnull(row.INFO_y) == True:
  54. mendelian = mendelian + ':.'
  55. else:
  56. mendelian = mendelian + ':' + row.INFO_y.split('=')[1]
  57. outline = row._1 + '\t' + str(row.POS) + '\t' + mendelian + '\n'
  58. family_file.write(outline)