You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

54 lines
1.9KB

  1. from __future__ import division
  2. import pandas as pd
  3. import sys, argparse, os
  4. # input arguments
  5. parser = argparse.ArgumentParser(description="this script is to calculate jaccard index")
  6. parser.add_argument('-i', '--mergedGVCF', type=str, help='merged gVCF txt with only chr, pos, ref, alt and genotypes', required=True)
  7. parser.add_argument('-prefix', '--prefix', type=str, help='prefix of output file', required=True)
  8. args = parser.parse_args()
  9. input_dat = args.mergedGVCF
  10. prefix = args.prefix
  11. # output file
  12. output_inter_name = prefix + '.inter.txt'
  13. output_union_name = prefix + '.union.txt'
  14. # input files
  15. dat = pd.read_table(input_dat)
  16. # output files
  17. sample_size = dat.shape[1]-2
  18. inter_number = pd.DataFrame(index=range(sample_size),columns=range(sample_size))
  19. union_number = pd.DataFrame(index=range(sample_size),columns=range(sample_size))
  20. for i in range(sample_size):
  21. oneSNV_GT = dat.iloc[:,0].astype(str) + '_' + dat.iloc[:,1].astype(str) + '_' + dat.iloc[:,i+2].astype(str)
  22. print(i+1)
  23. for j in range(sample_size):
  24. anotherSNV_GT = dat.iloc[:,0].astype(str) + '_' + dat.iloc[:,1].astype(str) + '_' + dat.iloc[:,j+2].astype(str)
  25. #remove './.' and '0/0'
  26. oneSNV_GT = [e for e in oneSNV_GT if './.' not in e]
  27. oneSNV_GT = [e for e in oneSNV_GT if '0/0' not in e]
  28. anotherSNV_GT = [e for e in anotherSNV_GT if './.' not in e]
  29. anotherSNV_GT = [e for e in anotherSNV_GT if '0/0' not in e]
  30. inter=set(oneSNV_GT).intersection(set(anotherSNV_GT))
  31. union=set(oneSNV_GT).union(set(anotherSNV_GT))
  32. inter_number.iloc[i,j] = len(inter)
  33. union_number.iloc[i,j] = len(union)
  34. inter_number.columns = dat.columns[2:dat.shape[1]]
  35. inter_number.index = dat.columns[2:dat.shape[1]]
  36. union_number.columns = dat.columns[2:dat.shape[1]]
  37. union_number.index = dat.columns[2:dat.shape[1]]
  38. inter_number.to_csv(output_inter_name,sep='\t')
  39. union_number.to_csv(output_union_name,sep='\t')