|
- from __future__ import division
- import pandas as pd
- import sys, argparse, os
-
-
# input arguments
def parse_args(argv=None):
    """Parse the command-line options of this script.

    Args:
        argv: Optional list of argument strings; when None, argparse reads
            them from ``sys.argv``.

    Returns:
        The argparse namespace with ``mergedGVCF`` (input table path) and
        ``prefix`` (output file prefix).
    """
    parser = argparse.ArgumentParser(description="this script is to calculate jaccard index")
    parser.add_argument('-i', '--mergedGVCF', type=str, help='merged gVCF txt with only chr, pos, ref, alt and genotypes', required=True)
    parser.add_argument('-prefix', '--prefix', type=str, help='prefix of output file', required=True)
    return parser.parse_args(argv)


# Keys containing either of these substrings are discarded before comparison
# (the original inline comment: remove './.' and '0/0').
EXCLUDED_GENOTYPES = ('./.', '0/0')


def called_genotype_sets(dat):
    """Build, for every sample column, the set of retained site/genotype keys.

    A key is ``<column 0>_<column 1>_<sample value>`` with every part cast to
    ``str``.  Columns from index 2 onward are each treated as one sample.

    NOTE(review): the ``-i`` help text mentions ref and alt columns, but any
    column at index 2 or later is handled as a sample here -- confirm the
    expected input layout.

    Args:
        dat: DataFrame as read from the merged table.

    Returns:
        A list of sets, one per sample column, in column order.
    """
    if dat.shape[1] <= 2:
        # No sample columns: nothing to compare.
        return []

    # The site part of the key is identical for every sample, so build it once.
    site = dat.iloc[:, 0].astype(str) + '_' + dat.iloc[:, 1].astype(str) + '_'

    key_sets = []
    for col in range(2, dat.shape[1]):
        keys = site + dat.iloc[:, col].astype(str)
        # The substring test runs on the whole key, exactly as before, so a
        # key is dropped whenever './.' or '0/0' occurs anywhere in it.
        key_sets.append({
            key for key in keys
            if not any(gt in key for gt in EXCLUDED_GENOTYPES)
        })
    return key_sets


def pairwise_counts(dat):
    """Count shared and combined site/genotype keys for every sample pair.

    Prints the 1-based number of each sample as it is processed.

    Args:
        dat: DataFrame as read from the merged table.

    Returns:
        A tuple ``(inter_number, union_number)`` of square DataFrames whose
        index and columns are the sample column names.  Cell ``[a, b]`` holds
        the size of the intersection (respectively union) of the key sets of
        samples ``a`` and ``b``.
    """
    sample_names = dat.columns[2:dat.shape[1]]
    key_sets = called_genotype_sets(dat)
    sample_size = len(key_sets)

    inter_rows = [[0] * sample_size for _ in range(sample_size)]
    union_rows = [[0] * sample_size for _ in range(sample_size)]

    for i, one in enumerate(key_sets):
        print(i + 1)
        # Both matrices are symmetric, so compute the upper triangle only
        # and mirror each value.
        for j in range(i, sample_size):
            another = key_sets[j]
            shared = len(one & another)
            combined = len(one) + len(another) - shared
            inter_rows[i][j] = inter_rows[j][i] = shared
            union_rows[i][j] = union_rows[j][i] = combined

    inter_number = pd.DataFrame(inter_rows, index=sample_names, columns=sample_names)
    union_number = pd.DataFrame(union_rows, index=sample_names, columns=sample_names)
    return inter_number, union_number


def main(argv=None):
    """Read the merged table and write the intersection and union matrices.

    Writes ``<prefix>.inter.txt`` and ``<prefix>.union.txt`` as tab-separated
    tables.
    """
    args = parse_args(argv)

    # output file
    output_inter_name = args.prefix + '.inter.txt'
    output_union_name = args.prefix + '.union.txt'

    # input files
    dat = pd.read_table(args.mergedGVCF)

    inter_number, union_number = pairwise_counts(dat)

    inter_number.to_csv(output_inter_name, sep='\t')
    union_number.to_csv(output_union_name, sep='\t')


if __name__ == '__main__':
    main()
|