"""Pair up replicate VCF files for a downstream Jaccard-index calculation.

The input is a headerless, tab-separated table: column 0 holds a VCF
location and column 1 holds a sample name made of '_'-separated fields.
Fields 0 and 1 together identify the sequencing technology, field 2 the
sequencing site, field 3 the sample and field 4 the replicate.

Every pair of rows that shares technology, site and sample but differs in
replicate is written as one line ``<locationA>\\t<locationB>\\t<nameA>-<nameB>``.
"""
import argparse
import itertools
import os
import sys

import pandas as pd

# A sample name must provide at least: tech (2 fields), site, sample, replicate.
MIN_NAME_FIELDS = 5

DEFAULT_OUTPUT = 'rtg_pairs.txt'


def parse_sample_name(name):
    """Split a sample name into its grouping key and replicate id.

    Args:
        name: sample name with at least ``MIN_NAME_FIELDS`` '_'-separated
            fields; it is converted with ``str()`` before splitting.

    Returns:
        A ``(group, replicate)`` tuple where ``group`` is
        ``(tech_part1, tech_part2, site, sample)`` and ``replicate`` is
        field 4 of the name.

    Raises:
        ValueError: if the name has fewer than ``MIN_NAME_FIELDS`` fields.
    """
    fields = str(name).split('_')
    if len(fields) < MIN_NAME_FIELDS:
        raise ValueError(
            "sample name {!r} has {} '_'-separated field(s); at least {} are "
            "required".format(name, len(fields), MIN_NAME_FIELDS)
        )
    # Keep the two technology fields separate: joining them without a
    # delimiter would make e.g. "ab_c" and "a_bc" look like the same tech.
    return (fields[0], fields[1], fields[2], fields[3]), fields[4]


def replicate_pairs(locations, names):
    """Build the output lines for every valid replicate pair.

    Args:
        locations: iterable of VCF locations, one per row.
        names: iterable of sample names, aligned with ``locations``.

    Returns:
        A list of newline-terminated strings
        ``<locationA>\\t<locationB>\\t<nameA>-<nameB>``, in the order
        produced by ``itertools.combinations`` over the rows.

    Raises:
        ValueError: if any sample name cannot be parsed.
    """
    # Parse each row once instead of re-splitting the names for every pair.
    rows = []
    for location, name in zip(locations, names):
        group, replicate = parse_sample_name(name)
        rows.append((str(location), str(name), group, replicate))

    lines = []
    for first, second in itertools.combinations(rows, 2):
        loc_a, name_a, group_a, rep_a = first
        loc_b, name_b, group_b, rep_b = second
        if group_a == group_b and rep_a != rep_b:
            folder = name_a + '-' + name_b
            lines.append('\t'.join((loc_a, loc_b, folder)) + '\n')
    return lines


def main(argv=None):
    """Parse the command line, read the table and write the pair file.

    Args:
        argv: optional argument list; ``None`` makes argparse use
            ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(description="this script is to preform combination on vcf files, get any two of three replicates for next jaccard index calculation")
    parser.add_argument('-vcf', '--vcffile', type=str, help='input vcf file', required=True)
    parser.add_argument('-o', '--output', type=str, default=DEFAULT_OUTPUT,
                        help='output file for the replicate pairs (default: %(default)s)')
    args = parser.parse_args(argv)

    # dtype=str keeps locations and names exactly as written in the file
    # instead of letting pandas infer numeric types for them.
    table = pd.read_csv(args.vcffile, header=None, sep='\t', dtype=str)
    lines = replicate_pairs(table[0], table[1])

    # The context manager guarantees the file is closed even on error.
    with open(args.output, 'w') as outfile:
        outfile.writelines(lines)


if __name__ == '__main__':
    main()