"""Build the list of VCF file pairs used for Jaccard-index calculation.

The script reads a tab-separated list file (no header) whose first column
holds VCF paths, forms every unordered pair of those paths, and writes one
tab-separated line per pair that differs in exactly one annotated factor:

    <file_a> TAB <file_b> TAB <stem_a>-<stem_b> TAB <label>

File names are split on ``_`` after removing ``.normed.vcf.gz``; fields 0 and
1 together are the sequencing technology, field 2 the sequencing site,
field 3 the sample and field 4 the replicate.
"""
import argparse
import itertools
import os
import sys
from collections import namedtuple

import pandas as pd

VCF_SUFFIX = '.normed.vcf.gz'
DEFAULT_OUTPUT = 'rtg_pairs.txt'
# Fields 0-4 of the underscore-separated file name are read below.
MIN_NAME_FIELDS = 5

# NOTE: comparisons by analysis site (name field 8) and by pipeline (name
# fields 9 and 10) were sketched in earlier revisions but are not implemented.

VcfName = namedtuple('VcfName', 'file_name stem tech site sample rep')


def parse_vcf_name(path):
    """Split a VCF path into the annotation fields encoded in its file name.

    Args:
        path: path (or bare file name) of a VCF file; surrounding whitespace
            is ignored and only the part after the last ``/`` is used.

    Returns:
        A ``VcfName`` with the file name, the file name without
        ``.normed.vcf.gz`` (``stem``), and the technology, site, sample and
        replicate fields.

    Raises:
        ValueError: if ``path`` is not a string or its file name has fewer
            than five ``_``-separated fields.
    """
    if not isinstance(path, str):
        raise ValueError('VCF path must be a string, got %r' % (path,))
    file_name = path.strip().rsplit('/', 1)[-1]
    stem = file_name.replace(VCF_SUFFIX, '')
    fields = stem.strip().split('_')
    if len(fields) < MIN_NAME_FIELDS:
        raise ValueError(
            'cannot parse VCF file name %r: expected at least %d '
            "'_'-separated fields, found %d"
            % (file_name, MIN_NAME_FIELDS, len(fields))
        )
    # Keep the two technology fields as a tuple: concatenating them would
    # make e.g. ('ab', 'c') and ('a', 'bc') compare equal.
    return VcfName(
        file_name=file_name,
        stem=stem,
        tech=(fields[0], fields[1]),
        site=fields[2],
        sample=fields[3],
        rep=fields[4],
    )


def classify_pair(a, b):
    """Return the single factor in which two parsed VCF names differ.

    Args:
        a: ``VcfName`` of the first file.
        b: ``VcfName`` of the second file.

    Returns:
        ``'sequenceTech'``, ``'sequenceSite'`` or ``'Sample'`` when exactly
        that one of technology/site/sample differs; ``'replicate'`` when all
        three match and only the replicate differs; ``None`` otherwise.
    """
    same_tech = a.tech == b.tech
    same_site = a.site == b.site
    same_sample = a.sample == b.sample

    if not same_tech and same_site and same_sample:
        return 'sequenceTech'
    if same_tech and not same_site and same_sample:
        return 'sequenceSite'
    if same_tech and same_site and not same_sample:
        return 'Sample'
    if same_tech and same_site and same_sample and a.rep != b.rep:
        return 'replicate'
    return None


def build_pair_lines(paths):
    """Build the output lines for every annotated pair among ``paths``.

    Args:
        paths: iterable of VCF paths, in the order they should be paired.

    Returns:
        A list of newline-terminated, tab-separated lines, in
        ``itertools.combinations`` order, one per pair that
        ``classify_pair`` labels.

    Raises:
        ValueError: if two or more paths are given and any of them cannot
            be parsed by ``parse_vcf_name``.
    """
    paths = list(paths)
    # With fewer than two entries there are no pairs, so nothing is parsed.
    if len(paths) < 2:
        return []

    # Parse each name once instead of once per pair it takes part in.
    parsed = [parse_vcf_name(path) for path in paths]

    lines = []
    for a, b in itertools.combinations(parsed, 2):
        label = classify_pair(a, b)
        if label is None:
            continue
        folder = a.stem + '-' + b.stem
        lines.append('\t'.join((a.file_name, b.file_name, folder, label)) + '\n')
    return lines


def main(argv=None):
    """Parse the command line, read the VCF list and write the pair file.

    Args:
        argv: argument list to parse; ``None`` means ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(description="this script is to preform combination on vcf files, get any two of them for next jaccard index calculation")
    parser.add_argument('-vcf', '--vcffile', type=str, help='input vcf file', required=True)
    parser.add_argument('--output', type=str, default=DEFAULT_OUTPUT,
                        help='output pair file (default: %(default)s)')
    args = parser.parse_args(argv)

    table = pd.read_csv(args.vcffile, header=None, sep='\t')
    # All lines are built before the output is opened, so a malformed name
    # does not leave a truncated or half-written pair file behind.
    lines = build_pair_lines(table[0])

    with open(args.output, 'w') as outfile:
        outfile.writelines(lines)


if __name__ == '__main__':
    main()