@@ -0,0 +1,68 @@
import sys, getopt
import os
import re
import fileinput

def usage():
    print(
    """
    Usage: python bed_for_bamReadcount.py -i input_vcf_file -o prefix
    This script converts the indels in a VCF file into a 1-based region file for bam-readcount.
    Please note that bam-readcount only takes 1-based coordinates.
    Input:
    -i a VCF file
    Output:
    -o prefix for the outputs: <prefix>.bed for bam-readcount and <prefix>_MNP.txt for the skipped records
    """)

# convert indel records into bam-readcount regions
def process(oneLine):
    m = re.match(r'^#', oneLine)
    if m is not None:
        pass
    else:
        line = oneLine.rstrip()
        strings = line.strip().split('\t')
        # convert the VCF position into a 1-based region for bam-readcount
        # deletion: the first deleted base is one position after the VCF POS
        if len(strings[3]) > 1 and len(strings[4]) == 1:
            pos = int(strings[1]) + 1
            outline = strings[0] + '\t' + str(pos) + '\t' + str(pos) + '\t' + strings[3] + '\t' + strings[4] + '\n'
            outINDEL.write(outline)
        # insertion: the inserted bases are anchored at the VCF POS
        elif len(strings[3]) == 1 and len(strings[4]) > 1 and (',' not in strings[4]):
            outline = strings[0] + '\t' + strings[1] + '\t' + strings[1] + '\t' + strings[3] + '\t' + strings[4] + '\n'
            outINDEL.write(outline)
        # anything else (multi-allelic ALTs, SNVs, MNPs) goes to the separate MNP file
        else:
            outMNP.write(oneLine)

opts, args = getopt.getopt(sys.argv[1:], "hi:o:")
for op, value in opts:
    if op == "-i":
        inputFile = value
    elif op == "-o":
        prefix = value
    elif op == "-h":
        usage()
        sys.exit()
if len(sys.argv[1:]) < 3:
    usage()
    sys.exit()

INDELname = prefix + '.bed'
MNPname = prefix + '_MNP.txt'
outINDEL = open(INDELname, 'w')
outMNP = open(MNPname, 'w')
for line in fileinput.input(inputFile):
    process(line)
outINDEL.close()
outMNP.close()
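The conversion rules above are easiest to see on a couple of made-up records; a minimal sketch (the helper name indel_to_region is hypothetical and not part of the script):

# Minimal sketch (hypothetical records, illustration only): how VCF indels map
# to the 1-based regions bam-readcount expects.
def indel_to_region(chrom, pos, ref, alt):
    if len(ref) > 1 and len(alt) == 1:                      # deletion: first deleted base is POS + 1
        p = int(pos) + 1
        return (chrom, p, p, ref, alt)
    if len(ref) == 1 and len(alt) > 1 and ',' not in alt:   # insertion: anchored at the VCF POS
        return (chrom, int(pos), int(pos), ref, alt)
    return None                                             # anything else goes to <prefix>_MNP.txt

print(indel_to_region('chr1', 100, 'ATG', 'A'))   # ('chr1', 101, 101, 'ATG', 'A')
print(indel_to_region('chr1', 200, 'T', 'TAC'))   # ('chr1', 200, 200, 'T', 'TAC')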
@@ -0,0 +1,96 @@
import sys, getopt
import os
import re
import fileinput
import pandas as pd

def usage():
    print(
    """
    Usage: python extract_vcf_information.py -i input_merged_vcf_file -o parsed_file
    This script extracts SNV and indel information from a VCF file and outputs a tab-delimited file.
    Input:
    -i the selected VCF file
    Output:
    -o tab-delimited parsed file
    """)

# parse one VCF data line into a flat dictionary
def process(oneLine):
    line = oneLine.rstrip()
    strings = line.strip().split('\t')
    infoParsed = parse_INFO(strings[7])
    formatKeys = strings[8].split(':')
    formatValues = strings[9].split(':')
    # the last FORMAT field (typically PL) is not parsed
    for i in range(0, len(formatKeys) - 1):
        if formatKeys[i] == 'AD':
            ra = formatValues[i].split(',')
            infoParsed['RefDP'] = ra[0]
            infoParsed['AltDP'] = ra[1]
            if (int(ra[1]) + int(ra[0])) != 0:
                infoParsed['af'] = float(int(ra[1]) / (int(ra[1]) + int(ra[0])))
            else:
                pass
        else:
            infoParsed[formatKeys[i]] = formatValues[i]
    infoParsed['chromo'] = strings[0]
    infoParsed['pos'] = strings[1]
    infoParsed['id'] = strings[2]
    infoParsed['ref'] = strings[3]
    infoParsed['alt'] = strings[4]
    infoParsed['qual'] = strings[5]
    return infoParsed

# parse the INFO column; keep DB as a 0/1 flag and drop AF and the VQSR training-site flags
def parse_INFO(info):
    strings = info.strip().split(';')
    keys = []
    values = []
    for i in strings:
        kv = i.split('=')
        if kv[0] == 'DB':
            keys.append('DB')
            values.append('1')
        elif kv[0] == 'AF':
            pass
        elif kv[0] == 'POSITIVE_TRAIN_SITE':
            pass
        elif kv[0] == 'NEGATIVE_TRAIN_SITE':
            pass
        else:
            keys.append(kv[0])
            values.append(kv[1])
    infoDict = dict(zip(keys, values))
    return infoDict

opts, args = getopt.getopt(sys.argv[1:], "hi:o:")
for op, value in opts:
    if op == "-i":
        inputFile = value
    elif op == "-o":
        outputFile = value
    elif op == "-h":
        usage()
        sys.exit()
if len(sys.argv[1:]) < 3:
    usage()
    sys.exit()

allDict = []
for line in fileinput.input(inputFile):
    m = re.match(r'^#', line)
    if m is not None:
        pass
    else:
        oneDict = process(line)
        allDict.append(oneDict)
allTable = pd.DataFrame(allDict)
allTable.to_csv(outputFile, sep='\t', index=False)
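A minimal sketch (hypothetical GATK-style INFO string, not real data) of the key/value handling that parse_INFO applies to the INFO column:

# Simplified illustration of the INFO parsing above: DB becomes a 0/1 flag,
# AF and the VQSR training-site flags are dropped, everything else is kept.
info = 'DB;DP=35;FS=2.1;MQ=60.0;MQRankSum=0.2;QD=12.3;AF=0.5;POSITIVE_TRAIN_SITE'
parsed = {}
for field in info.split(';'):
    kv = field.split('=')
    if kv[0] == 'DB':
        parsed['DB'] = '1'                                          # dbSNP membership flag stored as 1
    elif kv[0] in ('AF', 'POSITIVE_TRAIN_SITE', 'NEGATIVE_TRAIN_SITE'):
        continue                                                    # dropped, as in parse_INFO
    else:
        parsed[kv[0]] = kv[1]
print(parsed)  # {'DB': '1', 'DP': '35', 'FS': '2.1', 'MQ': '60.0', 'MQRankSum': '0.2', 'QD': '12.3'}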
@@ -0,0 +1,114 @@ | |||
# import modules | |||
import numpy as np | |||
import pandas as pd | |||
from sklearn import svm | |||
from sklearn import preprocessing | |||
import sys, argparse, os | |||
from vcf2bed import position_to_bed,padding_region | |||
parser = argparse.ArgumentParser(description="this script is to preform one calss svm on each chromosome") | |||
parser.add_argument('-train', '--trainDataset', type=str, help='training dataset generated from extracting vcf information part, with mutaitons supported by callsets', required=True) | |||
parser.add_argument('-test', '--testDataset', type=str, help='testing dataset generated from extracting vcf information part, with mutaitons not called by all callsets', required=True) | |||
parser.add_argument('-name', '--sampleName', type=str, help='sample name for output file name', required=True) | |||
parser.add_argument('-kernel', '--SVMkernel', type=str, help='kernel you choose to perform one class svm', required=True) | |||
parser.add_argument('-nu', '--SVMnu', type=float, help='An upper bound on the fraction of training errors and a lower bound of the fraction of support vectors. Should be in the interval (0, 1]', required=True) | |||
args = parser.parse_args() | |||
# Rename input: | |||
train_input = args.trainDataset | |||
test_input = args.testDataset | |||
sample_name = args.sampleName | |||
kernel = args.SVMkernel | |||
nu = args.SVMnu | |||
# default columns, which will be included in the included in the calssifier | |||
chromosome = ['chr1','chr2','chr3','chr4','chr5','chr6','chr7','chr8','chr9','chr10','chr11','chr12','chr13','chr14','chr15' ,'chr16','chr17','chr18','chr19','chr20','chr21','chr22','chrX'] | |||
feature_heter_cols = ['AltDP','BaseQRankSum','DB','DP','FS','GQ','MQ','MQRankSum','QD','ReadPosRankSum','RefDP','SOR','af'] | |||
feature_homo_cols = ['AltDP','DB','DP','FS','GQ','MQ','QD','RefDP','SOR','af'] | |||
# import datasets sepearate the records with or without BaseQRankSum annotation, etc. | |||
def load_dat(dat_file_name): | |||
dat = pd.read_table(dat_file_name) | |||
dat['DB'] = dat['DB'].fillna(0) | |||
dat = dat[dat['DP'] != 0] | |||
dat['af'] = dat['AltDP']/(dat['AltDP'] + dat['RefDP']) | |||
homo_rows = dat[dat['BaseQRankSum'].isnull()] | |||
heter_rows = dat[dat['BaseQRankSum'].notnull()] | |||
return homo_rows,heter_rows | |||
train_homo,train_heter = load_dat(train_input) | |||
test_homo,test_heter = load_dat(test_input) | |||
clf = svm.OneClassSVM(nu=nu,kernel=kernel, gamma='auto_deprecated',cache_size=500) | |||
def prepare_dat(train_dat,test_dat,feature_cols,chromo): | |||
chr_train = train_dat[train_dat['chromo'] == chromo] | |||
chr_test = test_dat[test_dat['chromo'] == chromo] | |||
train_dat = chr_train.loc[:,feature_cols] | |||
test_dat = chr_test.loc[:,feature_cols] | |||
train_dat_scaled = preprocessing.scale(train_dat) | |||
test_dat_scaled = preprocessing.scale(test_dat) | |||
return chr_test,train_dat_scaled,test_dat_scaled | |||
def oneclass(X_train,X_test,chr_test): | |||
clf.fit(X_train) | |||
y_pred_test = clf.predict(X_test) | |||
test_true_dat = chr_test[y_pred_test == 1] | |||
test_false_dat = chr_test[y_pred_test == -1] | |||
return test_true_dat,test_false_dat | |||
predicted_true = pd.DataFrame(columns=train_homo.columns) | |||
predicted_false = pd.DataFrame(columns=train_homo.columns) | |||
for chromo in chromosome: | |||
# homo datasets | |||
chr_test_homo,X_train_homo,X_test_homo = prepare_dat(train_homo,test_homo,feature_homo_cols,chromo) | |||
test_true_homo,test_false_homo = oneclass(X_train_homo,X_test_homo,chr_test_homo) | |||
predicted_true = predicted_true.append(test_true_homo) | |||
predicted_false = predicted_false.append(test_false_homo) | |||
# heter datasets | |||
chr_test_heter,X_train_heter,X_test_heter = prepare_dat(train_heter,test_heter,feature_heter_cols,chromo) | |||
test_true_heter,test_false_heter = oneclass(X_train_heter,X_test_heter,chr_test_heter) | |||
predicted_true = predicted_true.append(test_true_heter) | |||
predicted_false = predicted_false.append(test_false_heter) | |||
predicted_true_filename = sample_name + '_predicted_true.txt' | |||
predicted_false_filename = sample_name + '_predicted_false.txt' | |||
predicted_true.to_csv(predicted_true_filename,sep='\t',index=False) | |||
predicted_false.to_csv(predicted_false_filename,sep='\t',index=False) | |||
# output the bed file and padding bed region 50bp | |||
predicted_true_bed_filename = sample_name + '_predicted_true.bed' | |||
predicted_false_bed_filename = sample_name + '_predicted_false.bed' | |||
padding_filename = sample_name + '_padding.bed' | |||
predicted_true_bed = open(predicted_true_bed_filename,'w') | |||
predicted_false_bed = open(predicted_false_bed_filename,'w') | |||
padding = open(padding_filename,'w') | |||
# | |||
for index,row in predicted_false.iterrows(): | |||
chromo,pos1,pos2 = position_to_bed(row['chromo'],row['pos'],row['ref'],row['alt']) | |||
outline_pos = chromo + '\t' + str(pos1) + '\t' + str(pos2) + '\n' | |||
predicted_false_bed.write(outline_pos) | |||
chromo,pad_pos1,pad_pos2,pad_pos3,pad_pos4 = padding_region(chromo,pos1,pos2,50) | |||
outline_pad_1 = chromo + '\t' + str(pad_pos1) + '\t' + str(pad_pos2) + '\n' | |||
outline_pad_2 = chromo + '\t' + str(pad_pos3) + '\t' + str(pad_pos4) + '\n' | |||
padding.write(outline_pad_1) | |||
padding.write(outline_pad_2) | |||
for index,row in predicted_true.iterrows(): | |||
chromo,pos1,pos2 = position_to_bed(row['chromo'],row['pos'],row['ref'],row['alt']) | |||
outline_pos = chromo + '\t' + str(pos1) + '\t' + str(pos2) + '\n' | |||
predicted_true_bed.write(outline_pos) | |||
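A self-contained sketch of the per-chromosome fit/predict pattern used above, on synthetic numbers (the feature values are made up; the point is only that OneClassSVM is trained on the "supported" variants and labels test rows +1 for inlier or -1 for outlier):

# Synthetic illustration of the one-class SVM step; not the pipeline's real features.
import numpy as np
from sklearn import svm, preprocessing

rng = np.random.RandomState(0)
X_train = preprocessing.scale(rng.normal(loc=30, scale=5, size=(200, 3)))     # e.g. DP, GQ, af
X_test = preprocessing.scale(np.vstack([rng.normal(30, 5, (20, 3)),
                                        rng.normal(5, 1, (5, 3))]))           # last 5 rows look unusual

clf = svm.OneClassSVM(nu=0.05, kernel='rbf', gamma='auto')
clf.fit(X_train)                  # train on "supported" variants only
print(clf.predict(X_test))        # array of +1 / -1 labels; -1 rows end up in *_predicted_false.*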
@@ -0,0 +1,62 @@
import sys, getopt
import os
import re
import fileinput

def usage():
    print(
    """
    Usage: python select_small_variants_supported_by_all_callsets.py -i input_merged_vcf_file -o prefix
    This script selects SNPs and Indels supported by all callsets.
    Input:
    -i a merged vcf file
    Output:
    -o prefix for the output VCFs: <prefix>.vcf with the selected SNPs and Indels and <prefix>_outlier.vcf with the rest
    """)

# select small variants supported by all callsets
def process(oneLine):
    m = re.match(r'^#', oneLine)
    if m is not None:
        # header lines go to both output files
        outVCF.write(oneLine)
        outOutlier.write(oneLine)
    else:
        line = oneLine.rstrip()
        strings = line.strip().split('\t')
        # genotype (GT) reported by each callset column
        gt = [i.split(':', 1)[0] for i in strings[9:len(strings)]]
        if all(e == gt[0] for e in gt) and (gt[0] != '.'):
            # all callsets report the same, non-missing genotype: keep the record
            outVCF.write(oneLine)
        else:
            outOutlier.write(oneLine)

opts, args = getopt.getopt(sys.argv[1:], "hi:o:")
for op, value in opts:
    if op == "-i":
        inputFile = value
    elif op == "-o":
        prefix = value
    elif op == "-h":
        usage()
        sys.exit()
if len(sys.argv[1:]) < 3:
    usage()
    sys.exit()

VCFname = prefix + '.vcf'
OUTname = prefix + '_outlier.vcf'
outVCF = open(VCFname, 'w')
outOutlier = open(OUTname, 'w')
for line in fileinput.input(inputFile):
    process(line)
outVCF.close()
outOutlier.close()
@@ -0,0 +1,36 @@
import re

def position_to_bed(chromo, pos, ref, alt):
    # SNV:
    # BED start coordinate = VCF start coordinate - 1
    # BED end coordinate   = VCF start coordinate
    if len(ref) == 1 and len(alt) == 1:
        StartPos = int(pos) - 1
        EndPos = int(pos)
    # deletions:
    # BED start coordinate = VCF start coordinate - 1
    # BED end coordinate   = VCF start coordinate + (reference length - alternate length)
    elif len(ref) > 1 and len(alt) == 1:
        StartPos = int(pos) - 1
        EndPos = int(pos) + (len(ref) - 1)
    # insertions:
    # BED start coordinate = VCF start coordinate - 1
    # BED end coordinate   = VCF start coordinate + (alternate length - reference length)
    else:
        StartPos = int(pos) - 1
        EndPos = int(pos) + (len(alt) - 1)
    return chromo, StartPos, EndPos

def padding_region(chromo, pos1, pos2, padding):
    # flanking windows of `padding` bases immediately before and after the interval
    StartPos1 = pos1 - padding
    EndPos1 = pos1
    StartPos2 = pos2
    EndPos2 = pos2 + padding
    return chromo, StartPos1, EndPos1, StartPos2, EndPos2
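A quick usage sketch of the two helpers with hypothetical variants (VCF positions are 1-based; BED intervals are 0-based, half-open):

# Illustration only; the variant positions are made up.
from vcf2bed import position_to_bed, padding_region

print(position_to_bed('chr1', 100, 'A', 'G'))     # ('chr1', 99, 100)  SNV
print(position_to_bed('chr1', 100, 'ATG', 'A'))   # ('chr1', 99, 102)  2-base deletion
print(position_to_bed('chr1', 100, 'T', 'TAC'))   # ('chr1', 99, 102)  2-base insertion
print(padding_region('chr1', 99, 102, 50))        # ('chr1', 49, 99, 102, 152)  50 bp flanks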
@@ -0,0 +1,24 @@
{
  "{{ project_name }}.giab_snv_idx": "oss://pgx-result/renluyao/model_validation/NIST.snv.vcf.gz.tbi",
  "{{ project_name }}.vcfeval.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-hap:latest",
  "{{ project_name }}.fasta": "GRCh38.d1.vd1.fa",
  "{{ project_name }}.snv_train": "{{ snv_train }}",
  "{{ project_name }}.giab_snv": "oss://pgx-result/renluyao/model_validation/NIST.snv.vcf.gz",
  "{{ project_name }}.disk_size": "100",
  "{{ project_name }}.nu": "{{ nu }}",
  "{{ project_name }}.ExtractVCFinfo.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/model_validation:v1.0",
  "{{ project_name }}.giab_indel": "oss://pgx-result/renluyao/model_validation/NIST.indel.vcf.gz",
  "{{ project_name }}.SepSnvIndel.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
  "{{ project_name }}.indel_train": "{{ indel_train }}",
  "{{ project_name }}.giab_bed": "oss://pgx-result/renluyao/model_validation/HG001_GRCh38_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_highconf_nosomaticdel_noCENorHET7.bed",
  "{{ project_name }}.oneClass.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/model_validation:v1.0",
  "{{ project_name }}.indelNorm.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/bcftools:v1.9",
  "{{ project_name }}.SepTrueFalse.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
  "{{ project_name }}.kernel": "{{ kernel }}",
  "{{ project_name }}.sdf": "oss://chinese-quartet/quartet-storage-data/reference_data/GRCh38.d1.vd1.sdf/",
  "{{ project_name }}.cluster_config": "OnDemand bcs.a2.large img-ubuntu-vpc",
  "{{ project_name }}.nist_sample_name": "{{ nist_sample_name }}",
  "{{ project_name }}.nist_vcf": "{{ nist_vcf }}",
  "{{ project_name }}.giab_indel_idx": "oss://pgx-result/renluyao/model_validation/NIST.indel.vcf.gz.tbi",
  "{{ project_name }}.ref_dir": "oss://chinese-quartet/quartet-storage-data/reference_data/"
}
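The "{{ ... }}" placeholders suggest this inputs file is a template rendered before each run; a minimal sketch of one way to fill it, assuming Jinja2 as the templating engine (the actual submission tooling, file name, and values are not shown in this excerpt):

# Hypothetical rendering step; assumes Jinja2 and made-up values.
from jinja2 import Template

with open('model_validation.inputs.json') as fh:        # hypothetical file name
    template = Template(fh.read())
rendered = template.render(
    project_name='model_validation_demo',
    nist_vcf='oss://my-bucket/demo/HG001.merged.vcf',
    nist_sample_name='HG001_demo',
    snv_train='oss://my-bucket/demo/train.snv.txt',
    indel_train='oss://my-bucket/demo/train.indel.txt',
    kernel='rbf',
    nu='0.05')
print(rendered)   # Cromwell inputs JSON with every placeholder resolved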
@@ -0,0 +1,26 @@
task ExtractVCFinfo {
	File nist_snv
	File nist_indel
	String sampleName
	String docker
	String cluster_config
	String disk_size

	command <<<
		python /opt/extract_vcf_information.py -i ${nist_snv} -o ${sampleName}.snv.txt
		python /opt/extract_vcf_information.py -i ${nist_indel} -o ${sampleName}.indel.txt
	>>>

	runtime {
		docker: docker
		cluster: cluster_config
		systemDisk: "cloud_ssd 40"
		dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
	}

	output {
		File snv_vcf = "${sampleName}.snv.txt"
		File indel_vcf = "${sampleName}.indel.txt"
	}
}
@@ -0,0 +1,21 @@
task KeepVar {
	File merged_vcf
	String docker
	String cluster_config
	String disk_size

	command <<<
		python /opt/select_small_variants_supported_by_all_callsets.py -i ${merged_vcf} -o all.selected
	>>>

	runtime {
		docker: docker
		cluster: cluster_config
		systemDisk: "cloud_ssd 40"
		dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
	}

	output {
		File keeped_vcf = "all.selected.vcf"
		File outlier_vcf = "all.selected_outlier.vcf"
	}
}
@@ -0,0 +1,44 @@
task SepSnvIndel {
	File vcf
	String sampleName
	String docker
	String cluster_config
	String disk_size

	command <<<
		cat ${vcf} | grep '#' > header
		cat ${vcf} | sed '/^#/d' | awk '$5!~/,/' > removed.body
		cat ${vcf} | sed '/^#/d' | awk '$5~/,/' > MNP.body
		cat header removed.body > ${sampleName}.MNPremoved.vcf
		cat header MNP.body > ${sampleName}.MNP.vcf
		rtg bgzip ${sampleName}.MNPremoved.vcf
		rtg index -f vcf ${sampleName}.MNPremoved.vcf.gz
		rtg vcffilter -i ${sampleName}.MNPremoved.vcf.gz -o ${sampleName}.normed.snv.vcf.gz --snps-only
		rtg vcffilter -i ${sampleName}.MNPremoved.vcf.gz -o ${sampleName}.normed.indel.vcf.gz --non-snps-only
		gzip -d ${sampleName}.normed.snv.vcf.gz -c > ${sampleName}.normed.snv.vcf
		gzip -d ${sampleName}.normed.indel.vcf.gz -c > ${sampleName}.normed.indel.vcf
	>>>

	runtime {
		docker: docker
		cluster: cluster_config
		systemDisk: "cloud_ssd 40"
		dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
	}

	output {
		File MNP = "${sampleName}.MNP.vcf"
		File nist_snv_gz = "${sampleName}.normed.snv.vcf.gz"
		File nist_snv_gz_idx = "${sampleName}.normed.snv.vcf.gz.tbi"
		File nist_indel_gz = "${sampleName}.normed.indel.vcf.gz"
		File nist_indel_gz_idx = "${sampleName}.normed.indel.vcf.gz.tbi"
		File nist_snv = "${sampleName}.normed.snv.vcf"
		File nist_indel = "${sampleName}.normed.indel.vcf"
	}
}
@@ -0,0 +1,61 @@
task SepTrueFalse {
	File snv_true_bed
	File snv_false_bed
	File indel_true_bed
	File indel_false_bed
	File snv_padding
	File indel_padding
	File snv_gz
	File indel_gz
	File snv_idx
	File indel_idx
	String sampleName
	String docker
	String cluster_config
	String disk_size

	command <<<
		rtg vcffilter -i ${snv_gz} -o ${sampleName}.true.snv.vcf.gz --include-bed=${snv_true_bed}
		rtg vcffilter -i ${snv_gz} -o ${sampleName}.false.snv.vcf.gz --include-bed=${snv_false_bed}
		rtg vcffilter -i ${snv_gz} -o ${sampleName}.padding.snv.vcf.gz --include-bed=${snv_padding}
		rtg vcffilter -i ${indel_gz} -o ${sampleName}.true.indel.vcf.gz --include-bed=${indel_true_bed}
		rtg vcffilter -i ${indel_gz} -o ${sampleName}.false.indel.vcf.gz --include-bed=${indel_false_bed}
		rtg vcffilter -i ${indel_gz} -o ${sampleName}.padding.indel.vcf.gz --include-bed=${indel_padding}
	>>>

	runtime {
		docker: docker
		cluster: cluster_config
		systemDisk: "cloud_ssd 40"
		dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
	}

	output {
		File snv_true_vcf = "${sampleName}.true.snv.vcf.gz"
		File snv_true_vcf_index = "${sampleName}.true.snv.vcf.gz.tbi"
		File snv_false_vcf = "${sampleName}.false.snv.vcf.gz"
		File snv_false_vcf_index = "${sampleName}.false.snv.vcf.gz.tbi"
		File snv_padding_vcf = "${sampleName}.padding.snv.vcf.gz"
		File snv_padding_vcf_index = "${sampleName}.padding.snv.vcf.gz.tbi"
		File indel_true_vcf = "${sampleName}.true.indel.vcf.gz"
		File indel_true_vcf_index = "${sampleName}.true.indel.vcf.gz.tbi"
		File indel_false_vcf = "${sampleName}.false.indel.vcf.gz"
		File indel_false_vcf_index = "${sampleName}.false.indel.vcf.gz.tbi"
		File indel_padding_vcf = "${sampleName}.padding.indel.vcf.gz"
		File indel_padding_vcf_index = "${sampleName}.padding.indel.vcf.gz.tbi"
	}
}
@@ -0,0 +1,29 @@
task indelNorm {
	File nist_vcf
	File ref_dir
	String fasta
	String sampleName
	String docker
	String cluster_config
	String disk_size

	command <<<
		cat ${nist_vcf} | grep '#' > header
		cat ${nist_vcf} | grep -v '#' > body
		cat body | grep -w '^chr1\|^chr2\|^chr3\|^chr4\|^chr5\|^chr6\|^chr7\|^chr8\|^chr9\|^chr10\|^chr11\|^chr12\|^chr13\|^chr14\|^chr15\|^chr16\|^chr17\|^chr18\|^chr19\|^chr20\|^chr21\|^chr22\|^chrX' > body.filtered
		cat header body.filtered > ${sampleName}.filtered.vcf
		/opt/hall-lab/bcftools-1.9/bin/bcftools norm -f ${ref_dir}/${fasta} ${sampleName}.filtered.vcf > ${sampleName}.normed.vcf
	>>>

	runtime {
		docker: docker
		cluster: cluster_config
		systemDisk: "cloud_ssd 40"
		dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
	}

	output {
		File normed_vcf = "${sampleName}.normed.vcf"
	}
}
@@ -0,0 +1,21 @@
task merge {
	Array[File] vcf_gz
	Array[File] vcf_index
	String docker
	String cluster_config
	String disk_size

	command <<<
		rtg vcfmerge --force-merge-all --no-gzip -o all.vcf ${sep=" " vcf_gz}
	>>>

	runtime {
		docker: docker
		cluster: cluster_config
		systemDisk: "cloud_ssd 40"
		dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
	}

	output {
		File merged_vcf = "all.vcf"
	}
}
@@ -0,0 +1,35 @@
task mergeBed {
	Array[File] snv_true_bed
	Array[File] snv_false_bed
	Array[File] indel_true_bed
	Array[File] indel_false_bed
	Array[File] indel_padding
	Array[File] snv_padding
	String docker
	String cluster_config
	String disk_size

	command <<<
		/opt/ccdg/bedtools-2.27.1/bin/bedtools multiinter -i ${sep=" " snv_true_bed} ${sep=" " indel_true_bed} > merged.true.bed
		/opt/ccdg/bedtools-2.27.1/bin/bedtools multiinter -i ${sep=" " snv_false_bed} ${sep=" " indel_false_bed} > merged.false.bed
		/opt/ccdg/bedtools-2.27.1/bin/bedtools multiinter -i ${sep=" " snv_padding} ${sep=" " indel_padding} > merged.padding.bed
	>>>

	runtime {
		docker: docker
		cluster: cluster_config
		systemDisk: "cloud_ssd 40"
		dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
	}

	output {
		File true_bed = "merged.true.bed"
		File false_bed = "merged.false.bed"
		File padding = "merged.padding.bed"
	}
}
@@ -0,0 +1,61 @@
task mergeVCF {
	Array[File] snv_true_vcf
	Array[File] snv_true_vcf_index
	Array[File] snv_false_vcf
	Array[File] snv_false_vcf_index
	Array[File] snv_remain_vcf
	Array[File] snv_remain_vcf_index
	Array[File] snv_padding_vcf
	Array[File] snv_padding_vcf_index
	Array[File] indel_true_vcf
	Array[File] indel_true_vcf_index
	Array[File] indel_false_vcf
	Array[File] indel_false_vcf_index
	Array[File] indel_remain_vcf
	Array[File] indel_remain_vcf_index
	Array[File] indel_padding_vcf
	Array[File] indel_padding_vcf_index
	String quartet_sample
	String docker
	String cluster_config
	String disk_size

	command <<<
		rtg vcfmerge --force-merge-all --no-gzip -o ${quartet_sample}.snv.true.vcf.gz ${sep=" " snv_true_vcf}
		rtg vcfmerge --force-merge-all --no-gzip -o ${quartet_sample}.snv.false.vcf.gz ${sep=" " snv_false_vcf}
		rtg vcfmerge --force-merge-all --no-gzip -o ${quartet_sample}.snv.remain.vcf.gz ${sep=" " snv_remain_vcf}
		rtg vcfmerge --force-merge-all --no-gzip -o ${quartet_sample}.snv.padding.vcf.gz ${sep=" " snv_padding_vcf}
		rtg vcfmerge --force-merge-all --no-gzip -o ${quartet_sample}.indel.true.vcf.gz ${sep=" " indel_true_vcf}
		rtg vcfmerge --force-merge-all --no-gzip -o ${quartet_sample}.indel.false.vcf.gz ${sep=" " indel_false_vcf}
		rtg vcfmerge --force-merge-all --no-gzip -o ${quartet_sample}.indel.remain.vcf.gz ${sep=" " indel_remain_vcf}
		rtg vcfmerge --force-merge-all --no-gzip -o ${quartet_sample}.indel.padding.vcf.gz ${sep=" " indel_padding_vcf}
	>>>

	runtime {
		docker: docker
		cluster: cluster_config
		systemDisk: "cloud_ssd 40"
		dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
	}

	output {
		File merged_snv_true = "${quartet_sample}.snv.true.vcf.gz"
		File merged_snv_false = "${quartet_sample}.snv.false.vcf.gz"
		File merged_snv_remain = "${quartet_sample}.snv.remain.vcf.gz"
		File merged_snv_padding = "${quartet_sample}.snv.padding.vcf.gz"
		File merged_indel_true = "${quartet_sample}.indel.true.vcf.gz"
		File merged_indel_false = "${quartet_sample}.indel.false.vcf.gz"
		File merged_indel_remain = "${quartet_sample}.indel.remain.vcf.gz"
		File merged_indel_padding = "${quartet_sample}.indel.padding.vcf.gz"
	}
}
@@ -0,0 +1,41 @@
task oneClass {
	File snv_vcf
	File indel_vcf
	File snv_train
	File indel_train
	String kernel
	String nu
	String sampleName
	String docker
	String cluster_config
	String disk_size

	command <<<
		python /opt/oneClass.py -train ${snv_train} -test ${snv_vcf} -name ${sampleName}_snv -kernel ${kernel} -nu ${nu}
		python /opt/oneClass.py -train ${indel_train} -test ${indel_vcf} -name ${sampleName}_indel -kernel ${kernel} -nu ${nu}
	>>>

	runtime {
		docker: docker
		cluster: cluster_config
		systemDisk: "cloud_ssd 40"
		dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
	}

	output {
		File snv_true_txt = "${sampleName}_snv_predicted_true.txt"
		File snv_false_txt = "${sampleName}_snv_predicted_false.txt"
		File snv_true_bed = "${sampleName}_snv_predicted_true.bed"
		File snv_false_bed = "${sampleName}_snv_predicted_false.bed"
		File snv_padding = "${sampleName}_snv_padding.bed"
		File indel_true_txt = "${sampleName}_indel_predicted_true.txt"
		File indel_false_txt = "${sampleName}_indel_predicted_false.txt"
		File indel_true_bed = "${sampleName}_indel_predicted_true.bed"
		File indel_false_bed = "${sampleName}_indel_predicted_false.bed"
		File indel_padding = "${sampleName}_indel_padding.bed"
	}
}
@@ -0,0 +1,55 @@
task vcfeval {
	File snv_true_vcf
	File snv_true_vcf_index
	File snv_false_vcf
	File snv_false_vcf_index
	File indel_true_vcf
	File indel_true_vcf_index
	File indel_false_vcf
	File indel_false_vcf_index
	File giab_bed
	File giab_snv
	File giab_snv_idx
	File giab_indel
	File giab_indel_idx
	File sdf
	String docker
	String cluster_config
	String disk_size

	command <<<
		set -o pipefail
		set -e
		/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg vcfeval -b ${giab_snv} -c ${snv_true_vcf} -o snv_true -t ${sdf}
		/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg vcfeval -b ${giab_snv} -c ${snv_false_vcf} -o snv_false -t ${sdf}
		/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg vcfeval -b ${giab_indel} -c ${indel_true_vcf} -o indel_true -t ${sdf}
		/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg vcfeval -b ${giab_indel} -c ${indel_false_vcf} -o indel_false -t ${sdf}
		tar -zcvf snv_true.zip snv_true
		tar -zcvf snv_false.zip snv_false
		tar -zcvf indel_true.zip indel_true
		tar -zcvf indel_false.zip indel_false
	>>>

	runtime {
		docker: docker
		cluster: cluster_config
		systemDisk: "cloud_ssd 40"
		dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
	}

	output {
		File snv_true_zip = "snv_true.zip"
		File snv_false_zip = "snv_false.zip"
		File indel_true_zip = "indel_true.zip"
		File indel_false_zip = "indel_false.zip"
	}
}
@@ -0,0 +1,23 @@
task zipIndex {
	File normed_vcf
	String sampleName = basename(normed_vcf, ".normed.vcf")
	String docker
	String cluster_config
	String disk_size

	command <<<
		rtg bgzip ${normed_vcf} -c > ${sampleName}.normed.vcf.gz
		rtg index -f vcf ${sampleName}.normed.vcf.gz
	>>>

	runtime {
		docker: docker
		cluster: cluster_config
		systemDisk: "cloud_ssd 40"
		dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
	}

	output {
		File vcf_gz = "${sampleName}.normed.vcf.gz"
		File vcf_index = "${sampleName}.normed.vcf.gz.tbi"
	}
}
@@ -0,0 +1,105 @@
import "./tasks/indelNorm.wdl" as indelNorm
import "./tasks/SepSnvIndel.wdl" as SepSnvIndel
import "./tasks/ExtractVCFinfo.wdl" as ExtractVCFinfo
import "./tasks/oneClass.wdl" as oneClass
import "./tasks/SepTrueFalse.wdl" as SepTrueFalse
import "./tasks/vcfeval.wdl" as vcfeval

workflow {{ project_name }} {
	File nist_vcf
	File giab_bed
	File giab_snv
	File giab_snv_idx
	File giab_indel
	File giab_indel_idx
	File snv_train
	File indel_train
	File ref_dir
	File sdf
	String kernel
	String nu
	String nist_sample_name
	String cluster_config
	String disk_size
	String fasta

	call indelNorm.indelNorm as indelNorm {
		input:
			nist_vcf=nist_vcf,
			sampleName=nist_sample_name,
			ref_dir=ref_dir,
			fasta=fasta,
			disk_size=disk_size,
			cluster_config=cluster_config
	}
	call SepSnvIndel.SepSnvIndel as SepSnvIndel {
		input:
			vcf=indelNorm.normed_vcf,
			sampleName=nist_sample_name,
			cluster_config=cluster_config,
			disk_size=disk_size
	}
	call ExtractVCFinfo.ExtractVCFinfo as ExtractVCFinfo {
		input:
			nist_snv=SepSnvIndel.nist_snv,
			nist_indel=SepSnvIndel.nist_indel,
			sampleName=nist_sample_name,
			cluster_config=cluster_config,
			disk_size=disk_size
	}
	call oneClass.oneClass as oneClass {
		input:
			snv_vcf=ExtractVCFinfo.snv_vcf,
			indel_vcf=ExtractVCFinfo.indel_vcf,
			snv_train=snv_train,
			indel_train=indel_train,
			kernel=kernel,
			nu=nu,
			sampleName=nist_sample_name,
			cluster_config=cluster_config,
			disk_size=disk_size
	}
	call SepTrueFalse.SepTrueFalse as SepTrueFalse {
		input:
			snv_true_bed=oneClass.snv_true_bed,
			snv_false_bed=oneClass.snv_false_bed,
			indel_true_bed=oneClass.indel_true_bed,
			indel_false_bed=oneClass.indel_false_bed,
			snv_padding=oneClass.snv_padding,
			indel_padding=oneClass.indel_padding,
			snv_gz=SepSnvIndel.nist_snv_gz,
			indel_gz=SepSnvIndel.nist_indel_gz,
			snv_idx=SepSnvIndel.nist_snv_gz_idx,
			indel_idx=SepSnvIndel.nist_indel_gz_idx,
			sampleName=nist_sample_name,
			cluster_config=cluster_config,
			disk_size=disk_size
	}
	call vcfeval.vcfeval as vcfeval {
		input:
			snv_true_vcf=SepTrueFalse.snv_true_vcf,
			snv_true_vcf_index=SepTrueFalse.snv_true_vcf_index,
			snv_false_vcf=SepTrueFalse.snv_false_vcf,
			snv_false_vcf_index=SepTrueFalse.snv_false_vcf_index,
			indel_true_vcf=SepTrueFalse.indel_true_vcf,
			indel_true_vcf_index=SepTrueFalse.indel_true_vcf_index,
			indel_false_vcf=SepTrueFalse.indel_false_vcf,
			indel_false_vcf_index=SepTrueFalse.indel_false_vcf_index,
			giab_bed=giab_bed,
			giab_snv=giab_snv,
			giab_snv_idx=giab_snv_idx,
			giab_indel=giab_indel,
			giab_indel_idx=giab_indel_idx,
			sdf=sdf,
			cluster_config=cluster_config,
			disk_size=disk_size
	}
}