@@ -0,0 +1,95 @@ | |||
#高置信突变位点的整合 | |||
> Author: Run Luyao | |||
> | |||
> E-mail:18110700050@fudan.edu.cn | |||
> | |||
> Git: | |||
> | |||
> Last Updates: 18/03/2020 | |||
## 安装指南 | |||
```bash | |||
# 激活choppy环境 | |||
source activate choppy | |||
# 安装app | |||
choppy install renluyao/high_confidence_calls_manuscript | |||
``` | |||
## App概述 | |||
中华家系1号全基因组高置信small variants(SNVs和Indels)的整合流程。 | |||
## 流程与参数 | |||
1. variantsNorm | |||
保留chr1-22,X上的突变 | |||
用bcftools norm进行突变格式的统一 | |||
2. mendelian | |||
LCL5、LCL7和LCL8为三口之家,进行trio-analysis,分析符合孟德尔和不符合孟德尔遗传规律的突变位点 | |||
LCL6、LCL7和LCL8为三口之家,进行trio-analysis,分析符合孟德尔和不符合孟德尔遗传规律的突变位点 | |||
得到LCL5和LCL6两个家系合并的vcf文件 | |||
3. zipIndex | |||
对LCL5和LCL6两个家系合并的文件压缩和检索引 | |||
4. VCFrename | |||
将VBT的输出结果,VCF文件中的MOTHER FATHER CHILD改成对应的样本名 | |||
5. mergeSister | |||
将LCL5和LCL6修改过名字后的家系VCF文件合并 | |||
6. reformVCF | |||
根据两个三口之家和姐妹的孟德尔遗传的信息,将之前和合并的VCF分解成4个人的vcf,并且包含了家系遗传的信息 | |||
7. familyzipIndex | |||
将上一步输出的4个文件进行压缩和检索引 | |||
8. merge | |||
将所有注释后的LCL5 vcf文件合并 | |||
将所有注释后的LCL6 vcf文件合并 | |||
将所有注释后的LCL7 vcf文件合并 | |||
将所有注释后的LCL8 vcf文件合并 | |||
9. vote | |||
投票选择高置信的突变位点 | |||
t | |||
## App输入变量与输入文件 | |||
## App输出文件 | |||
## 结果展示与解读 | |||
## CHANGELOG | |||
## FAQ | |||
@@ -0,0 +1,68 @@ | |||
import sys,getopt | |||
import os | |||
import re | |||
import fileinput | |||
def usage(): | |||
print( | |||
""" | |||
Usage: python bed_for_bamReadcount.py -i input_vcf_file -o prefix | |||
This script selects SNPs and Indels supported by all callsets. | |||
Please notice that bam-readcount only takes in 1-based coordinates. | |||
Input: | |||
-i a vcf file | |||
Output: | |||
-o a indel bed file for bam-readcount | |||
""") | |||
# select supported small variants | |||
def process(oneLine): | |||
m = re.match('^\#',oneLine) | |||
if m is not None: | |||
pass | |||
else: | |||
line = oneLine.rstrip() | |||
strings = line.strip().split('\t') | |||
# convert the position to bed file for bam-readcount | |||
# deletion | |||
if len(strings[3]) > 1 and len(strings[4]) == 1: | |||
pos = int(strings[1]) + 1 | |||
outline = strings[0] + '\t' + str(pos) + '\t' + str(pos) + '\t' + strings[3] + '\t' + strings[4]+'\n' | |||
outINDEL.write(outline) | |||
# insertion | |||
elif len(strings[3]) == 1 and len(strings[4]) > 1 and (',' not in strings[4]): | |||
outline = strings[0] + '\t' + strings[1] + '\t' + strings[1] + '\t' + strings[3] + '\t' + strings[4] + '\n' | |||
outINDEL.write(outline) | |||
else: | |||
outMNP.write(oneLine) | |||
opts,args = getopt.getopt(sys.argv[1:],"hi:o:") | |||
for op,value in opts: | |||
if op == "-i": | |||
inputFile=value | |||
elif op == "-o": | |||
prefix=value | |||
elif op == "-h": | |||
usage() | |||
sys.exit() | |||
if len(sys.argv[1:]) < 3: | |||
usage() | |||
sys.exit() | |||
INDELname = prefix + '.bed' | |||
MNPname = prefix + '_MNP.txt' | |||
outINDEL = open(INDELname,'w') | |||
outMNP = open(MNPname,'w') | |||
for line in fileinput.input(inputFile): | |||
process(line) | |||
outINDEL.close() | |||
outMNP.close() | |||
@@ -0,0 +1,96 @@ | |||
import sys,getopt | |||
import os | |||
import re | |||
import fileinput | |||
import pandas as pd | |||
def usage(): | |||
print( | |||
""" | |||
Usage: python extract_vcf_information.py -i input_merged_vcf_file -o parsed_file | |||
This script will extract SNVs and Indels information from the vcf files and output a tab-delimited files. | |||
Input: | |||
-i the selected vcf file | |||
Output: | |||
-o tab-delimited parsed file | |||
""") | |||
# select supported small variants | |||
def process(oneLine): | |||
line = oneLine.rstrip() | |||
strings = line.strip().split('\t') | |||
infoParsed = parse_INFO(strings[7]) | |||
formatKeys = strings[8].split(':') | |||
formatValues = strings[9].split(':') | |||
for i in range(0,len(formatKeys) -1) : | |||
if formatKeys[i] == 'AD': | |||
ra = formatValues[i].split(',') | |||
infoParsed['RefDP'] = ra[0] | |||
infoParsed['AltDP'] = ra[1] | |||
if (int(ra[1]) + int(ra[0])) != 0: | |||
infoParsed['af'] = float(int(ra[1])/(int(ra[1]) + int(ra[0]))) | |||
else: | |||
pass | |||
else: | |||
infoParsed[formatKeys[i]] = formatValues[i] | |||
infoParsed['chromo'] = strings[0] | |||
infoParsed['pos'] = strings[1] | |||
infoParsed['id'] = strings[2] | |||
infoParsed['ref'] = strings[3] | |||
infoParsed['alt'] = strings[4] | |||
infoParsed['qual'] = strings[5] | |||
return infoParsed | |||
def parse_INFO(info): | |||
strings = info.strip().split(';') | |||
keys = [] | |||
values = [] | |||
for i in strings: | |||
kv = i.split('=') | |||
if kv[0] == 'DB': | |||
keys.append('DB') | |||
values.append('1') | |||
elif kv[0] == 'AF': | |||
pass | |||
elif kv[0] == 'POSITIVE_TRAIN_SITE': | |||
pass | |||
elif kv[0] == 'NEGATIVE_TRAIN_SITE': | |||
pass | |||
else: | |||
keys.append(kv[0]) | |||
values.append(kv[1]) | |||
infoDict = dict(zip(keys, values)) | |||
return infoDict | |||
opts,args = getopt.getopt(sys.argv[1:],"hi:o:") | |||
for op,value in opts: | |||
if op == "-i": | |||
inputFile=value | |||
elif op == "-o": | |||
outputFile=value | |||
elif op == "-h": | |||
usage() | |||
sys.exit() | |||
if len(sys.argv[1:]) < 3: | |||
usage() | |||
sys.exit() | |||
allDict = [] | |||
for line in fileinput.input(inputFile): | |||
m = re.match('^\#',line) | |||
if m is not None: | |||
pass | |||
else: | |||
oneDict = process(line) | |||
allDict.append(oneDict) | |||
allTable = pd.DataFrame(allDict) | |||
allTable.to_csv(outputFile,sep='\t',index=False) | |||
@@ -0,0 +1,255 @@ | |||
# import modules | |||
import sys, argparse, os | |||
import fileinput | |||
import re | |||
import pandas as pd | |||
from operator import itemgetter | |||
from collections import Counter | |||
from itertools import islice | |||
# input arguments | |||
parser = argparse.ArgumentParser(description="this script is to count voting number") | |||
parser.add_argument('-vcf', '--multi_sample_vcf', type=str, help='The VCF file you want to count the voting number', required=True) | |||
parser.add_argument('-dup', '--dup_list', type=str, help='Duplication list', required=True) | |||
parser.add_argument('-sample', '--sample_name', type=str, help='which sample of quartet', required=True) | |||
parser.add_argument('-prefix', '--prefix', type=str, help='Prefix of output file name', required=True) | |||
args = parser.parse_args() | |||
multi_sample_vcf = args.multi_sample_vcf | |||
dup_list = args.dup_list | |||
prefix = args.prefix | |||
sample_name = args.sample_name | |||
vcf_header = '''##fileformat=VCFv4.2 | |||
##fileDate=20191224 | |||
##source=high_confidence_calls_intergration(choppy app) | |||
##reference=GRCh38.d1.vd1 | |||
##INFO=<ID=DPCT,Number=1,Type=Float,Description="Percentage of detected votes"> | |||
##INFO=<ID=VPCT,Number=1,Type=Float,Description="Percentage of consnesus votes"> | |||
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype"> | |||
##contig=<ID=chr1,length=248956422> | |||
##contig=<ID=chr2,length=242193529> | |||
##contig=<ID=chr3,length=198295559> | |||
##contig=<ID=chr4,length=190214555> | |||
##contig=<ID=chr5,length=181538259> | |||
##contig=<ID=chr6,length=170805979> | |||
##contig=<ID=chr7,length=159345973> | |||
##contig=<ID=chr8,length=145138636> | |||
##contig=<ID=chr9,length=138394717> | |||
##contig=<ID=chr10,length=133797422> | |||
##contig=<ID=chr11,length=135086622> | |||
##contig=<ID=chr12,length=133275309> | |||
##contig=<ID=chr13,length=114364328> | |||
##contig=<ID=chr14,length=107043718> | |||
##contig=<ID=chr15,length=101991189> | |||
##contig=<ID=chr16,length=90338345> | |||
##contig=<ID=chr17,length=83257441> | |||
##contig=<ID=chr18,length=80373285> | |||
##contig=<ID=chr19,length=58617616> | |||
##contig=<ID=chr20,length=64444167> | |||
##contig=<ID=chr21,length=46709983> | |||
##contig=<ID=chr22,length=50818468> | |||
##contig=<ID=chrX,length=156040895> | |||
''' | |||
# read in duplication list | |||
dup = pd.read_table(dup_list,header=None) | |||
var_dup = dup[0].tolist() | |||
# output file | |||
file_name = prefix + '_annotated.vcf' | |||
outfile = open(file_name,'w') | |||
# write VCF | |||
outputcolumn = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tQuartet_DNA_BGI_SEQ2000_BGI_1_20180518_LCL5\tQuartet_DNA_BGI_SEQ2000_BGI_2_20180530_LCL5\tQuartet_DNA_BGI_SEQ2000_BGI_3_20180530_LCL5\tQuartet_DNA_BGI_SEQ2000_WGE_1_20190402_LCL5\tQuartet_DNA_BGI_SEQ2000_WGE_2_20190402_LCL5\tQuartet_DNA_BGI_SEQ500_BGI_1_20180328_LCL5 \tQuartet_DNA_BGI_SEQ500_BGI_2_20180328_LCL5\tQuartet_DNA_BGI_SEQ500_BGI_3_20180328_LCL5\tQuartet_DNA_ILM_Nova_ARD_1_20181108_LCL5\tQuartet_DNA_ILM_Nova_ARD_2_20181108_LCL5\tQuartet_DNA_ILM_Nova_ARD_3_20181108_LCL5\tQuartet_DNA_ILM_Nova_ARD_4_20190111_LCL5\tQuartet_DNA_ILM_Nova_ARD_5_20190111_LCL5\tQuartet_DNA_ILM_Nova_ARD_6_20190111_LCL5\tQuartet_DNA_ILM_Nova_BRG_1_20171024_LCL5\tQuartet_DNA_ILM_Nova_BRG_1_20180930_LCL5\tQuartet_DNA_ILM_Nova_BRG_2_20180930_LCL5\tQuartet_DNA_ILM_Nova_BRG_3_20180930_LCL5\tQuartet_DNA_ILM_Nova_GAC_1_20171025_LCL5\tQuartet_DNA_ILM_Nova_NVG_1_20171024_LCL5\tQuartet_DNA_ILM_Nova_WUX_1_20171024_LCL5\tQuartet_DNA_ILM_XTen_ARD_1_20170403_LCL5\tQuartet_DNA_ILM_XTen_ARD_2_20170403_LCL5\tQuartet_DNA_ILM_XTen_ARD_3_20170403_LCL5\tQuartet_DNA_ILM_XTen_NVG_1_20170329_LCL5\tQuartet_DNA_ILM_XTen_NVG_2_20170329_LCL5\tQuartet_DNA_ILM_XTen_NVG_3_20170329_LCL5\tQuartet_DNA_ILM_XTen_WUX_1_20170216_LCL5\tQuartet_DNA_ILM_XTen_WUX_2_20170216_LCL5\tQuartet_DNA_ILM_XTen_WUX_3_20170216_LCL5\tQuartet_DNA_ILM_XTen_WUX_4_20180703_LCL5\tQuartet_DNA_ILM_XTen_WUX_5_20180703_LCL5\tQuartet_DNA_ILM_XTen_WUX_6_20180703_LCL5' +'\t'+ sample_name+'_pcr'+'\t' + sample_name+'_pcr-free'+ '\t'+ sample_name +'_consensus' + '\t' + sample_name + '_consensus_alt_seq' +'\n' | |||
outfile.write(vcf_header) | |||
outfile.write(outputcolumn) | |||
#function | |||
def detected_percentage(strings): | |||
strings = [x.replace('0/0','.') for x in strings] | |||
gt = [x.split(':')[0] for x in strings] | |||
percentage = round((33 - gt.count('.'))/33,4) | |||
return(str(percentage)) | |||
def vote_percentage(strings,consensus_call): | |||
strings = [x.replace('.','0/0') for x in strings] | |||
gt = [x.split(':')[0] for x in strings] | |||
gt = list(map(gt_uniform,[i for i in gt])) | |||
percentage = round(gt.count(consensus_call)/33,4) | |||
return(str(percentage)) | |||
def family_vote(strings,consensus_call): | |||
pass | |||
def gt_uniform(strings): | |||
uniformed_gt = '' | |||
allele1 = strings.split('/')[0] | |||
allele2 = strings.split('/')[1] | |||
if int(allele1) > int(allele2): | |||
uniformed_gt = allele2 + '/' + allele1 | |||
else: | |||
uniformed_gt = allele1 + '/' + allele2 | |||
return uniformed_gt | |||
def decide_by_rep(strings): | |||
consensus_rep = '' | |||
mendelian = [x[-5:] for x in strings] | |||
strings = [x.replace('.','0/0') for x in strings] | |||
gt = [x.split(':')[0] for x in strings] | |||
# modified gt turn 2/1 to 1/2 | |||
gt = list(map(gt_uniform,[i for i in gt])) | |||
# mendelian consistent? | |||
mendelian_dict = Counter(mendelian) | |||
highest_mendelian = mendelian_dict.most_common(1) | |||
candidate_mendelian = highest_mendelian[0][0] | |||
freq_mendelian = highest_mendelian[0][1] | |||
if (candidate_mendelian == '1:1:1') and (freq_mendelian >= 2): | |||
gt_num_dict = Counter(gt) | |||
highest_gt = gt_num_dict.most_common(1) | |||
candidate_gt = highest_gt[0][0] | |||
freq_gt = highest_gt[0][1] | |||
if (candidate_gt != '0/0') and (freq_gt >= 2): | |||
consensus_rep = candidate_gt | |||
elif (candidate_gt == '0/0') and (freq_gt >= 2): | |||
consensus_rep = '0/0' | |||
else: | |||
consensus_rep = 'inconGT' | |||
elif (candidate_mendelian == '.') and (freq_mendelian >= 2): | |||
consensus_rep = 'noInfo' | |||
else: | |||
consensus_rep = 'inconMen' | |||
return consensus_rep | |||
def main(): | |||
for line in fileinput.input(multi_sample_vcf): | |||
headline = re.match('^\#',line) | |||
if headline is not None: | |||
pass | |||
else: | |||
line = line.strip() | |||
strings = line.split('\t') | |||
variant_id = '_'.join([strings[0],strings[1]]) | |||
# check if the variants location is duplicated | |||
if variant_id in var_dup: | |||
strings[6] = 'dupVar' | |||
outLine = '\t'.join(strings) + '\t' + '.' +'\t' + '.' + '\t' + 'dupVar' + '\t' + '.' +'\n' | |||
outfile.write(outLine) | |||
else: | |||
# pre-define | |||
pcr_consensus = '' | |||
pcr_free_consensus = '' | |||
consensus_call = '' | |||
consensus_alt_seq = '.' | |||
# pcr | |||
pcr = itemgetter(*[9,10,11,12,14,15,16,23,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41])(strings) | |||
SEQ2000 = decide_by_rep(pcr[0:3]) | |||
SEQ500 = decide_by_rep(pcr[4:7]) | |||
Nova = decide_by_rep(pcr[7:11]) | |||
XTen_ARD = decide_by_rep(pcr[11:14]) | |||
XTen_NVG = decide_by_rep(pcr[14:17]) | |||
XTen_WUX_1 = decide_by_rep(pcr[17:20]) | |||
XTen_WUX_2 = decide_by_rep(pcr[20:23]) | |||
sequence_site = [SEQ2000,SEQ500,Nova,XTen_ARD,XTen_NVG,XTen_WUX_1,XTen_WUX_2] | |||
sequence_dict = Counter(sequence_site) | |||
highest_sequence = sequence_dict.most_common(1) | |||
candidate_sequence = highest_sequence[0][0] | |||
freq_sequence = highest_sequence[0][1] | |||
if freq_sequence > 4: | |||
pcr_consensus = candidate_sequence | |||
else: | |||
pcr_consensus = 'inconSequenceSite' | |||
# pcr-free | |||
pcr_free = itemgetter(*[13,17,18,19,20,21,22,24,25,26])(strings) | |||
#SEQ2000 = decide_by_rep(pcr_free[0]) | |||
Nova_ARD_1 = decide_by_rep(pcr_free[1:4]) | |||
Nova_ARD_2 = decide_by_rep(pcr_free[4:7]) | |||
Nova_BRG = decide_by_rep(pcr_free[7:10]) | |||
sequence_site = [SEQ2000,Nova_ARD_1,Nova_ARD_2,Nova_BRG] | |||
highest_sequence = sequence_dict.most_common(1) | |||
candidate_sequence = highest_sequence[0][0] | |||
freq_sequence = highest_sequence[0][1] | |||
if freq_sequence > 2: | |||
pcr_free_consensus = candidate_sequence | |||
else: | |||
pcr_free_consensus = 'inconSequenceSite' | |||
# pcr and pcr-free | |||
tag = ['inconGT','noInfo','inconMen','inconSequenceSite'] | |||
if (pcr_consensus == pcr_free_consensus) and (pcr_consensus not in tag) and (pcr_consensus != '0/0'): | |||
consensus_call = pcr_consensus | |||
VPCT = vote_percentage(strings[9:],consensus_call) | |||
strings[7] = 'VPCT=' + VPCT | |||
DPCT = detected_percentage(strings[9:]) | |||
strings[7] = strings[7] + ';DPCT=' + DPCT | |||
# Delete multiple alternative genotype to necessary expression | |||
strings[6] = 'reproducible' | |||
alt = strings[4] | |||
alt_gt = alt.split(',') | |||
if len(alt_gt) > 1: | |||
allele1 = consensus_call.split('/')[0] | |||
allele2 = consensus_call.split('/')[1] | |||
if allele1 == '0': | |||
allele2_seq = alt_gt[int(allele2) - 1] | |||
consensus_alt_seq = allele2_seq | |||
consensus_call = '0/1' | |||
else: | |||
allele1_seq = alt_gt[int(allele1) - 1] | |||
allele2_seq = alt_gt[int(allele2) - 1] | |||
if int(allele1) > int(allele2): | |||
consensus_alt_seq = allele2_seq + ',' + allele1_seq | |||
consensus_call = '1/2' | |||
elif int(allele1) < int(allele2): | |||
consensus_alt_seq = allele1_seq + ',' + allele2_seq | |||
consensus_call = '1/2' | |||
else: | |||
consensus_alt_seq = allele1_seq | |||
consensus_call = '1/1' | |||
else: | |||
consensus_alt_seq = alt | |||
elif (pcr_consensus in tag) and (pcr_free_consensus in tag): | |||
consensus_call = 'filtered' | |||
strings[6] = 'filtered' | |||
DPCT = detected_percentage(strings[9:]) | |||
strings[7] = 'DPCT=' + DPCT | |||
elif ((pcr_consensus == '0/0') or (pcr_consensus in tag)) and ((pcr_free_consensus not in tag) and (pcr_free_consensus != '0/0')): | |||
consensus_call = 'pcr-free-speicifc' | |||
strings[6] = 'pcr-free-speicifc' | |||
DPCT = detected_percentage(strings[9:]) | |||
strings[7] = 'DPCT=' + DPCT | |||
elif ((pcr_consensus != '0/0') or (pcr_consensus not in tag)) and ((pcr_free_consensus in tag) and (pcr_free_consensus == '0/0')): | |||
consensus_call = 'pcr-speicifc' | |||
strings[6] = 'pcr-speicifc' | |||
DPCT = detected_percentage(strings[9:]) | |||
strings[7] = 'DPCT=' + DPCT | |||
elif (pcr_consensus == '0/0') and (pcr_free_consensus == '0/0'): | |||
consensus_call = 'confirm for parents' | |||
strings[6] = 'confirm for parents' | |||
DPCT = detected_percentage(strings[9:]) | |||
strings[7] = 'DPCT=' + DPCT | |||
else: | |||
consensus_call = 'filtered' | |||
strings[6] = 'filtered' | |||
DPCT = detected_percentage(strings[9:]) | |||
strings[7] = 'DPCT=' + DPCT | |||
# output | |||
outLine = '\t'.join(strings) + '\t' + pcr_consensus +'\t' + pcr_free_consensus + '\t' + consensus_call + '\t' + consensus_alt_seq + '\n' | |||
outfile.write(outLine) | |||
if __name__ == '__main__': | |||
main() | |||
@@ -0,0 +1,109 @@ | |||
# import modules | |||
import numpy as np | |||
import pandas as pd | |||
from sklearn import svm | |||
from sklearn import preprocessing | |||
import sys, argparse, os | |||
from vcf2bed import position_to_bed,padding_region | |||
parser = argparse.ArgumentParser(description="this script is to preform one calss svm on each chromosome") | |||
parser.add_argument('-train', '--trainDataset', type=str, help='training dataset generated from extracting vcf information part, with mutaitons supported by callsets', required=True) | |||
parser.add_argument('-test', '--testDataset', type=str, help='testing dataset generated from extracting vcf information part, with mutaitons not called by all callsets', required=True) | |||
parser.add_argument('-name', '--sampleName', type=str, help='sample name for output file name', required=True) | |||
args = parser.parse_args() | |||
# Rename input: | |||
train_input = args.trainDataset | |||
test_input = args.testDataset | |||
sample_name = args.sampleName | |||
# default columns, which will be included in the included in the calssifier | |||
chromosome = ['chr1','chr2','chr3','chr4','chr5','chr6','chr7','chr8','chr9','chr10','chr11','chr12','chr13','chr14','chr15' ,'chr16','chr17','chr18','chr19','chr20','chr21','chr22','chrX','chrY'] | |||
feature_heter_cols = ['AltDP','BaseQRankSum','DB','DP','FS','GQ','MQ','MQRankSum','QD','ReadPosRankSum','RefDP','SOR','af'] | |||
feature_homo_cols = ['AltDP','DB','DP','FS','GQ','MQ','QD','RefDP','SOR','af'] | |||
# import datasets sepearate the records with or without BaseQRankSum annotation, etc. | |||
def load_dat(dat_file_name): | |||
dat = pd.read_table(dat_file_name) | |||
dat['DB'] = dat['DB'].fillna(0) | |||
dat = dat[dat['DP'] != 0] | |||
dat['af'] = dat['AltDP']/(dat['AltDP'] + dat['RefDP']) | |||
homo_rows = dat[dat['BaseQRankSum'].isnull()] | |||
heter_rows = dat[dat['BaseQRankSum'].notnull()] | |||
return homo_rows,heter_rows | |||
train_homo,train_heter = load_dat(train_input) | |||
test_homo,test_heter = load_dat(test_input) | |||
clf = svm.OneClassSVM(nu=0.05,kernel='rbf', gamma='auto_deprecated',cache_size=500) | |||
def prepare_dat(train_dat,test_dat,feature_cols,chromo): | |||
chr_train = train_dat[train_dat['chromo'] == chromo] | |||
chr_test = test_dat[test_dat['chromo'] == chromo] | |||
train_dat = chr_train.loc[:,feature_cols] | |||
test_dat = chr_test.loc[:,feature_cols] | |||
train_dat_scaled = preprocessing.scale(train_dat) | |||
test_dat_scaled = preprocessing.scale(test_dat) | |||
return chr_test,train_dat_scaled,test_dat_scaled | |||
def oneclass(X_train,X_test,chr_test): | |||
clf.fit(X_train) | |||
y_pred_test = clf.predict(X_test) | |||
test_true_dat = chr_test[y_pred_test == 1] | |||
test_false_dat = chr_test[y_pred_test == -1] | |||
return test_true_dat,test_false_dat | |||
predicted_true = pd.DataFrame(columns=train_homo.columns) | |||
predicted_false = pd.DataFrame(columns=train_homo.columns) | |||
for chromo in chromosome: | |||
# homo datasets | |||
chr_test_homo,X_train_homo,X_test_homo = prepare_dat(train_homo,test_homo,feature_homo_cols,chromo) | |||
test_true_homo,test_false_homo = oneclass(X_train_homo,X_test_homo,chr_test_homo) | |||
predicted_true = predicted_true.append(test_true_homo) | |||
predicted_false = predicted_false.append(test_false_homo) | |||
# heter datasets | |||
chr_test_heter,X_train_heter,X_test_heter = prepare_dat(train_heter,test_heter,feature_heter_cols,chromo) | |||
test_true_heter,test_false_heter = oneclass(X_train_heter,X_test_heter,chr_test_heter) | |||
predicted_true = predicted_true.append(test_true_heter) | |||
predicted_false = predicted_false.append(test_false_heter) | |||
predicted_true_filename = sample_name + '_predicted_true.txt' | |||
predicted_false_filename = sample_name + '_predicted_false.txt' | |||
predicted_true.to_csv(predicted_true_filename,sep='\t',index=False) | |||
predicted_false.to_csv(predicted_false_filename,sep='\t',index=False) | |||
# output the bed file and padding bed region 50bp | |||
predicted_true_bed_filename = sample_name + '_predicted_true.bed' | |||
predicted_false_bed_filename = sample_name + '_predicted_false.bed' | |||
padding_filename = sample_name + '_padding.bed' | |||
predicted_true_bed = open(predicted_true_bed_filename,'w') | |||
predicted_false_bed = open(predicted_false_bed_filename,'w') | |||
padding = open(padding_filename,'w') | |||
# | |||
for index,row in predicted_false.iterrows(): | |||
chromo,pos1,pos2 = position_to_bed(row['chromo'],row['pos'],row['ref'],row['alt']) | |||
outline_pos = chromo + '\t' + str(pos1) + '\t' + str(pos2) + '\n' | |||
predicted_false_bed.write(outline_pos) | |||
chromo,pad_pos1,pad_pos2,pad_pos3,pad_pos4 = padding_region(chromo,pos1,pos2,50) | |||
outline_pad_1 = chromo + '\t' + str(pad_pos1) + '\t' + str(pad_pos2) + '\n' | |||
outline_pad_2 = chromo + '\t' + str(pad_pos3) + '\t' + str(pad_pos4) + '\n' | |||
padding.write(outline_pad_1) | |||
padding.write(outline_pad_2) | |||
for index,row in predicted_true.iterrows(): | |||
chromo,pos1,pos2 = position_to_bed(row['chromo'],row['pos'],row['ref'],row['alt']) | |||
outline_pos = chromo + '\t' + str(pos1) + '\t' + str(pos2) + '\n' | |||
predicted_true_bed.write(outline_pos) | |||
@@ -0,0 +1,144 @@ | |||
# import modules | |||
import sys, argparse, os | |||
import fileinput | |||
import re | |||
parser = argparse.ArgumentParser(description="This script is to split samples in VCF files and rewrite to the right style") | |||
parser.add_argument('-vcf', '--familyVCF', type=str, help='VCF with sister and mendelian infomation', required=True) | |||
parser.add_argument('-name', '--familyName', type=str, help='Family name of the VCF file', required=True) | |||
args = parser.parse_args() | |||
# Rename input: | |||
inputFile = args.familyVCF | |||
family_name = args.familyName | |||
# output filename | |||
LCL5_name = family_name + '.LCL5.vcf' | |||
LCL5file = open(LCL5_name,'w') | |||
LCL6_name = family_name + '.LCL6.vcf' | |||
LCL6file = open(LCL6_name,'w') | |||
LCL7_name = family_name + '.LCL7.vcf' | |||
LCL7file = open(LCL7_name,'w') | |||
LCL8_name = family_name + '.LCL8.vcf' | |||
LCL8file = open(LCL8_name,'w') | |||
family_filename = family_name + '.vcf' | |||
familyfile = open(family_filename,'w') | |||
# default columns, which will be included in the included in the calssifier | |||
vcfheader = '''##fileformat=VCFv4.2 | |||
##FILTER=<ID=PASS,Description="the same genotype between twin sister and mendelian consistent in 578 and 678"> | |||
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype"> | |||
##FORMAT=<ID=TWINS,Number=0,Type=Flag,Description="0 for sister consistent, 1 for sister inconsistent"> | |||
##FORMAT=<ID=TRIO5,Number=0,Type=Flag,Description="0 for trio consistent, 1 for trio inconsistent"> | |||
##FORMAT=<ID=TRIO6,Number=0,Type=Flag,Description="0 for trio consistent, 1 for trio inconsistent"> | |||
##contig=<ID=chr1,length=248956422> | |||
##contig=<ID=chr2,length=242193529> | |||
##contig=<ID=chr3,length=198295559> | |||
##contig=<ID=chr4,length=190214555> | |||
##contig=<ID=chr5,length=181538259> | |||
##contig=<ID=chr6,length=170805979> | |||
##contig=<ID=chr7,length=159345973> | |||
##contig=<ID=chr8,length=145138636> | |||
##contig=<ID=chr9,length=138394717> | |||
##contig=<ID=chr10,length=133797422> | |||
##contig=<ID=chr11,length=135086622> | |||
##contig=<ID=chr12,length=133275309> | |||
##contig=<ID=chr13,length=114364328> | |||
##contig=<ID=chr14,length=107043718> | |||
##contig=<ID=chr15,length=101991189> | |||
##contig=<ID=chr16,length=90338345> | |||
##contig=<ID=chr17,length=83257441> | |||
##contig=<ID=chr18,length=80373285> | |||
##contig=<ID=chr19,length=58617616> | |||
##contig=<ID=chr20,length=64444167> | |||
##contig=<ID=chr21,length=46709983> | |||
##contig=<ID=chr22,length=50818468> | |||
##contig=<ID=chrX,length=156040895> | |||
''' | |||
# write VCF | |||
LCL5colname = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t'+family_name+'_LCL5'+'\n' | |||
LCL5file.write(vcfheader) | |||
LCL5file.write(LCL5colname) | |||
LCL6colname = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t'+family_name+'_LCL6'+'\n' | |||
LCL6file.write(vcfheader) | |||
LCL6file.write(LCL6colname) | |||
LCL7colname = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t'+family_name+'_LCL7'+'\n' | |||
LCL7file.write(vcfheader) | |||
LCL7file.write(LCL7colname) | |||
LCL8colname = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t'+family_name+'_LCL8'+'\n' | |||
LCL8file.write(vcfheader) | |||
LCL8file.write(LCL8colname) | |||
familycolname = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t'+'LCL5\t'+'LCL6\t'+'LCL7\t'+'LCL8'+'\n' | |||
familyfile.write(vcfheader) | |||
familyfile.write(familycolname) | |||
# reform VCF | |||
def process(oneLine): | |||
line = oneLine.rstrip() | |||
strings = line.strip().split('\t') | |||
# replace . | |||
# LCL5 uniq | |||
if strings[11] == '.': | |||
strings[11] = '0/0' | |||
strings[9] = strings[12] | |||
strings[10] = strings[13] | |||
else: | |||
pass | |||
# LCL6 uniq | |||
if strings[14] == '.': | |||
strings[14] = '0/0' | |||
strings[12] = strings[9] | |||
strings[13] = strings[10] | |||
else: | |||
pass | |||
# sister | |||
if strings[11] == strings[14]: | |||
add_format = ":1" | |||
else: | |||
add_format = ":0" | |||
# trioLCL5 | |||
if strings[15] == 'MD=1': | |||
add_format = add_format + ":1" | |||
else: | |||
add_format = add_format + ":0" | |||
# trioLCL6 | |||
if strings[7] == 'MD=1': | |||
add_format = add_format + ":1" | |||
else: | |||
add_format = add_format + ":0" | |||
# filter | |||
if (strings[11] == strings[14]) and (strings[15] == 'MD=1') and (strings[7] == 'MD=1'): | |||
strings[6] = 'PASS' | |||
else: | |||
strings[6] = '.' | |||
# output LCL5 | |||
LCL5outLine = strings[0]+'\t'+strings[1]+'\t'+strings[2]+'\t'+strings[3]+'\t'+strings[4]+'\t'+'.'+'\t'+strings[6]+'\t'+ '.' +'\t'+ 'GT:TWINS:TRIO5:TRIO6' + '\t' + strings[14] + add_format + '\n' | |||
LCL5file.write(LCL5outLine) | |||
# output LCL6 | |||
LCL6outLine = strings[0]+'\t'+strings[1]+'\t'+strings[2]+'\t'+strings[3]+'\t'+strings[4]+'\t'+'.'+'\t'+strings[6]+'\t'+ '.' +'\t'+ 'GT:TWINS:TRIO5:TRIO6' + '\t' + strings[11] + add_format + '\n' | |||
LCL6file.write(LCL6outLine) | |||
# output LCL7 | |||
LCL7outLine = strings[0]+'\t'+strings[1]+'\t'+strings[2]+'\t'+strings[3]+'\t'+strings[4]+'\t'+'.'+'\t'+strings[6]+'\t'+ '.' +'\t'+ 'GT:TWINS:TRIO5:TRIO6' + '\t' + strings[10] + add_format + '\n' | |||
LCL7file.write(LCL7outLine) | |||
# output LCL8 | |||
LCL8outLine = strings[0]+'\t'+strings[1]+'\t'+strings[2]+'\t'+strings[3]+'\t'+strings[4]+'\t'+'.'+'\t'+strings[6]+'\t'+ '.' +'\t'+ 'GT:TWINS:TRIO5:TRIO6' + '\t' + strings[9] + add_format + '\n' | |||
LCL8file.write(LCL8outLine) | |||
# output family | |||
familyoutLine = strings[0]+'\t'+strings[1]+'\t'+strings[2]+'\t'+strings[3]+'\t'+strings[4]+'\t'+ '.'+'\t'+strings[6]+'\t'+ '.' +'\t'+ 'GT:TWINS:TRIO5:TRIO6' + '\t' + strings[14] + add_format +'\t' + strings[11] + add_format + '\t' + strings[10] + add_format +'\t' + strings[9] + add_format + '\n' | |||
familyfile.write(familyoutLine) | |||
for line in fileinput.input(inputFile): | |||
m = re.match('^\#',line) | |||
if m is not None: | |||
pass | |||
else: | |||
process(line) | |||
@@ -0,0 +1,62 @@ | |||
import sys,getopt | |||
import os | |||
import re | |||
import fileinput | |||
def usage(): | |||
print( | |||
""" | |||
Usage: python select_small_variants_supported_by_all_callsets.py -i input_merged_vcf_file -o prefix | |||
This script selects SNPs and Indels supported by all callsets. | |||
Input: | |||
-i a merged vcf file | |||
Output: | |||
-o a vcf file containd the selected SNPs and Indels | |||
""") | |||
# select supported small variants | |||
def process(oneLine): | |||
m = re.match('^\#',oneLine) | |||
if m is not None: | |||
outVCF.write(oneLine) | |||
OUTname.write(oneLine) | |||
else: | |||
line = oneLine.rstrip() | |||
strings = line.strip().split('\t') | |||
gt = [i.split(':', 1)[0] for i in strings[9:len(strings)]] | |||
if all(e == gt[0] for e in gt) and (gt[0] != '.'): | |||
# output the record to vcf | |||
outVCF.write(oneLine) | |||
else: | |||
OUTname.write(oneLine) | |||
opts,args = getopt.getopt(sys.argv[1:],"hi:o:") | |||
for op,value in opts: | |||
if op == "-i": | |||
inputFile=value | |||
elif op == "-o": | |||
prefix=value | |||
elif op == "-h": | |||
usage() | |||
sys.exit() | |||
if len(sys.argv[1:]) < 3: | |||
usage() | |||
sys.exit() | |||
VCFname = prefix + '.vcf' | |||
OUTname = prefix + '_outlier.vcf' | |||
outVCF = open(VCFname,'w') | |||
OUTname = open(OUTname,'w') | |||
for line in fileinput.input(inputFile): | |||
process(line) | |||
outVCF.close() | |||
OUTname.close() | |||
@@ -0,0 +1,36 @@ | |||
import re | |||
def position_to_bed(chromo,pos,ref,alt): | |||
# snv | |||
# Start cooridinate BED = start coordinate VCF - 1 | |||
# End cooridinate BED = start coordinate VCF | |||
if len(ref) == 1 and len(alt) == 1: | |||
StartPos = int(pos) -1 | |||
EndPos = int(pos) | |||
# deletions | |||
# Start cooridinate BED = start coordinate VCF - 1 | |||
# End cooridinate BED = start coordinate VCF + (reference length - alternate length) | |||
elif len(ref) > 1 and len(alt) == 1: | |||
StartPos = int(pos) - 1 | |||
EndPos = int(pos) + (len(ref) - 1) | |||
#insertions | |||
# For insertions: | |||
# Start cooridinate BED = start coordinate VCF - 1 | |||
# End cooridinate BED = start coordinate VCF + (alternate length - reference length) | |||
else: | |||
StartPos = int(pos) - 1 | |||
EndPos = int(pos) + (len(alt) - 1) | |||
return chromo,StartPos,EndPos | |||
def padding_region(chromo,pos1,pos2,padding): | |||
StartPos1 = pos1 - padding | |||
EndPos1 = pos1 | |||
StartPos2 = pos2 | |||
EndPos2 = pos2 + padding | |||
return chromo,StartPos1,EndPos1,StartPos2,EndPos2 |
@@ -0,0 +1 @@ | |||
#LCL5vcf LCL6vcf LCL7vcf LCL8vcf LCL5name LCL6name LCL7name LCL8name familyname |
@@ -0,0 +1,29 @@ | |||
{ | |||
"{{ project_name }}.LCL7merge.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", | |||
"{{ project_name }}.fasta": "GRCh38.d1.vd1.fa", | |||
"{{ project_name }}.LCL6familyzipIndex.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", | |||
"{{ project_name }}.LCL5VCFrename.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", | |||
"{{ project_name }}.LCL6mendelian.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/vbt:v1.1", | |||
"{{ project_name }}.mergeSister.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", | |||
"{{ project_name }}.LCL5mendelian.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/vbt:v1.1", | |||
"{{ project_name }}.disk_size": "150", | |||
"{{ project_name }}.inputSamplesFile": "{{ inputSamplesFile }}", | |||
"{{ project_name }}.LCL6merge.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", | |||
"{{ project_name }}.LCL6variantsNorm.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/bcftools:v1.9", | |||
"{{ project_name }}.LCL6zipIndex.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", | |||
"{{ project_name }}.LCL7familyzipIndex.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", | |||
"{{ project_name }}.LCL5familyzipIndex.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", | |||
"{{ project_name }}.LCL6VCFrename.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", | |||
"{{ project_name }}.LCL5merge.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", | |||
"{{ project_name }}.reformVCF.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/high_confidence_call:v1.1", | |||
"{{ project_name }}.LCL5zipIndex.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", | |||
"{{ project_name }}.cluster_config": "OnDemand bcs.a2.xlarge img-ubuntu-vpc", | |||
"{{ project_name }}.LCL8familyzipIndex.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", | |||
"{{ project_name }}.LCL7variantsNorm.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/bcftools:v1.9", | |||
"{{ project_name }}.LCL8merge.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", | |||
"{{ project_name }}.LCL5variantsNorm.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/bcftools:v1.9", | |||
"{{ project_name }}.LCL8variantsNorm.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/bcftools:v1.9", | |||
"{{ project_name }}.ref_dir": "oss://chinese-quartet/quartet-storage-data/reference_data/" | |||
} | |||
@@ -0,0 +1,23 @@ | |||
task VCFinfo { | |||
File merged_info | |||
String sample | |||
String docker | |||
String cluster_config | |||
String disk_size | |||
command <<< | |||
>>> | |||
runtime { | |||
docker:docker | |||
cluster: cluster_config | |||
systemDisk: "cloud_ssd 40" | |||
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/" | |||
} | |||
output { | |||
File extracted_info = "" | |||
} | |||
} |
@@ -0,0 +1,31 @@ | |||
task VCFrename { | |||
File trio_vcf_gz | |||
File trio_vcf_idx | |||
String mother_name | |||
String father_name | |||
String child_name | |||
String family_name | |||
String child | |||
String docker | |||
String cluster_config | |||
String disk_size | |||
command <<< | |||
echo "MOTHER ${mother_name}.${child} | |||
FATHER ${father_name}.${child} | |||
CHILD ${child_name}" > rename.txt | |||
rtg vcfannotate -i ${trio_vcf_gz} -o ${family_name}.${child}.rename.vcf.gz --relabel=rename.txt | |||
>>> | |||
runtime { | |||
docker:docker | |||
cluster: cluster_config | |||
systemDisk: "cloud_ssd 40" | |||
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/" | |||
} | |||
output { | |||
File rename_trio_vcf_gz = "${family_name}.${child}.rename.vcf.gz" | |||
File rename_trio_vcf_idx = "${family_name}.${child}.rename.vcf.gz.tbi" | |||
} | |||
} |
@@ -0,0 +1,30 @@ | |||
task indelNorm { | |||
File vcf | |||
File ref_dir | |||
String fasta | |||
String sampleName | |||
String docker | |||
String cluster_config | |||
String disk_size | |||
command <<< | |||
cat ${vcf} | grep '#' > header | |||
cat ${vcf} | grep -v '#' > body | |||
cat body | grep -w '^chr1\|^chr2\|^chr3\|^chr4\|^chr5\|^chr6\|^chr7\|^chr8\|^chr9\|^chr10\|^chr11\|^chr12\|^chr13\|^chr14\|^chr15\|^chr16\|^chr17\|^chr18\|^chr19\|^chr20\|^chr21\|^chr22\|^chrX' > body.filtered | |||
cat header body.filtered > ${sampleName}.filtered.vcf | |||
/opt/hall-lab/bcftools-1.9/bin/bcftools norm -f ${ref_dir}/${fasta} ${sampleName}.filtered.vcf > ${sampleName}.normed.vcf | |||
>>> | |||
runtime { | |||
docker:docker | |||
cluster: cluster_config | |||
systemDisk: "cloud_ssd 40" | |||
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/" | |||
} | |||
output { | |||
File normed_vcf = "${sampleName}.normed.vcf" | |||
} | |||
} |
@@ -0,0 +1,34 @@ | |||
task mendelian { | |||
File child_vcf | |||
File LCL7_vcf | |||
File LCL8_vcf | |||
String LCL7_name | |||
String LCL8_name | |||
String child_name | |||
File ref_dir | |||
String fasta | |||
String docker | |||
String cluster_config | |||
String disk_size | |||
command <<< | |||
export LD_LIBRARY_PATH=/opt/htslib-1.9 | |||
nt=$(nproc) | |||
mkdir VBT | |||
/opt/VBT-TrioAnalysis/vbt mendelian -ref ${ref_dir}/${fasta} -mother ${LCL8_vcf} -father ${LCL7_vcf} -child ${child_vcf} -outDir VBT -out-prefix ${child_name}.family --output-violation-regions -thread-count $nt | |||
cat VBT/${child_name}.family_trio.vcf > ${child_name}.family.vcf | |||
>>> | |||
runtime { | |||
docker:docker | |||
cluster: cluster_config | |||
systemDisk: "cloud_ssd 40" | |||
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/" | |||
} | |||
output { | |||
Array[File] vbt_mendelian = glob("VBT/*") | |||
File trio_vcf = "${child_name}.family.vcf" | |||
} | |||
} |
@@ -0,0 +1,27 @@ | |||
task merge { | |||
Array[File] family_vcf_gz | |||
Array[File] family_vcf_idx | |||
String sample | |||
String docker | |||
String cluster_config | |||
String disk_size | |||
command <<< | |||
rtg vcfmerge --force-merge-all --no-gzip -o ${sample}.merged.vcf ${sep=" " family_vcf_gz} | |||
cat ${sample}.merged.vcf | grep -v '#' | cut -f1-2 | sed s'/\t/_/g' | sort | uniq -c | sed 's/\s\+/\t/g' | awk '{ if ($1 != 1) { print } }' | cut -f3 > ${sample}.vcf_dup.txt | |||
>>> | |||
runtime { | |||
docker:docker | |||
cluster: cluster_config | |||
systemDisk: "cloud_ssd 40" | |||
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/" | |||
} | |||
output { | |||
File merged_vcf = "${sample}.merged.vcf" | |||
File vcf_dup = "${sample}.vcf_dup.txt" | |||
} | |||
} |
@@ -0,0 +1,34 @@ | |||
task mergeSister { | |||
File LCL5_trio_vcf_gz | |||
File LCL5_trio_vcf_idx | |||
File LCL6_trio_vcf_gz | |||
File LCL6_trio_vcf_idx | |||
String family_name | |||
String docker | |||
String cluster_config | |||
String disk_size | |||
command <<< | |||
rtg vcfmerge -o LCL5.LCL6.merged.vcf.gz ${LCL5_trio_vcf_gz} ${LCL6_trio_vcf_gz} | |||
rtg vcfmerge -o LCL6.LCL5.merged.vcf.gz ${LCL6_trio_vcf_gz} ${LCL5_trio_vcf_gz} | |||
zcat LCL5.LCL6.merged.vcf.gz | grep '##' > header | |||
zcat LCL5.LCL6.merged.vcf.gz | grep -v '##' | cut -f8 > LCL5.mendelian | |||
zcat LCL6.LCL5.merged.vcf.gz | grep -v '##' | paste - LCL5.mendelian > body | |||
cat header body > ${family_name}.trio.info.vcf | |||
>>> | |||
runtime { | |||
docker:docker | |||
cluster: cluster_config | |||
systemDisk: "cloud_ssd 40" | |||
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/" | |||
} | |||
output { | |||
File family_mendelian_info = "${family_name}.trio.info.vcf" | |||
} | |||
} |
@@ -0,0 +1,24 @@ | |||
task mergeVCFInfo { | |||
Array[File] vcf_gz | |||
Array[File] vcf_idx | |||
String sample | |||
String docker | |||
String cluster_config | |||
String disk_size | |||
command <<< | |||
rtg vcfmerge --force-merge-all --no-gzip -o ${sample}.merged.info.vcf ${sep=" " vcf_gz} | |||
>>> | |||
runtime { | |||
docker:docker | |||
cluster: cluster_config | |||
systemDisk: "cloud_ssd 40" | |||
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/" | |||
} | |||
output { | |||
File merged_info = "${sample}.merged.info.vcf" | |||
} | |||
} |
@@ -0,0 +1,39 @@ | |||
task oneClass { | |||
File snv_train_vcf | |||
File snv_test_vcf | |||
File indel_train_vcf | |||
File indel_test_vcf | |||
String sampleName = basename(snv_train_vcf,".normed.snv.train.txt") | |||
String docker | |||
String cluster_config | |||
String disk_size | |||
command <<< | |||
python /opt/oneClass.py -train ${snv_train_vcf} -test ${snv_test_vcf} -name ${sampleName}_snv | |||
python /opt/oneClass.py -train ${indel_train_vcf} -test ${indel_test_vcf} -name ${sampleName}_indel | |||
>>> | |||
runtime { | |||
docker:docker | |||
cluster: cluster_config | |||
systemDisk: "cloud_ssd 40" | |||
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/" | |||
} | |||
output { | |||
File snv_true_txt = "${sampleName}_snv_predicted_true.txt" | |||
File snv_false_txt = "${sampleName}_snv_predicted_false.txt" | |||
File snv_true_bed = "${sampleName}_snv_predicted_true.bed" | |||
File snv_false_bed = "${sampleName}_snv_predicted_false.bed" | |||
File snv_padding = "${sampleName}_snv_padding.bed" | |||
File indel_true_txt = "${sampleName}_indel_predicted_true.txt" | |||
File indel_false_txt = "${sampleName}_indel_predicted_false.txt" | |||
File indel_true_bed = "${sampleName}_indel_predicted_true.bed" | |||
File indel_false_bed = "${sampleName}_indel_predicted_false.bed" | |||
File indel_padding = "${sampleName}_indel_padding.bed" | |||
} | |||
} | |||
@@ -0,0 +1,30 @@ | |||
task reformVCF { | |||
File family_mendelian_info | |||
File family_name | |||
String docker | |||
String cluster_config | |||
String disk_size | |||
command <<< | |||
python /opt/reformVCF.py -vcf ${family_mendelian_info} -name ${family_name} | |||
>>> | |||
runtime { | |||
docker:docker | |||
cluster: cluster_config | |||
systemDisk: "cloud_ssd 40" | |||
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/" | |||
} | |||
output { | |||
File LCL5_family_info = "${family_name}.LCL5.vcf" | |||
File LCL6_family_info = "${family_name}.LCL6.vcf" | |||
File LCL7_family_info = "${family_name}.LCL7.vcf" | |||
File LCL8_family_info = "${family_name}.LCL8.vcf" | |||
File family_info = "${family_name}.vcf" | |||
} | |||
} | |||
@@ -0,0 +1,35 @@ | |||
task sister { | |||
File LCL5_vcf | |||
File LCL6_vcf | |||
File ref_dir | |||
String fasta | |||
String family_name | |||
String docker | |||
String cluster_config | |||
String disk_size | |||
command <<< | |||
export LD_LIBRARY_PATH=/opt/htslib-1.9 | |||
mkdir sister | |||
/opt/VBT-TrioAnalysis/vbt varcomp -called ${LCL5_vcf} -base ${LCL6_vcf} -ref ${ref_dir}/${fasta} -outDir sister -filter none | |||
mv sister/TPBase.vcf ${family_name}.sister.consistent.vcf | |||
mv sister/FP.vcf ${family_name}.LCL5.uniq.vcf | |||
mv sister/FN.vcf ${family_name}.LCL6.uniq.vcf | |||
mv sister/log.txt ${family_name}.sister.vbt.log.txt | |||
>>> | |||
runtime { | |||
docker:docker | |||
cluster: cluster_config | |||
systemDisk: "cloud_ssd 40" | |||
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/" | |||
} | |||
output { | |||
File sister_consistent_vcf = "${family_name}.sister.consistent.vcf" | |||
File LCL5_uniq = "${family_name}.LCL5.uniq.vcf" | |||
File LCL6_uniq = "${family_name}.LCL6.uniq.vcf" | |||
File log = "${family_name}.sister.vbt.log.txt" | |||
} | |||
} |
@@ -0,0 +1,30 @@ | |||
task variantsNorm { | |||
File vcf | |||
File ref_dir | |||
String fasta | |||
String sampleName | |||
String docker | |||
String cluster_config | |||
String disk_size | |||
command <<< | |||
cat ${vcf} | grep '#' > header | |||
cat ${vcf} | grep -v '#' > body | |||
cat body | grep -w '^chr1\|^chr2\|^chr3\|^chr4\|^chr5\|^chr6\|^chr7\|^chr8\|^chr9\|^chr10\|^chr11\|^chr12\|^chr13\|^chr14\|^chr15\|^chr16\|^chr17\|^chr18\|^chr19\|^chr20\|^chr21\|^chr22\|^chrX' > body.filtered | |||
cat header body.filtered > ${sampleName}.filtered.vcf | |||
/opt/hall-lab/bcftools-1.9/bin/bcftools norm -f ${ref_dir}/${fasta} ${sampleName}.filtered.vcf > ${sampleName}.normed.vcf | |||
>>> | |||
runtime { | |||
docker:docker | |||
cluster: cluster_config | |||
systemDisk: "cloud_ssd 40" | |||
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/" | |||
} | |||
output { | |||
File normed_vcf = "${sampleName}.normed.vcf" | |||
} | |||
} |
@@ -0,0 +1,25 @@ | |||
task votes { | |||
File merged_vcf | |||
File vcf_dup | |||
String sample | |||
String prefix | |||
String docker | |||
String cluster_config | |||
String disk_size | |||
command <<< | |||
python /opt/high_confidence_call_vote.py -vcf ${merged_vcf} -dup ${vcf_dup} -sample ${sample} -prefix ${prefix} | |||
cat ${prefix}_annotated.vcf | cut -f1-9,45 | grep -v 'filtered' | grep -v 'confirm for parents' | grep -v 'pcr-free-speicifc' | grep -v 'pcr-speicifc' | grep -v 'dupVar' > ${prefix}_bechmarking_calls.vcf | |||
>>> | |||
runtime { | |||
docker:docker | |||
cluster: cluster_config | |||
systemDisk: "cloud_ssd 40" | |||
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/" | |||
} | |||
output { | |||
File annotated_vcf = "${prefix}_annotated.vcf" | |||
File benchmark_call = "${prefix}_bechmarking_calls.vcf" | |||
} | |||
} |
@@ -0,0 +1,25 @@ | |||
task zipIndex { | |||
File vcf | |||
String sample | |||
String family_name | |||
String docker | |||
String cluster_config | |||
String disk_size | |||
command <<< | |||
rtg bgzip ${vcf} -c > ${family_name}.${sample}.vcf.gz | |||
rtg index -f vcf ${family_name}.${sample}.vcf.gz | |||
>>> | |||
runtime { | |||
docker:docker | |||
cluster: cluster_config | |||
systemDisk: "cloud_ssd 40" | |||
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/" | |||
} | |||
output { | |||
File vcf_gz = "${family_name}.${sample}.vcf.gz" | |||
File vcf_idx = "${family_name}.${sample}.vcf.gz.tbi" | |||
} | |||
} |
@@ -0,0 +1,207 @@ | |||
import "./tasks/variantsNorm.wdl" as variantsNorm | |||
import "./tasks/mendelian.wdl" as mendelian | |||
import "./tasks/zipIndex.wdl" as zipIndex | |||
import "./tasks/VCFrename.wdl" as VCFrename | |||
import "./tasks/mergeSister.wdl" as mergeSister | |||
import "./tasks/reformVCF.wdl" as reformVCF | |||
import "./tasks/merge.wdl" as merge | |||
workflow {{ project_name }} { | |||
File inputSamplesFile | |||
Array[Array[File]] inputSamples = read_tsv(inputSamplesFile) | |||
File ref_dir | |||
String fasta | |||
String cluster_config | |||
String disk_size | |||
scatter (quartet in inputSamples){ | |||
call variantsNorm.variantsNorm as LCL5variantsNorm{ | |||
input: | |||
vcf=quartet[0], | |||
ref_dir=ref_dir, | |||
fasta=fasta, | |||
sampleName=quartet[4], | |||
cluster_config=cluster_config, | |||
disk_size=disk_size | |||
} | |||
call variantsNorm.variantsNorm as LCL6variantsNorm{ | |||
input: | |||
vcf=quartet[1], | |||
ref_dir=ref_dir, | |||
fasta=fasta, | |||
sampleName=quartet[5], | |||
cluster_config=cluster_config, | |||
disk_size=disk_size | |||
} | |||
call variantsNorm.variantsNorm as LCL7variantsNorm{ | |||
input: | |||
vcf=quartet[2], | |||
ref_dir=ref_dir, | |||
fasta=fasta, | |||
sampleName=quartet[6], | |||
cluster_config=cluster_config, | |||
disk_size=disk_size | |||
} | |||
call variantsNorm.variantsNorm as LCL8variantsNorm{ | |||
input: | |||
vcf=quartet[3], | |||
ref_dir=ref_dir, | |||
fasta=fasta, | |||
sampleName=quartet[7], | |||
cluster_config=cluster_config, | |||
disk_size=disk_size | |||
} | |||
call mendelian.mendelian as LCL5mendelian { | |||
input: | |||
child_vcf=LCL5variantsNorm.normed_vcf, | |||
LCL7_vcf=LCL7variantsNorm.normed_vcf, | |||
LCL8_vcf=LCL8variantsNorm.normed_vcf, | |||
LCL7_name=quartet[6], | |||
LCL8_name=quartet[7], | |||
child_name=quartet[4], | |||
ref_dir=ref_dir, | |||
fasta=fasta, | |||
cluster_config=cluster_config, | |||
disk_size=disk_size | |||
} | |||
call mendelian.mendelian as LCL6mendelian { | |||
input: | |||
child_vcf=LCL6variantsNorm.normed_vcf, | |||
LCL7_vcf=LCL7variantsNorm.normed_vcf, | |||
LCL8_vcf=LCL8variantsNorm.normed_vcf, | |||
LCL7_name=quartet[6], | |||
LCL8_name=quartet[7], | |||
child_name=quartet[5], | |||
ref_dir=ref_dir, | |||
fasta=fasta, | |||
cluster_config=cluster_config, | |||
disk_size=disk_size | |||
} | |||
call zipIndex.zipIndex as LCL5zipIndex { | |||
input: | |||
vcf=LCL5mendelian.trio_vcf, | |||
sample="LCL5", | |||
family_name=quartet[8], | |||
cluster_config=cluster_config, | |||
disk_size=disk_size | |||
} | |||
call zipIndex.zipIndex as LCL6zipIndex { | |||
input: | |||
vcf=LCL6mendelian.trio_vcf, | |||
sample="LCL6", | |||
family_name=quartet[8], | |||
cluster_config=cluster_config, | |||
disk_size=disk_size | |||
} | |||
call VCFrename.VCFrename as LCL5VCFrename { | |||
input: | |||
trio_vcf_gz=LCL5zipIndex.vcf_gz, | |||
trio_vcf_idx=LCL5zipIndex.vcf_idx, | |||
mother_name=quartet[7], | |||
father_name=quartet[6], | |||
child_name=quartet[4], | |||
family_name=quartet[8], | |||
child="LCL5", | |||
cluster_config=cluster_config, | |||
disk_size=disk_size | |||
} | |||
call VCFrename.VCFrename as LCL6VCFrename { | |||
input: | |||
trio_vcf_gz=LCL6zipIndex.vcf_gz, | |||
trio_vcf_idx=LCL6zipIndex.vcf_idx, | |||
mother_name=quartet[7], | |||
father_name=quartet[6], | |||
child_name=quartet[5], | |||
family_name=quartet[8], | |||
child="LCL6", | |||
cluster_config=cluster_config, | |||
disk_size=disk_size | |||
} | |||
call mergeSister.mergeSister as mergeSister { | |||
input: | |||
LCL5_trio_vcf_gz=LCL5VCFrename.rename_trio_vcf_gz, | |||
LCL5_trio_vcf_idx=LCL5VCFrename.rename_trio_vcf_idx, | |||
LCL6_trio_vcf_gz=LCL6VCFrename.rename_trio_vcf_gz, | |||
LCL6_trio_vcf_idx=LCL6VCFrename.rename_trio_vcf_idx, | |||
family_name=quartet[8], | |||
cluster_config=cluster_config, | |||
disk_size=disk_size | |||
} | |||
call reformVCF.reformVCF as reformVCF { | |||
input: | |||
family_mendelian_info=mergeSister.family_mendelian_info, | |||
family_name=quartet[8], | |||
cluster_config=cluster_config, | |||
disk_size=disk_size | |||
} | |||
call zipIndex.zipIndex as LCL5familyzipIndex { | |||
input: | |||
vcf=reformVCF.LCL5_family_info, | |||
sample='LCL5', | |||
family_name=quartet[8], | |||
cluster_config=cluster_config, | |||
disk_size=disk_size | |||
} | |||
call zipIndex.zipIndex as LCL6familyzipIndex { | |||
input: | |||
vcf=reformVCF.LCL6_family_info, | |||
sample='LCL6', | |||
family_name=quartet[8], | |||
cluster_config=cluster_config, | |||
disk_size=disk_size | |||
} | |||
call zipIndex.zipIndex as LCL7familyzipIndex { | |||
input: | |||
vcf=reformVCF.LCL7_family_info, | |||
sample='LCL7', | |||
family_name=quartet[8], | |||
cluster_config=cluster_config, | |||
disk_size=disk_size | |||
} | |||
call zipIndex.zipIndex as LCL8familyzipIndex { | |||
input: | |||
vcf=reformVCF.LCL8_family_info, | |||
sample='LCL8', | |||
family_name=quartet[8], | |||
cluster_config=cluster_config, | |||
disk_size=disk_size | |||
} | |||
} | |||
call merge.merge as LCL5merge { | |||
input: | |||
family_vcf_gz=LCL5familyzipIndex.vcf_gz, | |||
family_vcf_idx=LCL5familyzipIndex.vcf_idx, | |||
sample="LCL5", | |||
cluster_config=cluster_config, | |||
disk_size=disk_size | |||
} | |||
call merge.merge as LCL6merge { | |||
input: | |||
family_vcf_gz=LCL6familyzipIndex.vcf_gz, | |||
family_vcf_idx=LCL6familyzipIndex.vcf_idx, | |||
sample="LCL6", | |||
cluster_config=cluster_config, | |||
disk_size=disk_size | |||
} | |||
call merge.merge as LCL7merge { | |||
input: | |||
family_vcf_gz=LCL7familyzipIndex.vcf_gz, | |||
family_vcf_idx=LCL7familyzipIndex.vcf_idx, | |||
sample="LCL7", | |||
cluster_config=cluster_config, | |||
disk_size=disk_size | |||
} | |||
call merge.merge as LCL8merge { | |||
input: | |||
family_vcf_gz=LCL8familyzipIndex.vcf_gz, | |||
family_vcf_idx=LCL8familyzipIndex.vcf_idx, | |||
sample="LCL8", | |||
cluster_config=cluster_config, | |||
disk_size=disk_size | |||
} | |||
} | |||