소스 검색

first commit

master
LUYAO REN 5 년 전
커밋
e5602a0858
28개의 변경된 파일1489개의 추가작업 그리고 0개의 파일을 삭제
  1. +95
    -0
      README.md
  2. BIN
      codescripts/.DS_Store
  3. +68
    -0
      codescripts/bed_for_bamReadcount.py
  4. +96
    -0
      codescripts/extract_vcf_information.py
  5. +255
    -0
      codescripts/high_confidence_call_vote.py
  6. +109
    -0
      codescripts/oneClass.py
  7. +144
    -0
      codescripts/reformVCF.py
  8. +62
    -0
      codescripts/select_small_variants_supported_by_all_callsets.py
  9. +36
    -0
      codescripts/vcf2bed.py
  10. +1
    -0
      family.tsv
  11. +29
    -0
      inputs
  12. BIN
      pictures/.DS_Store
  13. BIN
      pictures/workflow.png
  14. BIN
      tasks/.DS_Store
  15. +23
    -0
      tasks/VCFinfo.wdl
  16. +31
    -0
      tasks/VCFrename.wdl
  17. +30
    -0
      tasks/indelNorm.wdl
  18. +34
    -0
      tasks/mendelian.wdl
  19. +27
    -0
      tasks/merge.wdl
  20. +34
    -0
      tasks/mergeSister.wdl
  21. +24
    -0
      tasks/mergeVCFInfo.wdl
  22. +39
    -0
      tasks/oneClass.wdl
  23. +30
    -0
      tasks/reformVCF.wdl
  24. +35
    -0
      tasks/sister.wdl
  25. +30
    -0
      tasks/variantsNorm.wdl
  26. +25
    -0
      tasks/votes.wdl
  27. +25
    -0
      tasks/zipIndex.wdl
  28. +207
    -0
      workflow.wdl

+ 95
- 0
README.md 파일 보기

@@ -0,0 +1,95 @@
#高置信突变位点的整合

> Author: Run Luyao
>
> E-mail:18110700050@fudan.edu.cn
>
> Git:
>
> Last Updates: 18/03/2020

## 安装指南

```bash
# 激活choppy环境
source activate choppy
# 安装app
choppy install renluyao/high_confidence_calls_manuscript
```

## App概述

中华家系1号全基因组高置信small variants(SNVs和Indels)的整合流程。



## 流程与参数

1. variantsNorm

保留chr1-22,X上的突变

用bcftools norm进行突变格式的统一

2. mendelian

LCL5、LCL7和LCL8为三口之家,进行trio-analysis,分析符合孟德尔和不符合孟德尔遗传规律的突变位点

LCL6、LCL7和LCL8为三口之家,进行trio-analysis,分析符合孟德尔和不符合孟德尔遗传规律的突变位点

得到LCL5和LCL6两个家系合并的vcf文件

3. zipIndex

对LCL5和LCL6两个家系合并的文件压缩和检索引

4. VCFrename

将VBT的输出结果,VCF文件中的MOTHER FATHER CHILD改成对应的样本名

5. mergeSister

将LCL5和LCL6修改过名字后的家系VCF文件合并

6. reformVCF

根据两个三口之家和姐妹的孟德尔遗传的信息,将之前和合并的VCF分解成4个人的vcf,并且包含了家系遗传的信息

7. familyzipIndex

将上一步输出的4个文件进行压缩和检索引

8. merge

将所有注释后的LCL5 vcf文件合并

将所有注释后的LCL6 vcf文件合并

将所有注释后的LCL7 vcf文件合并

将所有注释后的LCL8 vcf文件合并

9. vote

投票选择高置信的突变位点

t

## App输入变量与输入文件



## App输出文件



## 结果展示与解读



## CHANGELOG



## FAQ


BIN
codescripts/.DS_Store 파일 보기


+ 68
- 0
codescripts/bed_for_bamReadcount.py 파일 보기

@@ -0,0 +1,68 @@
import sys,getopt
import os
import re
import fileinput

def usage():
print(
"""
Usage: python bed_for_bamReadcount.py -i input_vcf_file -o prefix

This script selects SNPs and Indels supported by all callsets.

Please notice that bam-readcount only takes in 1-based coordinates.

Input:
-i a vcf file

Output:
-o a indel bed file for bam-readcount
""")

# select supported small variants
def process(oneLine):
m = re.match('^\#',oneLine)
if m is not None:
pass
else:
line = oneLine.rstrip()
strings = line.strip().split('\t')
# convert the position to bed file for bam-readcount
# deletion
if len(strings[3]) > 1 and len(strings[4]) == 1:
pos = int(strings[1]) + 1
outline = strings[0] + '\t' + str(pos) + '\t' + str(pos) + '\t' + strings[3] + '\t' + strings[4]+'\n'
outINDEL.write(outline)
# insertion
elif len(strings[3]) == 1 and len(strings[4]) > 1 and (',' not in strings[4]):
outline = strings[0] + '\t' + strings[1] + '\t' + strings[1] + '\t' + strings[3] + '\t' + strings[4] + '\n'
outINDEL.write(outline)
else:
outMNP.write(oneLine)

opts,args = getopt.getopt(sys.argv[1:],"hi:o:")
for op,value in opts:
if op == "-i":
inputFile=value
elif op == "-o":
prefix=value
elif op == "-h":
usage()
sys.exit()

if len(sys.argv[1:]) < 3:
usage()
sys.exit()

INDELname = prefix + '.bed'
MNPname = prefix + '_MNP.txt'

outINDEL = open(INDELname,'w')
outMNP = open(MNPname,'w')

for line in fileinput.input(inputFile):
process(line)

outINDEL.close()
outMNP.close()


+ 96
- 0
codescripts/extract_vcf_information.py 파일 보기

@@ -0,0 +1,96 @@
import sys,getopt
import os
import re
import fileinput
import pandas as pd

def usage():
print(
"""
Usage: python extract_vcf_information.py -i input_merged_vcf_file -o parsed_file

This script will extract SNVs and Indels information from the vcf files and output a tab-delimited files.

Input:
-i the selected vcf file

Output:
-o tab-delimited parsed file
""")

# select supported small variants
def process(oneLine):
line = oneLine.rstrip()
strings = line.strip().split('\t')
infoParsed = parse_INFO(strings[7])
formatKeys = strings[8].split(':')
formatValues = strings[9].split(':')
for i in range(0,len(formatKeys) -1) :
if formatKeys[i] == 'AD':
ra = formatValues[i].split(',')
infoParsed['RefDP'] = ra[0]
infoParsed['AltDP'] = ra[1]
if (int(ra[1]) + int(ra[0])) != 0:
infoParsed['af'] = float(int(ra[1])/(int(ra[1]) + int(ra[0])))
else:
pass
else:
infoParsed[formatKeys[i]] = formatValues[i]
infoParsed['chromo'] = strings[0]
infoParsed['pos'] = strings[1]
infoParsed['id'] = strings[2]
infoParsed['ref'] = strings[3]
infoParsed['alt'] = strings[4]
infoParsed['qual'] = strings[5]
return infoParsed


def parse_INFO(info):
strings = info.strip().split(';')
keys = []
values = []
for i in strings:
kv = i.split('=')
if kv[0] == 'DB':
keys.append('DB')
values.append('1')
elif kv[0] == 'AF':
pass
elif kv[0] == 'POSITIVE_TRAIN_SITE':
pass
elif kv[0] == 'NEGATIVE_TRAIN_SITE':
pass
else:
keys.append(kv[0])
values.append(kv[1])
infoDict = dict(zip(keys, values))
return infoDict

opts,args = getopt.getopt(sys.argv[1:],"hi:o:")
for op,value in opts:
if op == "-i":
inputFile=value
elif op == "-o":
outputFile=value
elif op == "-h":
usage()
sys.exit()

if len(sys.argv[1:]) < 3:
usage()
sys.exit()

allDict = []
for line in fileinput.input(inputFile):
m = re.match('^\#',line)
if m is not None:
pass
else:
oneDict = process(line)
allDict.append(oneDict)

allTable = pd.DataFrame(allDict)

allTable.to_csv(outputFile,sep='\t',index=False)


+ 255
- 0
codescripts/high_confidence_call_vote.py 파일 보기

@@ -0,0 +1,255 @@
# import modules
import sys, argparse, os
import fileinput
import re
import pandas as pd
from operator import itemgetter
from collections import Counter
from itertools import islice

# input arguments
parser = argparse.ArgumentParser(description="this script is to count voting number")

parser.add_argument('-vcf', '--multi_sample_vcf', type=str, help='The VCF file you want to count the voting number', required=True)
parser.add_argument('-dup', '--dup_list', type=str, help='Duplication list', required=True)
parser.add_argument('-sample', '--sample_name', type=str, help='which sample of quartet', required=True)
parser.add_argument('-prefix', '--prefix', type=str, help='Prefix of output file name', required=True)

args = parser.parse_args()
multi_sample_vcf = args.multi_sample_vcf
dup_list = args.dup_list
prefix = args.prefix
sample_name = args.sample_name

vcf_header = '''##fileformat=VCFv4.2
##fileDate=20191224
##source=high_confidence_calls_intergration(choppy app)
##reference=GRCh38.d1.vd1
##INFO=<ID=DPCT,Number=1,Type=Float,Description="Percentage of detected votes">
##INFO=<ID=VPCT,Number=1,Type=Float,Description="Percentage of consnesus votes">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##contig=<ID=chr1,length=248956422>
##contig=<ID=chr2,length=242193529>
##contig=<ID=chr3,length=198295559>
##contig=<ID=chr4,length=190214555>
##contig=<ID=chr5,length=181538259>
##contig=<ID=chr6,length=170805979>
##contig=<ID=chr7,length=159345973>
##contig=<ID=chr8,length=145138636>
##contig=<ID=chr9,length=138394717>
##contig=<ID=chr10,length=133797422>
##contig=<ID=chr11,length=135086622>
##contig=<ID=chr12,length=133275309>
##contig=<ID=chr13,length=114364328>
##contig=<ID=chr14,length=107043718>
##contig=<ID=chr15,length=101991189>
##contig=<ID=chr16,length=90338345>
##contig=<ID=chr17,length=83257441>
##contig=<ID=chr18,length=80373285>
##contig=<ID=chr19,length=58617616>
##contig=<ID=chr20,length=64444167>
##contig=<ID=chr21,length=46709983>
##contig=<ID=chr22,length=50818468>
##contig=<ID=chrX,length=156040895>
'''

# read in duplication list
dup = pd.read_table(dup_list,header=None)
var_dup = dup[0].tolist()

# output file
file_name = prefix + '_annotated.vcf'
outfile = open(file_name,'w')

# write VCF
outputcolumn = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tQuartet_DNA_BGI_SEQ2000_BGI_1_20180518_LCL5\tQuartet_DNA_BGI_SEQ2000_BGI_2_20180530_LCL5\tQuartet_DNA_BGI_SEQ2000_BGI_3_20180530_LCL5\tQuartet_DNA_BGI_SEQ2000_WGE_1_20190402_LCL5\tQuartet_DNA_BGI_SEQ2000_WGE_2_20190402_LCL5\tQuartet_DNA_BGI_SEQ500_BGI_1_20180328_LCL5 \tQuartet_DNA_BGI_SEQ500_BGI_2_20180328_LCL5\tQuartet_DNA_BGI_SEQ500_BGI_3_20180328_LCL5\tQuartet_DNA_ILM_Nova_ARD_1_20181108_LCL5\tQuartet_DNA_ILM_Nova_ARD_2_20181108_LCL5\tQuartet_DNA_ILM_Nova_ARD_3_20181108_LCL5\tQuartet_DNA_ILM_Nova_ARD_4_20190111_LCL5\tQuartet_DNA_ILM_Nova_ARD_5_20190111_LCL5\tQuartet_DNA_ILM_Nova_ARD_6_20190111_LCL5\tQuartet_DNA_ILM_Nova_BRG_1_20171024_LCL5\tQuartet_DNA_ILM_Nova_BRG_1_20180930_LCL5\tQuartet_DNA_ILM_Nova_BRG_2_20180930_LCL5\tQuartet_DNA_ILM_Nova_BRG_3_20180930_LCL5\tQuartet_DNA_ILM_Nova_GAC_1_20171025_LCL5\tQuartet_DNA_ILM_Nova_NVG_1_20171024_LCL5\tQuartet_DNA_ILM_Nova_WUX_1_20171024_LCL5\tQuartet_DNA_ILM_XTen_ARD_1_20170403_LCL5\tQuartet_DNA_ILM_XTen_ARD_2_20170403_LCL5\tQuartet_DNA_ILM_XTen_ARD_3_20170403_LCL5\tQuartet_DNA_ILM_XTen_NVG_1_20170329_LCL5\tQuartet_DNA_ILM_XTen_NVG_2_20170329_LCL5\tQuartet_DNA_ILM_XTen_NVG_3_20170329_LCL5\tQuartet_DNA_ILM_XTen_WUX_1_20170216_LCL5\tQuartet_DNA_ILM_XTen_WUX_2_20170216_LCL5\tQuartet_DNA_ILM_XTen_WUX_3_20170216_LCL5\tQuartet_DNA_ILM_XTen_WUX_4_20180703_LCL5\tQuartet_DNA_ILM_XTen_WUX_5_20180703_LCL5\tQuartet_DNA_ILM_XTen_WUX_6_20180703_LCL5' +'\t'+ sample_name+'_pcr'+'\t' + sample_name+'_pcr-free'+ '\t'+ sample_name +'_consensus' + '\t' + sample_name + '_consensus_alt_seq' +'\n'
outfile.write(vcf_header)
outfile.write(outputcolumn)

#function

def detected_percentage(strings):
strings = [x.replace('0/0','.') for x in strings]
gt = [x.split(':')[0] for x in strings]
percentage = round((33 - gt.count('.'))/33,4)
return(str(percentage))

def vote_percentage(strings,consensus_call):
strings = [x.replace('.','0/0') for x in strings]
gt = [x.split(':')[0] for x in strings]
gt = list(map(gt_uniform,[i for i in gt]))
percentage = round(gt.count(consensus_call)/33,4)
return(str(percentage))

def family_vote(strings,consensus_call):
pass

def gt_uniform(strings):
uniformed_gt = ''
allele1 = strings.split('/')[0]
allele2 = strings.split('/')[1]
if int(allele1) > int(allele2):
uniformed_gt = allele2 + '/' + allele1
else:
uniformed_gt = allele1 + '/' + allele2
return uniformed_gt

def decide_by_rep(strings):
consensus_rep = ''
mendelian = [x[-5:] for x in strings]
strings = [x.replace('.','0/0') for x in strings]
gt = [x.split(':')[0] for x in strings]
# modified gt turn 2/1 to 1/2
gt = list(map(gt_uniform,[i for i in gt]))
# mendelian consistent?
mendelian_dict = Counter(mendelian)
highest_mendelian = mendelian_dict.most_common(1)
candidate_mendelian = highest_mendelian[0][0]
freq_mendelian = highest_mendelian[0][1]
if (candidate_mendelian == '1:1:1') and (freq_mendelian >= 2):
gt_num_dict = Counter(gt)
highest_gt = gt_num_dict.most_common(1)
candidate_gt = highest_gt[0][0]
freq_gt = highest_gt[0][1]
if (candidate_gt != '0/0') and (freq_gt >= 2):
consensus_rep = candidate_gt
elif (candidate_gt == '0/0') and (freq_gt >= 2):
consensus_rep = '0/0'
else:
consensus_rep = 'inconGT'
elif (candidate_mendelian == '.') and (freq_mendelian >= 2):
consensus_rep = 'noInfo'
else:
consensus_rep = 'inconMen'
return consensus_rep


def main():
for line in fileinput.input(multi_sample_vcf):
headline = re.match('^\#',line)
if headline is not None:
pass
else:
line = line.strip()
strings = line.split('\t')
variant_id = '_'.join([strings[0],strings[1]])
# check if the variants location is duplicated
if variant_id in var_dup:
strings[6] = 'dupVar'
outLine = '\t'.join(strings) + '\t' + '.' +'\t' + '.' + '\t' + 'dupVar' + '\t' + '.' +'\n'
outfile.write(outLine)
else:
# pre-define
pcr_consensus = ''
pcr_free_consensus = ''
consensus_call = ''
consensus_alt_seq = '.'
# pcr
pcr = itemgetter(*[9,10,11,12,14,15,16,23,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41])(strings)
SEQ2000 = decide_by_rep(pcr[0:3])
SEQ500 = decide_by_rep(pcr[4:7])
Nova = decide_by_rep(pcr[7:11])
XTen_ARD = decide_by_rep(pcr[11:14])
XTen_NVG = decide_by_rep(pcr[14:17])
XTen_WUX_1 = decide_by_rep(pcr[17:20])
XTen_WUX_2 = decide_by_rep(pcr[20:23])
sequence_site = [SEQ2000,SEQ500,Nova,XTen_ARD,XTen_NVG,XTen_WUX_1,XTen_WUX_2]
sequence_dict = Counter(sequence_site)
highest_sequence = sequence_dict.most_common(1)
candidate_sequence = highest_sequence[0][0]
freq_sequence = highest_sequence[0][1]
if freq_sequence > 4:
pcr_consensus = candidate_sequence
else:
pcr_consensus = 'inconSequenceSite'
# pcr-free
pcr_free = itemgetter(*[13,17,18,19,20,21,22,24,25,26])(strings)
#SEQ2000 = decide_by_rep(pcr_free[0])
Nova_ARD_1 = decide_by_rep(pcr_free[1:4])
Nova_ARD_2 = decide_by_rep(pcr_free[4:7])
Nova_BRG = decide_by_rep(pcr_free[7:10])
sequence_site = [SEQ2000,Nova_ARD_1,Nova_ARD_2,Nova_BRG]
highest_sequence = sequence_dict.most_common(1)
candidate_sequence = highest_sequence[0][0]
freq_sequence = highest_sequence[0][1]
if freq_sequence > 2:
pcr_free_consensus = candidate_sequence
else:
pcr_free_consensus = 'inconSequenceSite'
# pcr and pcr-free
tag = ['inconGT','noInfo','inconMen','inconSequenceSite']
if (pcr_consensus == pcr_free_consensus) and (pcr_consensus not in tag) and (pcr_consensus != '0/0'):
consensus_call = pcr_consensus
VPCT = vote_percentage(strings[9:],consensus_call)
strings[7] = 'VPCT=' + VPCT
DPCT = detected_percentage(strings[9:])
strings[7] = strings[7] + ';DPCT=' + DPCT
# Delete multiple alternative genotype to necessary expression
strings[6] = 'reproducible'
alt = strings[4]
alt_gt = alt.split(',')
if len(alt_gt) > 1:
allele1 = consensus_call.split('/')[0]
allele2 = consensus_call.split('/')[1]
if allele1 == '0':
allele2_seq = alt_gt[int(allele2) - 1]
consensus_alt_seq = allele2_seq
consensus_call = '0/1'
else:
allele1_seq = alt_gt[int(allele1) - 1]
allele2_seq = alt_gt[int(allele2) - 1]
if int(allele1) > int(allele2):
consensus_alt_seq = allele2_seq + ',' + allele1_seq
consensus_call = '1/2'
elif int(allele1) < int(allele2):
consensus_alt_seq = allele1_seq + ',' + allele2_seq
consensus_call = '1/2'
else:
consensus_alt_seq = allele1_seq
consensus_call = '1/1'
else:
consensus_alt_seq = alt
elif (pcr_consensus in tag) and (pcr_free_consensus in tag):
consensus_call = 'filtered'
strings[6] = 'filtered'
DPCT = detected_percentage(strings[9:])
strings[7] = 'DPCT=' + DPCT
elif ((pcr_consensus == '0/0') or (pcr_consensus in tag)) and ((pcr_free_consensus not in tag) and (pcr_free_consensus != '0/0')):
consensus_call = 'pcr-free-speicifc'
strings[6] = 'pcr-free-speicifc'
DPCT = detected_percentage(strings[9:])
strings[7] = 'DPCT=' + DPCT
elif ((pcr_consensus != '0/0') or (pcr_consensus not in tag)) and ((pcr_free_consensus in tag) and (pcr_free_consensus == '0/0')):
consensus_call = 'pcr-speicifc'
strings[6] = 'pcr-speicifc'
DPCT = detected_percentage(strings[9:])
strings[7] = 'DPCT=' + DPCT
elif (pcr_consensus == '0/0') and (pcr_free_consensus == '0/0'):
consensus_call = 'confirm for parents'
strings[6] = 'confirm for parents'
DPCT = detected_percentage(strings[9:])
strings[7] = 'DPCT=' + DPCT
else:
consensus_call = 'filtered'
strings[6] = 'filtered'
DPCT = detected_percentage(strings[9:])
strings[7] = 'DPCT=' + DPCT
# output
outLine = '\t'.join(strings) + '\t' + pcr_consensus +'\t' + pcr_free_consensus + '\t' + consensus_call + '\t' + consensus_alt_seq + '\n'
outfile.write(outLine)


if __name__ == '__main__':
main()













+ 109
- 0
codescripts/oneClass.py 파일 보기

@@ -0,0 +1,109 @@
# import modules
import numpy as np
import pandas as pd
from sklearn import svm
from sklearn import preprocessing
import sys, argparse, os
from vcf2bed import position_to_bed,padding_region



parser = argparse.ArgumentParser(description="this script is to preform one calss svm on each chromosome")

parser.add_argument('-train', '--trainDataset', type=str, help='training dataset generated from extracting vcf information part, with mutaitons supported by callsets', required=True)
parser.add_argument('-test', '--testDataset', type=str, help='testing dataset generated from extracting vcf information part, with mutaitons not called by all callsets', required=True)
parser.add_argument('-name', '--sampleName', type=str, help='sample name for output file name', required=True)

args = parser.parse_args()

# Rename input:
train_input = args.trainDataset
test_input = args.testDataset
sample_name = args.sampleName

# default columns, which will be included in the included in the calssifier
chromosome = ['chr1','chr2','chr3','chr4','chr5','chr6','chr7','chr8','chr9','chr10','chr11','chr12','chr13','chr14','chr15' ,'chr16','chr17','chr18','chr19','chr20','chr21','chr22','chrX','chrY']
feature_heter_cols = ['AltDP','BaseQRankSum','DB','DP','FS','GQ','MQ','MQRankSum','QD','ReadPosRankSum','RefDP','SOR','af']
feature_homo_cols = ['AltDP','DB','DP','FS','GQ','MQ','QD','RefDP','SOR','af']


# import datasets sepearate the records with or without BaseQRankSum annotation, etc.
def load_dat(dat_file_name):
dat = pd.read_table(dat_file_name)
dat['DB'] = dat['DB'].fillna(0)
dat = dat[dat['DP'] != 0]
dat['af'] = dat['AltDP']/(dat['AltDP'] + dat['RefDP'])
homo_rows = dat[dat['BaseQRankSum'].isnull()]
heter_rows = dat[dat['BaseQRankSum'].notnull()]
return homo_rows,heter_rows


train_homo,train_heter = load_dat(train_input)
test_homo,test_heter = load_dat(test_input)
clf = svm.OneClassSVM(nu=0.05,kernel='rbf', gamma='auto_deprecated',cache_size=500)

def prepare_dat(train_dat,test_dat,feature_cols,chromo):
chr_train = train_dat[train_dat['chromo'] == chromo]
chr_test = test_dat[test_dat['chromo'] == chromo]
train_dat = chr_train.loc[:,feature_cols]
test_dat = chr_test.loc[:,feature_cols]
train_dat_scaled = preprocessing.scale(train_dat)
test_dat_scaled = preprocessing.scale(test_dat)
return chr_test,train_dat_scaled,test_dat_scaled

def oneclass(X_train,X_test,chr_test):
clf.fit(X_train)
y_pred_test = clf.predict(X_test)
test_true_dat = chr_test[y_pred_test == 1]
test_false_dat = chr_test[y_pred_test == -1]
return test_true_dat,test_false_dat

predicted_true = pd.DataFrame(columns=train_homo.columns)
predicted_false = pd.DataFrame(columns=train_homo.columns)

for chromo in chromosome:
# homo datasets
chr_test_homo,X_train_homo,X_test_homo = prepare_dat(train_homo,test_homo,feature_homo_cols,chromo)
test_true_homo,test_false_homo = oneclass(X_train_homo,X_test_homo,chr_test_homo)
predicted_true = predicted_true.append(test_true_homo)
predicted_false = predicted_false.append(test_false_homo)
# heter datasets
chr_test_heter,X_train_heter,X_test_heter = prepare_dat(train_heter,test_heter,feature_heter_cols,chromo)
test_true_heter,test_false_heter = oneclass(X_train_heter,X_test_heter,chr_test_heter)
predicted_true = predicted_true.append(test_true_heter)
predicted_false = predicted_false.append(test_false_heter)

predicted_true_filename = sample_name + '_predicted_true.txt'
predicted_false_filename = sample_name + '_predicted_false.txt'

predicted_true.to_csv(predicted_true_filename,sep='\t',index=False)
predicted_false.to_csv(predicted_false_filename,sep='\t',index=False)

# output the bed file and padding bed region 50bp

predicted_true_bed_filename = sample_name + '_predicted_true.bed'
predicted_false_bed_filename = sample_name + '_predicted_false.bed'
padding_filename = sample_name + '_padding.bed'

predicted_true_bed = open(predicted_true_bed_filename,'w')
predicted_false_bed = open(predicted_false_bed_filename,'w')
padding = open(padding_filename,'w')

#
for index,row in predicted_false.iterrows():
chromo,pos1,pos2 = position_to_bed(row['chromo'],row['pos'],row['ref'],row['alt'])
outline_pos = chromo + '\t' + str(pos1) + '\t' + str(pos2) + '\n'
predicted_false_bed.write(outline_pos)
chromo,pad_pos1,pad_pos2,pad_pos3,pad_pos4 = padding_region(chromo,pos1,pos2,50)
outline_pad_1 = chromo + '\t' + str(pad_pos1) + '\t' + str(pad_pos2) + '\n'
outline_pad_2 = chromo + '\t' + str(pad_pos3) + '\t' + str(pad_pos4) + '\n'
padding.write(outline_pad_1)
padding.write(outline_pad_2)

for index,row in predicted_true.iterrows():
chromo,pos1,pos2 = position_to_bed(row['chromo'],row['pos'],row['ref'],row['alt'])
outline_pos = chromo + '\t' + str(pos1) + '\t' + str(pos2) + '\n'
predicted_true_bed.write(outline_pos)



+ 144
- 0
codescripts/reformVCF.py 파일 보기

@@ -0,0 +1,144 @@
# import modules
import sys, argparse, os
import fileinput
import re

parser = argparse.ArgumentParser(description="This script is to split samples in VCF files and rewrite to the right style")

parser.add_argument('-vcf', '--familyVCF', type=str, help='VCF with sister and mendelian infomation', required=True)
parser.add_argument('-name', '--familyName', type=str, help='Family name of the VCF file', required=True)

args = parser.parse_args()

# Rename input:
inputFile = args.familyVCF
family_name = args.familyName

# output filename
LCL5_name = family_name + '.LCL5.vcf'
LCL5file = open(LCL5_name,'w')
LCL6_name = family_name + '.LCL6.vcf'
LCL6file = open(LCL6_name,'w')
LCL7_name = family_name + '.LCL7.vcf'
LCL7file = open(LCL7_name,'w')
LCL8_name = family_name + '.LCL8.vcf'
LCL8file = open(LCL8_name,'w')
family_filename = family_name + '.vcf'
familyfile = open(family_filename,'w')

# default columns, which will be included in the included in the calssifier
vcfheader = '''##fileformat=VCFv4.2
##FILTER=<ID=PASS,Description="the same genotype between twin sister and mendelian consistent in 578 and 678">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=TWINS,Number=0,Type=Flag,Description="0 for sister consistent, 1 for sister inconsistent">
##FORMAT=<ID=TRIO5,Number=0,Type=Flag,Description="0 for trio consistent, 1 for trio inconsistent">
##FORMAT=<ID=TRIO6,Number=0,Type=Flag,Description="0 for trio consistent, 1 for trio inconsistent">
##contig=<ID=chr1,length=248956422>
##contig=<ID=chr2,length=242193529>
##contig=<ID=chr3,length=198295559>
##contig=<ID=chr4,length=190214555>
##contig=<ID=chr5,length=181538259>
##contig=<ID=chr6,length=170805979>
##contig=<ID=chr7,length=159345973>
##contig=<ID=chr8,length=145138636>
##contig=<ID=chr9,length=138394717>
##contig=<ID=chr10,length=133797422>
##contig=<ID=chr11,length=135086622>
##contig=<ID=chr12,length=133275309>
##contig=<ID=chr13,length=114364328>
##contig=<ID=chr14,length=107043718>
##contig=<ID=chr15,length=101991189>
##contig=<ID=chr16,length=90338345>
##contig=<ID=chr17,length=83257441>
##contig=<ID=chr18,length=80373285>
##contig=<ID=chr19,length=58617616>
##contig=<ID=chr20,length=64444167>
##contig=<ID=chr21,length=46709983>
##contig=<ID=chr22,length=50818468>
##contig=<ID=chrX,length=156040895>
'''
# write VCF
LCL5colname = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t'+family_name+'_LCL5'+'\n'
LCL5file.write(vcfheader)
LCL5file.write(LCL5colname)

LCL6colname = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t'+family_name+'_LCL6'+'\n'
LCL6file.write(vcfheader)
LCL6file.write(LCL6colname)

LCL7colname = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t'+family_name+'_LCL7'+'\n'
LCL7file.write(vcfheader)
LCL7file.write(LCL7colname)

LCL8colname = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t'+family_name+'_LCL8'+'\n'
LCL8file.write(vcfheader)
LCL8file.write(LCL8colname)

familycolname = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t'+'LCL5\t'+'LCL6\t'+'LCL7\t'+'LCL8'+'\n'
familyfile.write(vcfheader)
familyfile.write(familycolname)

# reform VCF
def process(oneLine):
line = oneLine.rstrip()
strings = line.strip().split('\t')
# replace .
# LCL5 uniq
if strings[11] == '.':
strings[11] = '0/0'
strings[9] = strings[12]
strings[10] = strings[13]
else:
pass
# LCL6 uniq
if strings[14] == '.':
strings[14] = '0/0'
strings[12] = strings[9]
strings[13] = strings[10]
else:
pass
# sister
if strings[11] == strings[14]:
add_format = ":1"
else:
add_format = ":0"
# trioLCL5
if strings[15] == 'MD=1':
add_format = add_format + ":1"
else:
add_format = add_format + ":0"
# trioLCL6
if strings[7] == 'MD=1':
add_format = add_format + ":1"
else:
add_format = add_format + ":0"
# filter
if (strings[11] == strings[14]) and (strings[15] == 'MD=1') and (strings[7] == 'MD=1'):
strings[6] = 'PASS'
else:
strings[6] = '.'
# output LCL5
LCL5outLine = strings[0]+'\t'+strings[1]+'\t'+strings[2]+'\t'+strings[3]+'\t'+strings[4]+'\t'+'.'+'\t'+strings[6]+'\t'+ '.' +'\t'+ 'GT:TWINS:TRIO5:TRIO6' + '\t' + strings[14] + add_format + '\n'
LCL5file.write(LCL5outLine)
# output LCL6
LCL6outLine = strings[0]+'\t'+strings[1]+'\t'+strings[2]+'\t'+strings[3]+'\t'+strings[4]+'\t'+'.'+'\t'+strings[6]+'\t'+ '.' +'\t'+ 'GT:TWINS:TRIO5:TRIO6' + '\t' + strings[11] + add_format + '\n'
LCL6file.write(LCL6outLine)
# output LCL7
LCL7outLine = strings[0]+'\t'+strings[1]+'\t'+strings[2]+'\t'+strings[3]+'\t'+strings[4]+'\t'+'.'+'\t'+strings[6]+'\t'+ '.' +'\t'+ 'GT:TWINS:TRIO5:TRIO6' + '\t' + strings[10] + add_format + '\n'
LCL7file.write(LCL7outLine)
# output LCL8
LCL8outLine = strings[0]+'\t'+strings[1]+'\t'+strings[2]+'\t'+strings[3]+'\t'+strings[4]+'\t'+'.'+'\t'+strings[6]+'\t'+ '.' +'\t'+ 'GT:TWINS:TRIO5:TRIO6' + '\t' + strings[9] + add_format + '\n'
LCL8file.write(LCL8outLine)
# output family
familyoutLine = strings[0]+'\t'+strings[1]+'\t'+strings[2]+'\t'+strings[3]+'\t'+strings[4]+'\t'+ '.'+'\t'+strings[6]+'\t'+ '.' +'\t'+ 'GT:TWINS:TRIO5:TRIO6' + '\t' + strings[14] + add_format +'\t' + strings[11] + add_format + '\t' + strings[10] + add_format +'\t' + strings[9] + add_format + '\n'
familyfile.write(familyoutLine)


for line in fileinput.input(inputFile):
m = re.match('^\#',line)
if m is not None:
pass
else:
process(line)


+ 62
- 0
codescripts/select_small_variants_supported_by_all_callsets.py 파일 보기

@@ -0,0 +1,62 @@
import sys,getopt
import os
import re
import fileinput

def usage():
print(
"""
Usage: python select_small_variants_supported_by_all_callsets.py -i input_merged_vcf_file -o prefix

This script selects SNPs and Indels supported by all callsets.

Input:
-i a merged vcf file

Output:
-o a vcf file containd the selected SNPs and Indels
""")

# select supported small variants
def process(oneLine):
m = re.match('^\#',oneLine)
if m is not None:
outVCF.write(oneLine)
OUTname.write(oneLine)
else:
line = oneLine.rstrip()
strings = line.strip().split('\t')
gt = [i.split(':', 1)[0] for i in strings[9:len(strings)]]
if all(e == gt[0] for e in gt) and (gt[0] != '.'):
# output the record to vcf
outVCF.write(oneLine)
else:
OUTname.write(oneLine)


opts,args = getopt.getopt(sys.argv[1:],"hi:o:")
for op,value in opts:
if op == "-i":
inputFile=value
elif op == "-o":
prefix=value
elif op == "-h":
usage()
sys.exit()

if len(sys.argv[1:]) < 3:
usage()
sys.exit()

VCFname = prefix + '.vcf'
OUTname = prefix + '_outlier.vcf'

outVCF = open(VCFname,'w')
OUTname = open(OUTname,'w')

for line in fileinput.input(inputFile):
process(line)

outVCF.close()
OUTname.close()


+ 36
- 0
codescripts/vcf2bed.py 파일 보기

@@ -0,0 +1,36 @@
import re

def position_to_bed(chromo,pos,ref,alt):
# snv
# Start cooridinate BED = start coordinate VCF - 1
# End cooridinate BED = start coordinate VCF

if len(ref) == 1 and len(alt) == 1:
StartPos = int(pos) -1
EndPos = int(pos)
# deletions
# Start cooridinate BED = start coordinate VCF - 1
# End cooridinate BED = start coordinate VCF + (reference length - alternate length)

elif len(ref) > 1 and len(alt) == 1:
StartPos = int(pos) - 1
EndPos = int(pos) + (len(ref) - 1)
#insertions
# For insertions:
# Start cooridinate BED = start coordinate VCF - 1
# End cooridinate BED = start coordinate VCF + (alternate length - reference length)

else:
StartPos = int(pos) - 1
EndPos = int(pos) + (len(alt) - 1)

return chromo,StartPos,EndPos

def padding_region(chromo,pos1,pos2,padding):
StartPos1 = pos1 - padding
EndPos1 = pos1
StartPos2 = pos2
EndPos2 = pos2 + padding
return chromo,StartPos1,EndPos1,StartPos2,EndPos2

+ 1
- 0
family.tsv 파일 보기

@@ -0,0 +1 @@
#LCL5vcf LCL6vcf LCL7vcf LCL8vcf LCL5name LCL6name LCL7name LCL8name familyname

+ 29
- 0
inputs 파일 보기

@@ -0,0 +1,29 @@
{
"{{ project_name }}.LCL7merge.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
"{{ project_name }}.fasta": "GRCh38.d1.vd1.fa",
"{{ project_name }}.LCL6familyzipIndex.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
"{{ project_name }}.LCL5VCFrename.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
"{{ project_name }}.LCL6mendelian.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/vbt:v1.1",
"{{ project_name }}.mergeSister.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
"{{ project_name }}.LCL5mendelian.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/vbt:v1.1",
"{{ project_name }}.disk_size": "150",
"{{ project_name }}.inputSamplesFile": "{{ inputSamplesFile }}",
"{{ project_name }}.LCL6merge.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
"{{ project_name }}.LCL6variantsNorm.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/bcftools:v1.9",
"{{ project_name }}.LCL6zipIndex.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
"{{ project_name }}.LCL7familyzipIndex.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
"{{ project_name }}.LCL5familyzipIndex.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
"{{ project_name }}.LCL6VCFrename.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
"{{ project_name }}.LCL5merge.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
"{{ project_name }}.reformVCF.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/high_confidence_call:v1.1",
"{{ project_name }}.LCL5zipIndex.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
"{{ project_name }}.cluster_config": "OnDemand bcs.a2.xlarge img-ubuntu-vpc",
"{{ project_name }}.LCL8familyzipIndex.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
"{{ project_name }}.LCL7variantsNorm.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/bcftools:v1.9",
"{{ project_name }}.LCL8merge.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
"{{ project_name }}.LCL5variantsNorm.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/bcftools:v1.9",
"{{ project_name }}.LCL8variantsNorm.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/bcftools:v1.9",
"{{ project_name }}.ref_dir": "oss://chinese-quartet/quartet-storage-data/reference_data/"
}



BIN
pictures/.DS_Store 파일 보기


BIN
pictures/workflow.png 파일 보기

Before After
Width: 2178  |  Height: 1290  |  Size: 180KB

BIN
tasks/.DS_Store 파일 보기


+ 23
- 0
tasks/VCFinfo.wdl 파일 보기

@@ -0,0 +1,23 @@
task VCFinfo {
File merged_info
String sample
String docker
String cluster_config
String disk_size
command <<<

>>>

runtime {
docker:docker
cluster: cluster_config
systemDisk: "cloud_ssd 40"
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
}
output {
File extracted_info = ""
}
}

+ 31
- 0
tasks/VCFrename.wdl 파일 보기

@@ -0,0 +1,31 @@
task VCFrename {
File trio_vcf_gz
File trio_vcf_idx
String mother_name
String father_name
String child_name
String family_name
String child
String docker
String cluster_config
String disk_size
command <<<
echo "MOTHER ${mother_name}.${child}
FATHER ${father_name}.${child}
CHILD ${child_name}" > rename.txt

rtg vcfannotate -i ${trio_vcf_gz} -o ${family_name}.${child}.rename.vcf.gz --relabel=rename.txt
>>>

runtime {
docker:docker
cluster: cluster_config
systemDisk: "cloud_ssd 40"
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
}
output {
File rename_trio_vcf_gz = "${family_name}.${child}.rename.vcf.gz"
File rename_trio_vcf_idx = "${family_name}.${child}.rename.vcf.gz.tbi"
}
}

+ 30
- 0
tasks/indelNorm.wdl 파일 보기

@@ -0,0 +1,30 @@
task indelNorm {
File vcf
File ref_dir
String fasta
String sampleName
String docker
String cluster_config
String disk_size
command <<<

cat ${vcf} | grep '#' > header
cat ${vcf} | grep -v '#' > body
cat body | grep -w '^chr1\|^chr2\|^chr3\|^chr4\|^chr5\|^chr6\|^chr7\|^chr8\|^chr9\|^chr10\|^chr11\|^chr12\|^chr13\|^chr14\|^chr15\|^chr16\|^chr17\|^chr18\|^chr19\|^chr20\|^chr21\|^chr22\|^chrX' > body.filtered
cat header body.filtered > ${sampleName}.filtered.vcf

/opt/hall-lab/bcftools-1.9/bin/bcftools norm -f ${ref_dir}/${fasta} ${sampleName}.filtered.vcf > ${sampleName}.normed.vcf

>>>

runtime {
docker:docker
cluster: cluster_config
systemDisk: "cloud_ssd 40"
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
}
output {
File normed_vcf = "${sampleName}.normed.vcf"
}
}

+ 34
- 0
tasks/mendelian.wdl 파일 보기

@@ -0,0 +1,34 @@
task mendelian {
File child_vcf
File LCL7_vcf
File LCL8_vcf
String LCL7_name
String LCL8_name
String child_name
File ref_dir
String fasta
String docker
String cluster_config
String disk_size
command <<<
export LD_LIBRARY_PATH=/opt/htslib-1.9
nt=$(nproc)
mkdir VBT

/opt/VBT-TrioAnalysis/vbt mendelian -ref ${ref_dir}/${fasta} -mother ${LCL8_vcf} -father ${LCL7_vcf} -child ${child_vcf} -outDir VBT -out-prefix ${child_name}.family --output-violation-regions -thread-count $nt

cat VBT/${child_name}.family_trio.vcf > ${child_name}.family.vcf
>>>

runtime {
docker:docker
cluster: cluster_config
systemDisk: "cloud_ssd 40"
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
}
output {
Array[File] vbt_mendelian = glob("VBT/*")
File trio_vcf = "${child_name}.family.vcf"
}
}

+ 27
- 0
tasks/merge.wdl 파일 보기

@@ -0,0 +1,27 @@
task merge {
Array[File] family_vcf_gz
Array[File] family_vcf_idx
String sample
String docker
String cluster_config
String disk_size
command <<<

rtg vcfmerge --force-merge-all --no-gzip -o ${sample}.merged.vcf ${sep=" " family_vcf_gz}

cat ${sample}.merged.vcf | grep -v '#' | cut -f1-2 | sed s'/\t/_/g' | sort | uniq -c | sed 's/\s\+/\t/g' | awk '{ if ($1 != 1) { print } }' | cut -f3 > ${sample}.vcf_dup.txt

>>>

runtime {
docker:docker
cluster: cluster_config
systemDisk: "cloud_ssd 40"
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
}
output {
File merged_vcf = "${sample}.merged.vcf"
File vcf_dup = "${sample}.vcf_dup.txt"
}
}

+ 34
- 0
tasks/mergeSister.wdl 파일 보기

@@ -0,0 +1,34 @@
task mergeSister {
File LCL5_trio_vcf_gz
File LCL5_trio_vcf_idx
File LCL6_trio_vcf_gz
File LCL6_trio_vcf_idx
String family_name
String docker
String cluster_config
String disk_size
command <<<
rtg vcfmerge -o LCL5.LCL6.merged.vcf.gz ${LCL5_trio_vcf_gz} ${LCL6_trio_vcf_gz}

rtg vcfmerge -o LCL6.LCL5.merged.vcf.gz ${LCL6_trio_vcf_gz} ${LCL5_trio_vcf_gz}

zcat LCL5.LCL6.merged.vcf.gz | grep '##' > header
zcat LCL5.LCL6.merged.vcf.gz | grep -v '##' | cut -f8 > LCL5.mendelian
zcat LCL6.LCL5.merged.vcf.gz | grep -v '##' | paste - LCL5.mendelian > body

cat header body > ${family_name}.trio.info.vcf
>>>

runtime {
docker:docker
cluster: cluster_config
systemDisk: "cloud_ssd 40"
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
}

output {
File family_mendelian_info = "${family_name}.trio.info.vcf"
}

}

+ 24
- 0
tasks/mergeVCFInfo.wdl 파일 보기

@@ -0,0 +1,24 @@
task mergeVCFInfo {
Array[File] vcf_gz
Array[File] vcf_idx
String sample
String docker
String cluster_config
String disk_size
command <<<

rtg vcfmerge --force-merge-all --no-gzip -o ${sample}.merged.info.vcf ${sep=" " vcf_gz}
>>>

runtime {
docker:docker
cluster: cluster_config
systemDisk: "cloud_ssd 40"
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
}
output {
File merged_info = "${sample}.merged.info.vcf"
}
}

+ 39
- 0
tasks/oneClass.wdl 파일 보기

@@ -0,0 +1,39 @@
task oneClass {
File snv_train_vcf
File snv_test_vcf
File indel_train_vcf
File indel_test_vcf
String sampleName = basename(snv_train_vcf,".normed.snv.train.txt")
String docker
String cluster_config
String disk_size
command <<<

python /opt/oneClass.py -train ${snv_train_vcf} -test ${snv_test_vcf} -name ${sampleName}_snv

python /opt/oneClass.py -train ${indel_train_vcf} -test ${indel_test_vcf} -name ${sampleName}_indel

>>>

runtime {
docker:docker
cluster: cluster_config
systemDisk: "cloud_ssd 40"
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
}

output {
File snv_true_txt = "${sampleName}_snv_predicted_true.txt"
File snv_false_txt = "${sampleName}_snv_predicted_false.txt"
File snv_true_bed = "${sampleName}_snv_predicted_true.bed"
File snv_false_bed = "${sampleName}_snv_predicted_false.bed"
File snv_padding = "${sampleName}_snv_padding.bed"
File indel_true_txt = "${sampleName}_indel_predicted_true.txt"
File indel_false_txt = "${sampleName}_indel_predicted_false.txt"
File indel_true_bed = "${sampleName}_indel_predicted_true.bed"
File indel_false_bed = "${sampleName}_indel_predicted_false.bed"
File indel_padding = "${sampleName}_indel_padding.bed"
}
}


+ 30
- 0
tasks/reformVCF.wdl 파일 보기

@@ -0,0 +1,30 @@
task reformVCF {
File family_mendelian_info
File family_name
String docker
String cluster_config
String disk_size
command <<<

python /opt/reformVCF.py -vcf ${family_mendelian_info} -name ${family_name}

>>>

runtime {
docker:docker
cluster: cluster_config
systemDisk: "cloud_ssd 40"
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
}

output {
File LCL5_family_info = "${family_name}.LCL5.vcf"
File LCL6_family_info = "${family_name}.LCL6.vcf"
File LCL7_family_info = "${family_name}.LCL7.vcf"
File LCL8_family_info = "${family_name}.LCL8.vcf"
File family_info = "${family_name}.vcf"
}
}


+ 35
- 0
tasks/sister.wdl 파일 보기

@@ -0,0 +1,35 @@
task sister {
File LCL5_vcf
File LCL6_vcf
File ref_dir
String fasta
String family_name
String docker
String cluster_config
String disk_size
command <<<
export LD_LIBRARY_PATH=/opt/htslib-1.9

mkdir sister
/opt/VBT-TrioAnalysis/vbt varcomp -called ${LCL5_vcf} -base ${LCL6_vcf} -ref ${ref_dir}/${fasta} -outDir sister -filter none

mv sister/TPBase.vcf ${family_name}.sister.consistent.vcf
mv sister/FP.vcf ${family_name}.LCL5.uniq.vcf
mv sister/FN.vcf ${family_name}.LCL6.uniq.vcf
mv sister/log.txt ${family_name}.sister.vbt.log.txt
>>>

runtime {
docker:docker
cluster: cluster_config
systemDisk: "cloud_ssd 40"
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
}
output {
File sister_consistent_vcf = "${family_name}.sister.consistent.vcf"
File LCL5_uniq = "${family_name}.LCL5.uniq.vcf"
File LCL6_uniq = "${family_name}.LCL6.uniq.vcf"
File log = "${family_name}.sister.vbt.log.txt"
}
}

+ 30
- 0
tasks/variantsNorm.wdl 파일 보기

@@ -0,0 +1,30 @@
task variantsNorm {
File vcf
File ref_dir
String fasta
String sampleName
String docker
String cluster_config
String disk_size
command <<<

cat ${vcf} | grep '#' > header
cat ${vcf} | grep -v '#' > body
cat body | grep -w '^chr1\|^chr2\|^chr3\|^chr4\|^chr5\|^chr6\|^chr7\|^chr8\|^chr9\|^chr10\|^chr11\|^chr12\|^chr13\|^chr14\|^chr15\|^chr16\|^chr17\|^chr18\|^chr19\|^chr20\|^chr21\|^chr22\|^chrX' > body.filtered
cat header body.filtered > ${sampleName}.filtered.vcf

/opt/hall-lab/bcftools-1.9/bin/bcftools norm -f ${ref_dir}/${fasta} ${sampleName}.filtered.vcf > ${sampleName}.normed.vcf

>>>

runtime {
docker:docker
cluster: cluster_config
systemDisk: "cloud_ssd 40"
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
}
output {
File normed_vcf = "${sampleName}.normed.vcf"
}
}

+ 25
- 0
tasks/votes.wdl 파일 보기

@@ -0,0 +1,25 @@
task votes {
File merged_vcf
File vcf_dup
String sample
String prefix
String docker
String cluster_config
String disk_size
command <<<
python /opt/high_confidence_call_vote.py -vcf ${merged_vcf} -dup ${vcf_dup} -sample ${sample} -prefix ${prefix}
cat ${prefix}_annotated.vcf | cut -f1-9,45 | grep -v 'filtered' | grep -v 'confirm for parents' | grep -v 'pcr-free-speicifc' | grep -v 'pcr-speicifc' | grep -v 'dupVar' > ${prefix}_bechmarking_calls.vcf
>>>

runtime {
docker:docker
cluster: cluster_config
systemDisk: "cloud_ssd 40"
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
}
output {
File annotated_vcf = "${prefix}_annotated.vcf"
File benchmark_call = "${prefix}_bechmarking_calls.vcf"
}
}

+ 25
- 0
tasks/zipIndex.wdl 파일 보기

@@ -0,0 +1,25 @@
task zipIndex {
File vcf
String sample
String family_name
String docker
String cluster_config
String disk_size
command <<<
rtg bgzip ${vcf} -c > ${family_name}.${sample}.vcf.gz
rtg index -f vcf ${family_name}.${sample}.vcf.gz

>>>

runtime {
docker:docker
cluster: cluster_config
systemDisk: "cloud_ssd 40"
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
}
output {
File vcf_gz = "${family_name}.${sample}.vcf.gz"
File vcf_idx = "${family_name}.${sample}.vcf.gz.tbi"
}
}

+ 207
- 0
workflow.wdl 파일 보기

@@ -0,0 +1,207 @@
import "./tasks/variantsNorm.wdl" as variantsNorm
import "./tasks/mendelian.wdl" as mendelian
import "./tasks/zipIndex.wdl" as zipIndex
import "./tasks/VCFrename.wdl" as VCFrename
import "./tasks/mergeSister.wdl" as mergeSister
import "./tasks/reformVCF.wdl" as reformVCF
import "./tasks/merge.wdl" as merge

workflow {{ project_name }} {
File inputSamplesFile
Array[Array[File]] inputSamples = read_tsv(inputSamplesFile)
File ref_dir
String fasta
String cluster_config
String disk_size

scatter (quartet in inputSamples){
call variantsNorm.variantsNorm as LCL5variantsNorm{
input:
vcf=quartet[0],
ref_dir=ref_dir,
fasta=fasta,
sampleName=quartet[4],
cluster_config=cluster_config,
disk_size=disk_size
}
call variantsNorm.variantsNorm as LCL6variantsNorm{
input:
vcf=quartet[1],
ref_dir=ref_dir,
fasta=fasta,
sampleName=quartet[5],
cluster_config=cluster_config,
disk_size=disk_size
}
call variantsNorm.variantsNorm as LCL7variantsNorm{
input:
vcf=quartet[2],
ref_dir=ref_dir,
fasta=fasta,
sampleName=quartet[6],
cluster_config=cluster_config,
disk_size=disk_size
}
call variantsNorm.variantsNorm as LCL8variantsNorm{
input:
vcf=quartet[3],
ref_dir=ref_dir,
fasta=fasta,
sampleName=quartet[7],
cluster_config=cluster_config,
disk_size=disk_size
}
call mendelian.mendelian as LCL5mendelian {
input:
child_vcf=LCL5variantsNorm.normed_vcf,
LCL7_vcf=LCL7variantsNorm.normed_vcf,
LCL8_vcf=LCL8variantsNorm.normed_vcf,
LCL7_name=quartet[6],
LCL8_name=quartet[7],
child_name=quartet[4],
ref_dir=ref_dir,
fasta=fasta,
cluster_config=cluster_config,
disk_size=disk_size
}
call mendelian.mendelian as LCL6mendelian {
input:
child_vcf=LCL6variantsNorm.normed_vcf,
LCL7_vcf=LCL7variantsNorm.normed_vcf,
LCL8_vcf=LCL8variantsNorm.normed_vcf,
LCL7_name=quartet[6],
LCL8_name=quartet[7],
child_name=quartet[5],
ref_dir=ref_dir,
fasta=fasta,
cluster_config=cluster_config,
disk_size=disk_size
}
call zipIndex.zipIndex as LCL5zipIndex {
input:
vcf=LCL5mendelian.trio_vcf,
sample="LCL5",
family_name=quartet[8],
cluster_config=cluster_config,
disk_size=disk_size
}
call zipIndex.zipIndex as LCL6zipIndex {
input:
vcf=LCL6mendelian.trio_vcf,
sample="LCL6",
family_name=quartet[8],
cluster_config=cluster_config,
disk_size=disk_size
}
call VCFrename.VCFrename as LCL5VCFrename {
input:
trio_vcf_gz=LCL5zipIndex.vcf_gz,
trio_vcf_idx=LCL5zipIndex.vcf_idx,
mother_name=quartet[7],
father_name=quartet[6],
child_name=quartet[4],
family_name=quartet[8],
child="LCL5",
cluster_config=cluster_config,
disk_size=disk_size
}
call VCFrename.VCFrename as LCL6VCFrename {
input:
trio_vcf_gz=LCL6zipIndex.vcf_gz,
trio_vcf_idx=LCL6zipIndex.vcf_idx,
mother_name=quartet[7],
father_name=quartet[6],
child_name=quartet[5],
family_name=quartet[8],
child="LCL6",
cluster_config=cluster_config,
disk_size=disk_size
}
call mergeSister.mergeSister as mergeSister {
input:
LCL5_trio_vcf_gz=LCL5VCFrename.rename_trio_vcf_gz,
LCL5_trio_vcf_idx=LCL5VCFrename.rename_trio_vcf_idx,
LCL6_trio_vcf_gz=LCL6VCFrename.rename_trio_vcf_gz,
LCL6_trio_vcf_idx=LCL6VCFrename.rename_trio_vcf_idx,
family_name=quartet[8],
cluster_config=cluster_config,
disk_size=disk_size
}
call reformVCF.reformVCF as reformVCF {
input:
family_mendelian_info=mergeSister.family_mendelian_info,
family_name=quartet[8],
cluster_config=cluster_config,
disk_size=disk_size
}
call zipIndex.zipIndex as LCL5familyzipIndex {
input:
vcf=reformVCF.LCL5_family_info,
sample='LCL5',
family_name=quartet[8],
cluster_config=cluster_config,
disk_size=disk_size
}
call zipIndex.zipIndex as LCL6familyzipIndex {
input:
vcf=reformVCF.LCL6_family_info,
sample='LCL6',
family_name=quartet[8],
cluster_config=cluster_config,
disk_size=disk_size
}
call zipIndex.zipIndex as LCL7familyzipIndex {
input:
vcf=reformVCF.LCL7_family_info,
sample='LCL7',
family_name=quartet[8],
cluster_config=cluster_config,
disk_size=disk_size
}
call zipIndex.zipIndex as LCL8familyzipIndex {
input:
vcf=reformVCF.LCL8_family_info,
sample='LCL8',
family_name=quartet[8],
cluster_config=cluster_config,
disk_size=disk_size
}
}
call merge.merge as LCL5merge {
input:
family_vcf_gz=LCL5familyzipIndex.vcf_gz,
family_vcf_idx=LCL5familyzipIndex.vcf_idx,
sample="LCL5",
cluster_config=cluster_config,
disk_size=disk_size
}
call merge.merge as LCL6merge {
input:
family_vcf_gz=LCL6familyzipIndex.vcf_gz,
family_vcf_idx=LCL6familyzipIndex.vcf_idx,
sample="LCL6",
cluster_config=cluster_config,
disk_size=disk_size
}
call merge.merge as LCL7merge {
input:
family_vcf_gz=LCL7familyzipIndex.vcf_gz,
family_vcf_idx=LCL7familyzipIndex.vcf_idx,
sample="LCL7",
cluster_config=cluster_config,
disk_size=disk_size
}
call merge.merge as LCL8merge {
input:
family_vcf_gz=LCL8familyzipIndex.vcf_gz,
family_vcf_idx=LCL8familyzipIndex.vcf_idx,
sample="LCL8",
cluster_config=cluster_config,
disk_size=disk_size
}
}


Loading…
취소
저장