> | > | ||||
> E-mail:18110700050@fudan.edu.cn | > E-mail:18110700050@fudan.edu.cn | ||||
> | > | ||||
> Git:http://choppy.3steps.cn/renluyao/high_confidence_calls_intergration.git | |||||
> Git:http://choppy.3steps.cn/renluyao/high_confidence_calls_manuscript.git | |||||
> | > | ||||
> Last Updates: 12/6/2019 | |||||
> Last Updates: 18/03/2020 | |||||
## 安装指南 | ## 安装指南 | ||||
# 激活choppy环境 | # 激活choppy环境 | ||||
source activate choppy | source activate choppy | ||||
# 安装app | # 安装app | ||||
choppy install renluyao/high_confidence_calls_intergration | |||||
choppy install renluyao/high_confidence_calls_manuscript | |||||
``` | ``` | ||||
## App概述 | ## App概述 | ||||
中华家系1号全基因组高置信small variants(SNVs和Indels)的整合流程。 | 中华家系1号全基因组高置信small variants(SNVs和Indels)的整合流程。 | ||||
Quartet家系的四个样本的DNA标准物质被送往多加测序公司,用PCRfree或者PCR的建库方法建立3个技术重复,用多个测序平台(Illumina XTen、Illumina Novaseq、BGISEQ-500、BGISEQ-2000和BGISEQ-T7)进行测序。获得的原始数据用多个分析流程进行分析(比对软件:BWA-mem、Novoalign、Bowtie2,突变检出软件:HaplotyperCaller、Strelka2、FreeBayes)。 | |||||
经过以上各种因素的组合每个样本得到216个VCF文件,本APP的目的是整合这些VCF文件的结果得到一组可信度较高的突变。 | |||||
## 流程与参数 | ## 流程与参数 | ||||
####1. variantsNorm | |||||
保留chr1-22,X上的突变 | |||||
用bcftools norm进行突变格式的统一 | |||||
####2. mendelian | |||||
LCL5、LCL7和LCL8为三口之家,进行trio-analysis,分析符合孟德尔和不符合孟德尔遗传规律的突变位点 | |||||
LCL6、LCL7和LCL8为三口之家,进行trio-analysis,分析符合孟德尔和不符合孟德尔遗传规律的突变位点 | |||||
得到LCL5和LCL6两个家系合并的vcf文件 | |||||
####3. zipIndex | |||||
对LCL5和LCL6两个家系合并的文件压缩和检索引 | |||||
####4. VCFrename | |||||
将VBT的输出结果,VCF文件中的MOTHER FATHER CHILD改成对应的样本名 | |||||
####5. mergeSister | |||||
将LCL5和LCL6修改过名字后的家系VCF文件合并 | |||||
####6. reformVCF | |||||
根据两个三口之家和姐妹的孟德尔遗传的信息,将之前和合并的VCF分解成4个人的vcf,并且包含了家系遗传的信息 | |||||
####7. familyzipIndex | |||||
将上一步输出的4个文件进行压缩和检索引 | |||||
####8. merge | |||||
将所有注释后的LCL5 vcf文件合并 | |||||
将所有注释后的LCL6 vcf文件合并 | |||||
将所有注释后的LCL7 vcf文件合并 | |||||
将所有注释后的LCL8 vcf文件合并 | |||||
####9. vote | |||||
投票选择高置信的突变位点 | |||||
t | |||||
## App输入变量与输入文件 | ## App输入变量与输入文件 | ||||
inputSamplesFile的格式如下 | |||||
```bash | |||||
#LCL5_VCF #LCL6_VCF #LCL7_VCF #LCL8_VCF #LCL5_sampleName #LCL6_sampleName #LCL7_sampleName #LCL8_sampleName #familyName | |||||
``` | |||||
最终版的整合文件包括: | |||||
## App输出文件 | ## App输出文件 |
import sys,getopt | |||||
import os | |||||
import re | |||||
import fileinput | |||||
def usage(): | |||||
print( | |||||
""" | |||||
Usage: python bed_for_bamReadcount.py -i input_vcf_file -o prefix | |||||
This script selects SNPs and Indels supported by all callsets. | |||||
Please notice that bam-readcount only takes in 1-based coordinates. | |||||
Input: | |||||
-i a vcf file | |||||
Output: | |||||
-o a indel bed file for bam-readcount | |||||
""") | |||||
# select supported small variants | |||||
def process(oneLine): | |||||
m = re.match('^\#',oneLine) | |||||
if m is not None: | |||||
pass | |||||
else: | |||||
line = oneLine.rstrip() | |||||
strings = line.strip().split('\t') | |||||
# convert the position to bed file for bam-readcount | |||||
# deletion | |||||
if len(strings[3]) > 1 and len(strings[4]) == 1: | |||||
pos = int(strings[1]) + 1 | |||||
outline = strings[0] + '\t' + str(pos) + '\t' + str(pos) + '\t' + strings[3] + '\t' + strings[4]+'\n' | |||||
outINDEL.write(outline) | |||||
# insertion | |||||
elif len(strings[3]) == 1 and len(strings[4]) > 1 and (',' not in strings[4]): | |||||
outline = strings[0] + '\t' + strings[1] + '\t' + strings[1] + '\t' + strings[3] + '\t' + strings[4] + '\n' | |||||
outINDEL.write(outline) | |||||
else: | |||||
outMNP.write(oneLine) | |||||
opts,args = getopt.getopt(sys.argv[1:],"hi:o:") | |||||
for op,value in opts: | |||||
if op == "-i": | |||||
inputFile=value | |||||
elif op == "-o": | |||||
prefix=value | |||||
elif op == "-h": | |||||
usage() | |||||
sys.exit() | |||||
if len(sys.argv[1:]) < 3: | |||||
usage() | |||||
sys.exit() | |||||
INDELname = prefix + '.bed' | |||||
MNPname = prefix + '_MNP.txt' | |||||
outINDEL = open(INDELname,'w') | |||||
outMNP = open(MNPname,'w') | |||||
for line in fileinput.input(inputFile): | |||||
process(line) | |||||
outINDEL.close() | |||||
outMNP.close() | |||||
values.append('1') | values.append('1') | ||||
elif kv[0] == 'AF': | elif kv[0] == 'AF': | ||||
pass | pass | ||||
elif kv[0] == 'POSITIVE_TRAIN_SITE': | |||||
pass | |||||
elif kv[0] == 'NEGATIVE_TRAIN_SITE': | |||||
pass | |||||
else: | else: | ||||
keys.append(kv[0]) | keys.append(kv[0]) | ||||
values.append(kv[1]) | values.append(kv[1]) |
# import modules | |||||
from __future__ import division | |||||
import sys, argparse, os | import sys, argparse, os | ||||
import fileinput | import fileinput | ||||
import re | import re | ||||
import pandas as pd | import pandas as pd | ||||
from operator import itemgetter | from operator import itemgetter | ||||
from collections import Counter | from collections import Counter | ||||
from itertools import islice | |||||
from itertools import islice | |||||
from numpy import * | |||||
import statistics | |||||
# input arguments | # input arguments | ||||
parser = argparse.ArgumentParser(description="this script is to count voting number") | parser = argparse.ArgumentParser(description="this script is to count voting number") | ||||
sample_name = args.sample_name | sample_name = args.sample_name | ||||
vcf_header = '''##fileformat=VCFv4.2 | vcf_header = '''##fileformat=VCFv4.2 | ||||
##fileDate=20191224 | |||||
##fileDate=20200331 | |||||
##source=high_confidence_calls_intergration(choppy app) | ##source=high_confidence_calls_intergration(choppy app) | ||||
##reference=GRCh38.d1.vd1 | ##reference=GRCh38.d1.vd1 | ||||
##INFO=<ID=PCT,Number=1,Type=Float,Description="Percentage of votes"> | |||||
##INFO=<ID=location,Number=1,Type=String,Description="Repeat region"> | |||||
##INFO=<ID=DETECTED,Number=1,Type=Integer,Description="Number of detected votes"> | |||||
##INFO=<ID=VOTED,Number=1,Type=Integer,Description="Number of consnesus votes"> | |||||
##INFO=<ID=FAM,Number=1,Type=Integer,Description="Number mendelian consisitent votes"> | |||||
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype"> | ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype"> | ||||
##FORMAT=<ID=TWINS,Number=0,Type=Flag,Description="0 for sister consistent, 1 for sister inconsistent"> | |||||
##FORMAT=<ID=TRIO5,Number=0,Type=Flag,Description="0 for trio consistent, 1 for trio inconsistent"> | |||||
##FORMAT=<ID=TRIO6,Number=0,Type=Flag,Description="0 for trio consistent, 1 for trio inconsistent"> | |||||
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Sum depth of all samples"> | |||||
##FORMAT=<ID=ALT,Number=1,Type=Integer,Description="Sum alternative depth of all samples"> | |||||
##FORMAT=<ID=AF,Number=1,Type=Float,Description="Allele frequency, sum alternative depth / sum depth"> | |||||
##FORMAT=<ID=GQ,Number=1,Type=Float,Description="Average genotype quality"> | |||||
##FORMAT=<ID=QD,Number=1,Type=Float,Description="Average Variant Confidence/Quality by Depth"> | |||||
##FORMAT=<ID=MQ,Number=1,Type=Float,Description="Average mapping quality"> | |||||
##FORMAT=<ID=FS,Number=1,Type=Float,Description="Average Phred-scaled p-value using Fisher's exact test to detect strand bias"> | |||||
##FORMAT=<ID=QUALI,Number=1,Type=Float,Description="Average variant quality"> | |||||
##contig=<ID=chr1,length=248956422> | ##contig=<ID=chr1,length=248956422> | ||||
##contig=<ID=chr2,length=242193529> | ##contig=<ID=chr2,length=242193529> | ||||
##contig=<ID=chr3,length=198295559> | ##contig=<ID=chr3,length=198295559> | ||||
##contig=<ID=chrX,length=156040895> | ##contig=<ID=chrX,length=156040895> | ||||
''' | ''' | ||||
vcf_header_all_sample = '''##fileformat=VCFv4.2 | |||||
##fileDate=20200331 | |||||
##reference=GRCh38.d1.vd1 | |||||
##INFO=<ID=location,Number=1,Type=String,Description="Repeat region"> | |||||
##INFO=<ID=DUP,Number=1,Type=Flag,Description="Duplicated variant records"> | |||||
##INFO=<ID=DETECTED,Number=1,Type=Integer,Description="Number of detected votes"> | |||||
##INFO=<ID=VOTED,Number=1,Type=Integer,Description="Number of consnesus votes"> | |||||
##INFO=<ID=FAM,Number=1,Type=Integer,Description="Number mendelian consisitent votes"> | |||||
##INFO=<ID=ALL_ALT,Number=1,Type=Float,Description="Sum of alternative reads of all samples"> | |||||
##INFO=<ID=ALL_DP,Number=1,Type=Float,Description="Sum of depth of all samples"> | |||||
##INFO=<ID=ALL_AF,Number=1,Type=Float,Description="Allele frequency of net alternatice reads and net depth"> | |||||
##INFO=<ID=GQ_MEAN,Number=1,Type=Float,Description="Mean of genotype quality of all samples"> | |||||
##INFO=<ID=QD_MEAN,Number=1,Type=Float,Description="Average Variant Confidence/Quality by Depth"> | |||||
##INFO=<ID=MQ_MEAN,Number=1,Type=Float,Description="Mean of mapping quality of all samples"> | |||||
##INFO=<ID=FS_MEAN,Number=1,Type=Float,Description="Average Phred-scaled p-value using Fisher's exact test to detect strand bias"> | |||||
##INFO=<ID=QUAL_MEAN,Number=1,Type=Float,Description="Average variant quality"> | |||||
##INFO=<ID=PCR,Number=1,Type=String,Description="Consensus of PCR votes"> | |||||
##INFO=<ID=PCR_FREE,Number=1,Type=String,Description="Consensus of PCR-free votes"> | |||||
##INFO=<ID=CONSENSUS,Number=1,Type=String,Description="Consensus calls"> | |||||
##INFO=<ID=CONSENSUS_SEQ,Number=1,Type=String,Description="Consensus sequence"> | |||||
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype"> | |||||
##FORMAT=<ID=DP,Number=1,Type=String,Description="Depth"> | |||||
##FORMAT=<ID=ALT,Number=1,Type=Integer,Description="Alternative Depth"> | |||||
##FORMAT=<ID=AF,Number=1,Type=String,Description="Allele frequency"> | |||||
##FORMAT=<ID=GQ,Number=1,Type=String,Description="Genotype quality"> | |||||
##FORMAT=<ID=MQ,Number=1,Type=String,Description="Mapping quality"> | |||||
##FORMAT=<ID=TWINS,Number=1,Type=String,Description="1 is twins shared, 0 is twins discordant "> | |||||
##FORMAT=<ID=TRIO5,Number=1,Type=String,Description="1 is LCL7, LCL8 and LCL5 mendelian consistent, 0 is mendelian vioaltion"> | |||||
##FORMAT=<ID=TRIO6,Number=1,Type=String,Description="1 is LCL7, LCL8 and LCL6 mendelian consistent, 0 is mendelian vioaltion"> | |||||
##contig=<ID=chr1,length=248956422> | |||||
##contig=<ID=chr2,length=242193529> | |||||
##contig=<ID=chr3,length=198295559> | |||||
##contig=<ID=chr4,length=190214555> | |||||
##contig=<ID=chr5,length=181538259> | |||||
##contig=<ID=chr6,length=170805979> | |||||
##contig=<ID=chr7,length=159345973> | |||||
##contig=<ID=chr8,length=145138636> | |||||
##contig=<ID=chr9,length=138394717> | |||||
##contig=<ID=chr10,length=133797422> | |||||
##contig=<ID=chr11,length=135086622> | |||||
##contig=<ID=chr12,length=133275309> | |||||
##contig=<ID=chr13,length=114364328> | |||||
##contig=<ID=chr14,length=107043718> | |||||
##contig=<ID=chr15,length=101991189> | |||||
##contig=<ID=chr16,length=90338345> | |||||
##contig=<ID=chr17,length=83257441> | |||||
##contig=<ID=chr18,length=80373285> | |||||
##contig=<ID=chr19,length=58617616> | |||||
##contig=<ID=chr20,length=64444167> | |||||
##contig=<ID=chr21,length=46709983> | |||||
##contig=<ID=chr22,length=50818468> | |||||
##contig=<ID=chrX,length=156040895> | |||||
''' | |||||
# read in duplication list | # read in duplication list | ||||
dup = pd.read_table(dup_list,header=None) | dup = pd.read_table(dup_list,header=None) | ||||
var_dup = dup[0].tolist() | var_dup = dup[0].tolist() | ||||
# output file | # output file | ||||
file_name = prefix + '_annotated.vcf' | |||||
outfile = open(file_name,'w') | |||||
benchmark_file_name = prefix + '_voted.vcf' | |||||
benchmark_outfile = open(benchmark_file_name,'w') | |||||
all_sample_file_name = prefix + '_all_sample_information.vcf' | |||||
all_sample_outfile = open(all_sample_file_name,'w') | |||||
# write VCF | # write VCF | ||||
outputcolumn = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tQuartet_DNA_BGI_SEQ2000_BGI_1_20180518_LCL5\tQuartet_DNA_BGI_SEQ2000_BGI_2_20180530_LCL5\tQuartet_DNA_BGI_SEQ2000_BGI_3_20180530_LCL5\tQuartet_DNA_BGI_SEQ2000_WGE_1_20190402_LCL5\tQuartet_DNA_BGI_SEQ2000_WGE_2_20190402_LCL5\tQuartet_DNA_BGI_SEQ500_BGI_1_20180328_LCL5 \tQuartet_DNA_BGI_SEQ500_BGI_2_20180328_LCL5\tQuartet_DNA_BGI_SEQ500_BGI_3_20180328_LCL5\tQuartet_DNA_ILM_Nova_ARD_1_20181108_LCL5\tQuartet_DNA_ILM_Nova_ARD_2_20181108_LCL5\tQuartet_DNA_ILM_Nova_ARD_3_20181108_LCL5\tQuartet_DNA_ILM_Nova_ARD_4_20190111_LCL5\tQuartet_DNA_ILM_Nova_ARD_5_20190111_LCL5\tQuartet_DNA_ILM_Nova_ARD_6_20190111_LCL5\tQuartet_DNA_ILM_Nova_BRG_1_20171024_LCL5\tQuartet_DNA_ILM_Nova_BRG_1_20180930_LCL5\tQuartet_DNA_ILM_Nova_BRG_2_20180930_LCL5\tQuartet_DNA_ILM_Nova_BRG_3_20180930_LCL5\tQuartet_DNA_ILM_Nova_GAC_1_20171025_LCL5\tQuartet_DNA_ILM_Nova_NVG_1_20171024_LCL5\tQuartet_DNA_ILM_Nova_WUX_1_20171024_LCL5\tQuartet_DNA_ILM_XTen_ARD_1_20170403_LCL5\tQuartet_DNA_ILM_XTen_ARD_2_20170403_LCL5\tQuartet_DNA_ILM_XTen_ARD_3_20170403_LCL5\tQuartet_DNA_ILM_XTen_NVG_1_20170329_LCL5\tQuartet_DNA_ILM_XTen_NVG_2_20170329_LCL5\tQuartet_DNA_ILM_XTen_NVG_3_20170329_LCL5\tQuartet_DNA_ILM_XTen_WUX_1_20170216_LCL5\tQuartet_DNA_ILM_XTen_WUX_2_20170216_LCL5\tQuartet_DNA_ILM_XTen_WUX_3_20170216_LCL5\tQuartet_DNA_ILM_XTen_WUX_4_20180703_LCL5\tQuartet_DNA_ILM_XTen_WUX_5_20180703_LCL5\tQuartet_DNA_ILM_XTen_WUX_6_20180703_LCL5' +'\t'+ sample_name+'_pcr'+'\t' + sample_name+'_pcr-free'+ '\t'+ sample_name +'consensus' + '\n' | |||||
outfile.write(vcf_header) | |||||
outfile.write(outputcolumn) | |||||
outputcolumn = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t' + sample_name + '_benchmark_calls\n' | |||||
benchmark_outfile.write(vcf_header) | |||||
benchmark_outfile.write(outputcolumn) | |||||
outputcolumn_all_sample = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t'+ \ | |||||
'Quartet_DNA_BGI_SEQ2000_BGI_1_20180518\tQuartet_DNA_BGI_SEQ2000_BGI_2_20180530\tQuartet_DNA_BGI_SEQ2000_BGI_3_20180530\t' + \ | |||||
'Quartet_DNA_BGI_T7_WGE_1_20191105\tQuartet_DNA_BGI_T7_WGE_2_20191105\tQuartet_DNA_BGI_T7_WGE_3_20191105\t' + \ | |||||
'Quartet_DNA_ILM_Nova_ARD_1_20181108\tQuartet_DNA_ILM_Nova_ARD_2_20181108\tQuartet_DNA_ILM_Nova_ARD_3_20181108\t' + \ | |||||
'Quartet_DNA_ILM_Nova_ARD_4_20190111\tQuartet_DNA_ILM_Nova_ARD_5_20190111\tQuartet_DNA_ILM_Nova_ARD_6_20190111\t' + \ | |||||
'Quartet_DNA_ILM_Nova_BRG_1_20180930\tQuartet_DNA_ILM_Nova_BRG_2_20180930\tQuartet_DNA_ILM_Nova_BRG_3_20180930\t' + \ | |||||
'Quartet_DNA_ILM_Nova_WUX_1_20190917\tQuartet_DNA_ILM_Nova_WUX_2_20190917\tQuartet_DNA_ILM_Nova_WUX_3_20190917\t' + \ | |||||
'Quartet_DNA_ILM_XTen_ARD_1_20170403\tQuartet_DNA_ILM_XTen_ARD_2_20170403\tQuartet_DNA_ILM_XTen_ARD_3_20170403\t' + \ | |||||
'Quartet_DNA_ILM_XTen_NVG_1_20170329\tQuartet_DNA_ILM_XTen_NVG_2_20170329\tQuartet_DNA_ILM_XTen_NVG_3_20170329\t' + \ | |||||
'Quartet_DNA_ILM_XTen_WUX_1_20170216\tQuartet_DNA_ILM_XTen_WUX_2_20170216\tQuartet_DNA_ILM_XTen_WUX_3_20170216\n' | |||||
all_sample_outfile.write(vcf_header_all_sample) | |||||
all_sample_outfile.write(outputcolumn_all_sample) | |||||
#function | #function | ||||
def vote_percentage(strings): | |||||
strings = [x.replace('0/0','.') for x in strings] | |||||
def replace_nan(strings_list): | |||||
updated_list = [] | |||||
for i in strings_list: | |||||
if i == '.': | |||||
updated_list.append('.:.:.:.:.:.:.:.:.:.:.:.') | |||||
else: | |||||
updated_list.append(i) | |||||
return updated_list | |||||
def remove_dot(strings_list): | |||||
updated_list = [] | |||||
for i in strings_list: | |||||
if i == '.': | |||||
pass | |||||
else: | |||||
updated_list.append(i) | |||||
return updated_list | |||||
def detected_number(strings): | |||||
gt = [x.split(':')[0] for x in strings] | gt = [x.split(':')[0] for x in strings] | ||||
percentage = round((33 - gt.count('.'))/33,4) | |||||
percentage = 27 - gt.count('.') | |||||
return(str(percentage)) | return(str(percentage)) | ||||
def vote_number(strings,consensus_call): | |||||
gt = [x.split(':')[0] for x in strings] | |||||
gt = [x.replace('.','0/0') for x in gt] | |||||
gt = list(map(gt_uniform,[i for i in gt])) | |||||
vote_num = gt.count(consensus_call) | |||||
return(str(vote_num)) | |||||
def family_vote(strings,consensus_call): | |||||
gt = [x.split(':')[0] for x in strings] | |||||
gt = [x.replace('.','0/0') for x in gt] | |||||
gt = list(map(gt_uniform,[i for i in gt])) | |||||
mendelian = [':'.join(x.split(':')[1:4]) for x in strings] | |||||
indices = [i for i, x in enumerate(gt) if x == consensus_call] | |||||
matched_mendelian = itemgetter(*indices)(mendelian) | |||||
mendelian_num = matched_mendelian.count('1:1:1') | |||||
return(str(mendelian_num)) | |||||
def gt_uniform(strings): | |||||
uniformed_gt = '' | |||||
allele1 = strings.split('/')[0] | |||||
allele2 = strings.split('/')[1] | |||||
if int(allele1) > int(allele2): | |||||
uniformed_gt = allele2 + '/' + allele1 | |||||
else: | |||||
uniformed_gt = allele1 + '/' + allele2 | |||||
return uniformed_gt | |||||
def decide_by_rep(strings): | def decide_by_rep(strings): | ||||
consensus_rep = '' | consensus_rep = '' | ||||
mendelian = [x[-5:] for x in strings] | |||||
strings = [x.replace('.','0/0') for x in strings] | |||||
mendelian = [':'.join(x.split(':')[1:4]) for x in strings] | |||||
gt = [x.split(':')[0] for x in strings] | gt = [x.split(':')[0] for x in strings] | ||||
gt = [x.replace('.','0/0') for x in gt] | |||||
# modified gt turn 2/1 to 1/2 | |||||
gt = list(map(gt_uniform,[i for i in gt])) | |||||
# mendelian consistent? | # mendelian consistent? | ||||
mendelian_dict = Counter(mendelian) | mendelian_dict = Counter(mendelian) | ||||
highest_mendelian = mendelian_dict.most_common(1) | highest_mendelian = mendelian_dict.most_common(1) | ||||
consensus_rep = '0/0' | consensus_rep = '0/0' | ||||
else: | else: | ||||
consensus_rep = 'inconGT' | consensus_rep = 'inconGT' | ||||
elif (candidate_mendelian == '.') and (freq_mendelian >= 2): | |||||
elif (candidate_mendelian == '') and (freq_mendelian >= 2): | |||||
consensus_rep = 'noInfo' | consensus_rep = 'noInfo' | ||||
else: | else: | ||||
consensus_rep = 'inconMen' | consensus_rep = 'inconMen' | ||||
variant_id = '_'.join([strings[0],strings[1]]) | variant_id = '_'.join([strings[0],strings[1]]) | ||||
# check if the variants location is duplicated | # check if the variants location is duplicated | ||||
if variant_id in var_dup: | if variant_id in var_dup: | ||||
outLine = '\t'.join(strings) + '\t' + '.' +'\t' + '.' + '\t' + 'dupVar' + '\n' | |||||
outfile.write(outLine) | |||||
strings[7] = strings[7] + ';DUP' | |||||
outLine = '\t'.join(strings) + '\n' | |||||
all_sample_outfile.write(outLine) | |||||
else: | else: | ||||
# pre-define | # pre-define | ||||
pcr_consensus = '' | |||||
pcr_free_consensus = '' | |||||
consensus_call = '' | |||||
pcr_consensus = '.' | |||||
pcr_free_consensus = '.' | |||||
consensus_call = '.' | |||||
consensus_alt_seq = '.' | |||||
# pcr | # pcr | ||||
pcr = itemgetter(*[9,10,11,12,14,15,16,23,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41])(strings) | |||||
SEQ2000 = decide_by_rep(pcr[0:4]) | |||||
SEQ500 = decide_by_rep(pcr[4:7]) | |||||
Nova = decide_by_rep(pcr[7:11]) | |||||
XTen_ARD = decide_by_rep(pcr[11:14]) | |||||
XTen_NVG = decide_by_rep(pcr[14:17]) | |||||
XTen_WUX_1 = decide_by_rep(pcr[17:20]) | |||||
XTen_WUX_2 = decide_by_rep(pcr[20:23]) | |||||
sequence_site = [SEQ2000,SEQ500,Nova,XTen_ARD,XTen_NVG,XTen_WUX_1,XTen_WUX_2] | |||||
strings[9:] = replace_nan(strings[9:]) | |||||
pcr = itemgetter(*[9,10,11,27,28,29,30,31,32,33,34,35])(strings) | |||||
SEQ2000 = decide_by_rep(pcr[0:3]) | |||||
XTen_ARD = decide_by_rep(pcr[3:6]) | |||||
XTen_NVG = decide_by_rep(pcr[6:9]) | |||||
XTen_WUX = decide_by_rep(pcr[9:12]) | |||||
sequence_site = [SEQ2000,XTen_ARD,XTen_NVG,XTen_WUX] | |||||
sequence_dict = Counter(sequence_site) | sequence_dict = Counter(sequence_site) | ||||
highest_sequence = sequence_dict.most_common(1) | highest_sequence = sequence_dict.most_common(1) | ||||
candidate_sequence = highest_sequence[0][0] | candidate_sequence = highest_sequence[0][0] | ||||
freq_sequence = highest_sequence[0][1] | freq_sequence = highest_sequence[0][1] | ||||
if freq_sequence > 4: | |||||
if freq_sequence > 2: | |||||
pcr_consensus = candidate_sequence | pcr_consensus = candidate_sequence | ||||
else: | else: | ||||
pcr_consensus = 'inconSequenceSite' | pcr_consensus = 'inconSequenceSite' | ||||
# pcr-free | # pcr-free | ||||
pcr_free = itemgetter(*[13,17,18,19,20,21,22,24,25,26])(strings) | |||||
SEQ2000 = decide_by_rep(pcr_free[0]) | |||||
Nova_ARD_1 = decide_by_rep(pcr_free[1:4]) | |||||
Nova_ARD_2 = decide_by_rep(pcr_free[4:7]) | |||||
Nova_BRG = decide_by_rep(pcr_free[7:10]) | |||||
sequence_site = [SEQ2000,Nova_ARD_1,Nova_ARD_2,Nova_BRG] | |||||
pcr_free = itemgetter(*[12,13,14,15,16,17,18,19,20,21,22,23,24,25,26])(strings) | |||||
T7_WGE = decide_by_rep(pcr_free[0:3]) | |||||
Nova_ARD_1 = decide_by_rep(pcr_free[3:6]) | |||||
Nova_ARD_2 = decide_by_rep(pcr_free[6:9]) | |||||
Nova_BRG = decide_by_rep(pcr_free[9:12]) | |||||
Nova_WUX = decide_by_rep(pcr_free[12:15]) | |||||
sequence_site = [T7_WGE,Nova_ARD_1,Nova_ARD_2,Nova_BRG,Nova_WUX] | |||||
highest_sequence = sequence_dict.most_common(1) | highest_sequence = sequence_dict.most_common(1) | ||||
candidate_sequence = highest_sequence[0][0] | candidate_sequence = highest_sequence[0][0] | ||||
freq_sequence = highest_sequence[0][1] | freq_sequence = highest_sequence[0][1] | ||||
if freq_sequence > 2: | |||||
if freq_sequence > 3: | |||||
pcr_free_consensus = candidate_sequence | pcr_free_consensus = candidate_sequence | ||||
else: | else: | ||||
pcr_free_consensus = 'inconSequenceSite' | pcr_free_consensus = 'inconSequenceSite' | ||||
tag = ['inconGT','noInfo','inconMen','inconSequenceSite'] | tag = ['inconGT','noInfo','inconMen','inconSequenceSite'] | ||||
if (pcr_consensus == pcr_free_consensus) and (pcr_consensus not in tag) and (pcr_consensus != '0/0'): | if (pcr_consensus == pcr_free_consensus) and (pcr_consensus not in tag) and (pcr_consensus != '0/0'): | ||||
consensus_call = pcr_consensus | consensus_call = pcr_consensus | ||||
strings[6] = 'reproducible' | |||||
elif (pcr_consensus in tag) or (pcr_free_consensus in tag): | |||||
VOTED = vote_number(strings[9:],consensus_call) | |||||
strings[7] = strings[7] + ';VOTED=' + VOTED | |||||
DETECTED = detected_number(strings[9:]) | |||||
strings[7] = strings[7] + ';DETECTED=' + DETECTED | |||||
FAM = family_vote(strings[9:],consensus_call) | |||||
strings[7] = strings[7] + ';FAM=' + FAM | |||||
# Delete multiple alternative genotype to necessary expression | |||||
alt = strings[4] | |||||
alt_gt = alt.split(',') | |||||
if len(alt_gt) > 1: | |||||
allele1 = consensus_call.split('/')[0] | |||||
allele2 = consensus_call.split('/')[1] | |||||
if allele1 == '0': | |||||
allele2_seq = alt_gt[int(allele2) - 1] | |||||
consensus_alt_seq = allele2_seq | |||||
consensus_call = '0/1' | |||||
else: | |||||
allele1_seq = alt_gt[int(allele1) - 1] | |||||
allele2_seq = alt_gt[int(allele2) - 1] | |||||
if int(allele1) > int(allele2): | |||||
consensus_alt_seq = allele2_seq + ',' + allele1_seq | |||||
consensus_call = '1/2' | |||||
elif int(allele1) < int(allele2): | |||||
consensus_alt_seq = allele1_seq + ',' + allele2_seq | |||||
consensus_call = '1/2' | |||||
else: | |||||
consensus_alt_seq = allele1_seq | |||||
consensus_call = '1/1' | |||||
else: | |||||
consensus_alt_seq = alt | |||||
# GT:DP:ALT:AF:GQ:QD:MQ:FS:QUAL | |||||
# GT:TWINS:TRIO5:TRIO6:DP:ALT:AF:GQ:QD:MQ:FS:QUAL:rawGT | |||||
# DP | |||||
DP = [x.split(':')[4] for x in strings[9:]] | |||||
DP = remove_dot(DP) | |||||
DP = [int(x) for x in DP] | |||||
ALL_DP = sum(DP) | |||||
# AF | |||||
ALT = [x.split(':')[5] for x in strings[9:]] | |||||
ALT = remove_dot(ALT) | |||||
ALT = [int(x) for x in ALT] | |||||
ALL_ALT = sum(ALT) | |||||
ALL_AF = round(ALL_ALT/ALL_DP,2) | |||||
# GQ | |||||
GQ = [x.split(':')[7] for x in strings[9:]] | |||||
GQ = remove_dot(GQ) | |||||
GQ = [int(x) for x in GQ] | |||||
GQ_MEAN = round(mean(GQ),2) | |||||
# QD | |||||
QD = [x.split(':')[8] for x in strings[9:]] | |||||
QD = remove_dot(QD) | |||||
QD = [float(x) for x in QD] | |||||
QD_MEAN = round(mean(QD),2) | |||||
# MQ | |||||
MQ = [x.split(':')[9] for x in strings[9:]] | |||||
MQ = remove_dot(MQ) | |||||
MQ = [float(x) for x in MQ] | |||||
MQ_MEAN = round(mean(MQ),2) | |||||
# FS | |||||
FS = [x.split(':')[10] for x in strings[9:]] | |||||
FS = remove_dot(FS) | |||||
FS = [float(x) for x in FS] | |||||
FS_MEAN = round(mean(FS),2) | |||||
# QUAL | |||||
QUAL = [x.split(':')[11] for x in strings[9:]] | |||||
QUAL = remove_dot(QUAL) | |||||
QUAL = [float(x) for x in QUAL] | |||||
QUAL_MEAN = round(mean(QUAL),2) | |||||
# benchmark output | |||||
output_format = consensus_call + ':' + str(ALL_DP) + ':' + str(ALL_ALT) + ':' + str(ALL_AF) + ':' + str(GQ_MEAN) + ':' + str(QD_MEAN) + ':' + str(MQ_MEAN) + ':' + str(FS_MEAN) + ':' + str(QUAL_MEAN) | |||||
outLine = strings[0] + '\t' + strings[1] + '\t' + strings[2] + '\t' + strings[3] + '\t' + consensus_alt_seq + '\t' + '.' + '\t' + '.' + '\t' + strings[7] + '\t' + 'GT:DP:ALT:AF:GQ:QD:MQ:FS:QUAL' + '\t' + output_format + '\n' | |||||
benchmark_outfile.write(outLine) | |||||
# all sample output | |||||
strings[7] = strings[7] + ';ALL_ALT=' + str(ALL_ALT) + ';ALL_DP=' + str(ALL_DP) + ';ALL_AF=' + str(ALL_AF) \ | |||||
+ ';GQ_MEAN=' + str(GQ_MEAN) + ';QD_MEAN=' + str(QD_MEAN) + ';MQ_MEAN=' + str(MQ_MEAN) + ';FS_MEAN=' + str(FS_MEAN) \ | |||||
+ ';QUAL_MEAN=' + str(QUAL_MEAN) + ';PCR=' + consensus_call + ';PCR_FREE=' + consensus_call + ';CONSENSUS=' + consensus_call \ | |||||
+ ';CONSENSUS_SEQ=' + consensus_alt_seq | |||||
all_sample_outLine = '\t'.join(strings) + '\n' | |||||
all_sample_outfile.write(all_sample_outLine) | |||||
elif (pcr_consensus in tag) and (pcr_free_consensus in tag): | |||||
consensus_call = 'filtered' | consensus_call = 'filtered' | ||||
strings[6] = '.' | |||||
elif (pcr_consensus == '0/0') and (pcr_free_consensus not in tag) and (pcr_free_consensus != '0/0'): | |||||
DETECTED = detected_number(strings[9:]) | |||||
strings[7] = strings[7] + ';DETECTED=' + DETECTED | |||||
strings[7] = strings[7] + ';CONSENSUS=' + consensus_call | |||||
all_sample_outLine = '\t'.join(strings) + '\n' | |||||
all_sample_outfile.write(all_sample_outLine) | |||||
elif ((pcr_consensus == '0/0') or (pcr_consensus in tag)) and ((pcr_free_consensus not in tag) and (pcr_free_consensus != '0/0')): | |||||
consensus_call = 'pcr-free-speicifc' | consensus_call = 'pcr-free-speicifc' | ||||
strings[6] = '.' | |||||
elif (pcr_consensus != '0/0') and (pcr_consensus not in tag) and (pcr_free_consensus == '0/0'): | |||||
DETECTED = detected_number(strings[9:]) | |||||
strings[7] = strings[7] + ';DETECTED=' + DETECTED | |||||
strings[7] = strings[7] + ';CONSENSUS=' + consensus_call | |||||
all_sample_outLine = '\t'.join(strings) + '\n' | |||||
all_sample_outfile.write(all_sample_outLine) | |||||
elif ((pcr_consensus != '0/0') or (pcr_consensus not in tag)) and ((pcr_free_consensus in tag) and (pcr_free_consensus == '0/0')): | |||||
consensus_call = 'pcr-speicifc' | consensus_call = 'pcr-speicifc' | ||||
strings[6] = '.' | |||||
DETECTED = detected_number(strings[9:]) | |||||
strings[7] = strings[7] + ';DETECTED=' + DETECTED | |||||
strings[7] = strings[7] + ';CONSENSUS=' + consensus_call + ';PCR=' + pcr_consensus + ';PCR_FREE=' + pcr_free_consensus | |||||
all_sample_outLine = '\t'.join(strings) + '\n' | |||||
all_sample_outfile.write(all_sample_outLine) | |||||
elif (pcr_consensus == '0/0') and (pcr_free_consensus == '0/0'): | elif (pcr_consensus == '0/0') and (pcr_free_consensus == '0/0'): | ||||
consensus_call = 'confirm for parents' | |||||
strings[6] = '.' | |||||
consensus_call = 'confirm for parents' | |||||
DETECTED = detected_number(strings[9:]) | |||||
strings[7] = strings[7] + ';DETECTED=' + DETECTED | |||||
strings[7] = strings[7] + ';CONSENSUS=' + consensus_call | |||||
all_sample_outLine = '\t'.join(strings) + '\n' | |||||
all_sample_outfile.write(all_sample_outLine) | |||||
else: | else: | ||||
consensus_call = 'filtered' | consensus_call = 'filtered' | ||||
strings[6] = '.' | |||||
# percentage | |||||
percentage = vote_percentage(strings[9:]) | |||||
strings[7] = 'PCT=' + percentage | |||||
# output | |||||
outLine = '\t'.join(strings) + '\t' + pcr_consensus +'\t' + pcr_free_consensus + '\t' + consensus_call + '\n' | |||||
outfile.write(outLine) | |||||
DETECTED = detected_number(strings[9:]) | |||||
strings[7] = strings[7] + ';DETECTED=' + DETECTED | |||||
strings[7] = strings[7] + ';CONSENSUS=' + consensus_call | |||||
all_sample_outLine = '\t'.join(strings) + '\n' | |||||
all_sample_outfile.write(all_sample_outLine) | |||||
if __name__ == '__main__': | if __name__ == '__main__': | ||||
main() | main() |
from __future__ import division | |||||
import pandas as pd | |||||
import sys, argparse, os | |||||
import fileinput | |||||
import re | |||||
# input arguments | |||||
parser = argparse.ArgumentParser(description="this script is to get final high confidence calls and information of all replicates") | |||||
parser.add_argument('-vcfInfo', '--vcfInfo', type=str, help='The txt file of variants information, this file is named as prefix__variant_quality_location.txt', required=True) | |||||
parser.add_argument('-mendelianInfo', '--mendelianInfo', type=str, help='The merged mendelian information of all samples', required=True) | |||||
parser.add_argument('-sample', '--sample_name', type=str, help='which sample of quartet', required=True) | |||||
args = parser.parse_args() | |||||
vcfInfo = args.vcfInfo | |||||
mendelianInfo = args.mendelianInfo | |||||
sample_name = args.sample_name | |||||
#GT:TWINS:TRIO5:TRIO6:DP:AF:GQ:QD:MQ:FS:QUAL | |||||
vcf_header = '''##fileformat=VCFv4.2 | |||||
##fileDate=20200331 | |||||
##source=high_confidence_calls_intergration(choppy app) | |||||
##reference=GRCh38.d1.vd1 | |||||
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype"> | |||||
##FORMAT=<ID=TWINS,Number=1,Type=Flag,Description="1 for sister consistent, 0 for sister different"> | |||||
##FORMAT=<ID=TRIO5,Number=1,Type=Flag,Description="1 for LCL7, LCL8 and LCL5 mendelian consistent, 0 for family violation"> | |||||
##FORMAT=<ID=TRIO6,Number=1,Type=Flag,Description="1 for LCL7, LCL8 and LCL6 mendelian consistent, 0 for family violation"> | |||||
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Depth"> | |||||
##FORMAT=<ID=ALT,Number=1,Type=Integer,Description="Alternative Depth"> | |||||
##FORMAT=<ID=AF,Number=1,Type=Float,Description="Allele frequency"> | |||||
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype quality"> | |||||
##FORMAT=<ID=QD,Number=1,Type=Float,Description="Variant Confidence/Quality by Depth"> | |||||
##FORMAT=<ID=MQ,Number=1,Type=Float,Description="Mapping quality"> | |||||
##FORMAT=<ID=FS,Number=1,Type=Float,Description="Phred-scaled p-value using Fisher's exact test to detect strand bias"> | |||||
##FORMAT=<ID=QUAL,Number=1,Type=Float,Description="variant quality"> | |||||
##contig=<ID=chr1,length=248956422> | |||||
##contig=<ID=chr2,length=242193529> | |||||
##contig=<ID=chr3,length=198295559> | |||||
##contig=<ID=chr4,length=190214555> | |||||
##contig=<ID=chr5,length=181538259> | |||||
##contig=<ID=chr6,length=170805979> | |||||
##contig=<ID=chr7,length=159345973> | |||||
##contig=<ID=chr8,length=145138636> | |||||
##contig=<ID=chr9,length=138394717> | |||||
##contig=<ID=chr10,length=133797422> | |||||
##contig=<ID=chr11,length=135086622> | |||||
##contig=<ID=chr12,length=133275309> | |||||
##contig=<ID=chr13,length=114364328> | |||||
##contig=<ID=chr14,length=107043718> | |||||
##contig=<ID=chr15,length=101991189> | |||||
##contig=<ID=chr16,length=90338345> | |||||
##contig=<ID=chr17,length=83257441> | |||||
##contig=<ID=chr18,length=80373285> | |||||
##contig=<ID=chr19,length=58617616> | |||||
##contig=<ID=chr20,length=64444167> | |||||
##contig=<ID=chr21,length=46709983> | |||||
##contig=<ID=chr22,length=50818468> | |||||
##contig=<ID=chrX,length=156040895> | |||||
''' | |||||
# output file | |||||
file_name = sample_name + '_mendelian_vcfInfo.vcf' | |||||
outfile = open(file_name,'w') | |||||
outputcolumn = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t' + sample_name + '\n' | |||||
outfile.write(vcf_header) | |||||
outfile.write(outputcolumn) | |||||
# input files | |||||
vcf_info = pd.read_table(vcfInfo) | |||||
mendelian_info = pd.read_table(mendelianInfo) | |||||
merged_df = pd.merge(vcf_info, mendelian_info, how='outer', left_on=['#CHROM','POS'], right_on = ['#CHROM','POS']) | |||||
merged_df = merged_df.fillna('.') | |||||
# | |||||
def parse_INFO(info): | |||||
strings = info.strip().split(';') | |||||
keys = [] | |||||
values = [] | |||||
for i in strings: | |||||
kv = i.split('=') | |||||
if kv[0] == 'DB': | |||||
keys.append('DB') | |||||
values.append('1') | |||||
else: | |||||
keys.append(kv[0]) | |||||
values.append(kv[1]) | |||||
infoDict = dict(zip(keys, values)) | |||||
return infoDict | |||||
# | |||||
for row in merged_df.itertuples(): | |||||
if row[18] != '.': | |||||
# format | |||||
# GT:TWINS:TRIO5:TRIO6:DP:AF:GQ:QD:MQ:FS:QUAL | |||||
FORMAT_x = row[10].split(':') | |||||
ALT = int(FORMAT_x[1].split(',')[1]) | |||||
if int(FORMAT_x[2]) != 0: | |||||
AF = round(ALT/int(FORMAT_x[2]),2) | |||||
else: | |||||
AF = '.' | |||||
INFO_x = parse_INFO(row.INFO_x) | |||||
if FORMAT_x[2] == '0': | |||||
INFO_x['QD'] = '.' | |||||
else: | |||||
pass | |||||
FORMAT = row[18] + ':' + FORMAT_x[2] + ':' + str(ALT) + ':' + str(AF) + ':' + FORMAT_x[3] + ':' + INFO_x['QD'] + ':' + INFO_x['MQ'] + ':' + INFO_x['FS'] + ':' + str(row.QUAL_x) | |||||
# outline | |||||
outline = row._1 + '\t' + str(row.POS) + '\t' + row.ID_x + '\t' + row.REF_y + '\t' + row.ALT_y + '\t' + '.' + '\t' + '.' + '\t' + '.' + '\t' + 'GT:TWINS:TRIO5:TRIO6:DP:ALT:AF:GQ:QD:MQ:FS:QUAL' + '\t' + FORMAT + '\n' | |||||
else: | |||||
rawGT = row[10].split(':') | |||||
FORMAT_x = row[10].split(':') | |||||
ALT = int(FORMAT_x[1].split(',')[1]) | |||||
if int(FORMAT_x[2]) != 0: | |||||
AF = round(ALT/int(FORMAT_x[2]),2) | |||||
else: | |||||
AF = '.' | |||||
INFO_x = parse_INFO(row.INFO_x) | |||||
if FORMAT_x[2] == '0': | |||||
INFO_x['QD'] = '.' | |||||
else: | |||||
pass | |||||
FORMAT = '.:.:.:.' + ':' + FORMAT_x[2] + ':' + str(ALT) + ':' + str(AF) + ':' + FORMAT_x[3] + ':' + INFO_x['QD'] + ':' + INFO_x['MQ'] + ':' + INFO_x['FS'] + ':' + str(row.QUAL_x) + ':' + rawGT[0] | |||||
# outline | |||||
outline = row._1 + '\t' + str(row.POS) + '\t' + row.ID_x + '\t' + row.REF_x + '\t' + row.ALT_x + '\t' + '.' + '\t' + '.' + '\t' + '.' + '\t' + 'GT:TWINS:TRIO5:TRIO6:DP:ALT:AF:GQ:QD:MQ:FS:QUAL:rawGT' + '\t' + FORMAT + '\n' | |||||
outfile.write(outline) |
from __future__ import division | |||||
import pandas as pd | |||||
import sys, argparse, os | |||||
import fileinput | |||||
import re | |||||
# input arguments | |||||
parser = argparse.ArgumentParser(description="this script is to extract mendelian concordance information") | |||||
parser.add_argument('-LCL5', '--LCL5', type=str, help='LCL5 family info', required=True) | |||||
parser.add_argument('-LCL6', '--LCL6', type=str, help='LCL6 family info', required=True) | |||||
parser.add_argument('-family', '--family', type=str, help='family name', required=True) | |||||
args = parser.parse_args() | |||||
lcl5 = args.LCL5 | |||||
lcl6 = args.LCL6 | |||||
family = args.family | |||||
# output file | |||||
family_name = family + '.txt' | |||||
family_file = open(family_name,'w') | |||||
# input files | |||||
lcl5_dat = pd.read_table(lcl5) | |||||
lcl6_dat = pd.read_table(lcl6) | |||||
merged_df = pd.merge(lcl5_dat, lcl6_dat, how='outer', left_on=['#CHROM','POS'], right_on = ['#CHROM','POS']) | |||||
def alt_seq(alt, genotype): | |||||
if genotype == './.': | |||||
seq = './.' | |||||
elif genotype == '0/0': | |||||
seq = '0/0' | |||||
else: | |||||
alt = alt.split(',') | |||||
genotype = genotype.split('/') | |||||
if genotype[0] == '0': | |||||
allele2 = alt[int(genotype[1]) - 1] | |||||
seq = '0/' + allele2 | |||||
else: | |||||
allele1 = alt[int(genotype[0]) - 1] | |||||
allele2 = alt[int(genotype[1]) - 1] | |||||
seq = allele1 + '/' + allele2 | |||||
return seq | |||||
for row in merged_df.itertuples(): | |||||
# correction of multiallele | |||||
if pd.isnull(row.INFO_x) == True or pd.isnull(row.INFO_y) == True: | |||||
mendelian = '.' | |||||
else: | |||||
lcl5_seq = alt_seq(row.ALT_x, row.CHILD_x) | |||||
lcl6_seq = alt_seq(row.ALT_y, row.CHILD_y) | |||||
if lcl5_seq == lcl6_seq: | |||||
mendelian = '1' | |||||
else: | |||||
mendelian = '0' | |||||
if pd.isnull(row.INFO_x) == True: | |||||
mendelian = mendelian + ':.' | |||||
else: | |||||
mendelian = mendelian + ':' + row.INFO_x.split('=')[1] | |||||
if pd.isnull(row.INFO_y) == True: | |||||
mendelian = mendelian + ':.' | |||||
else: | |||||
mendelian = mendelian + ':' + row.INFO_y.split('=')[1] | |||||
outline = row._1 + '\t' + str(row.POS) + '\t' + mendelian + '\n' | |||||
family_file.write(outline) |
line = oneLine.rstrip() | line = oneLine.rstrip() | ||||
strings = line.strip().split('\t') | strings = line.strip().split('\t') | ||||
# replace . | # replace . | ||||
# LCL5 uniq | |||||
# LCL6 uniq | |||||
if strings[11] == '.': | if strings[11] == '.': | ||||
strings[11] = '0/0' | strings[11] = '0/0' | ||||
strings[9] = strings[12] | strings[9] = strings[12] | ||||
strings[10] = strings[13] | strings[10] = strings[13] | ||||
else: | else: | ||||
pass | pass | ||||
# LCL6 uniq | |||||
# LCL5 uniq | |||||
if strings[14] == '.': | if strings[14] == '.': | ||||
strings[14] = '0/0' | strings[14] = '0/0' | ||||
strings[12] = strings[9] | strings[12] = strings[9] |
import sys,getopt | |||||
import os | |||||
import re | |||||
import fileinput | |||||
def usage(): | |||||
print( | |||||
""" | |||||
Usage: python select_small_variants_supported_by_all_callsets.py -i input_merged_vcf_file -o prefix | |||||
This script selects SNPs and Indels supported by all callsets. | |||||
Input: | |||||
-i a merged vcf file | |||||
Output: | |||||
-o a vcf file containd the selected SNPs and Indels | |||||
""") | |||||
# select supported small variants | |||||
def process(oneLine): | |||||
m = re.match('^\#',oneLine) | |||||
if m is not None: | |||||
outVCF.write(oneLine) | |||||
OUTname.write(oneLine) | |||||
else: | |||||
line = oneLine.rstrip() | |||||
strings = line.strip().split('\t') | |||||
gt = [i.split(':', 1)[0] for i in strings[9:len(strings)]] | |||||
if all(e == gt[0] for e in gt) and (gt[0] != '.'): | |||||
# output the record to vcf | |||||
outVCF.write(oneLine) | |||||
else: | |||||
OUTname.write(oneLine) | |||||
opts,args = getopt.getopt(sys.argv[1:],"hi:o:") | |||||
for op,value in opts: | |||||
if op == "-i": | |||||
inputFile=value | |||||
elif op == "-o": | |||||
prefix=value | |||||
elif op == "-h": | |||||
usage() | |||||
sys.exit() | |||||
if len(sys.argv[1:]) < 3: | |||||
usage() | |||||
sys.exit() | |||||
VCFname = prefix + '.vcf' | |||||
OUTname = prefix + '_outlier.vcf' | |||||
outVCF = open(VCFname,'w') | |||||
OUTname = open(OUTname,'w') | |||||
for line in fileinput.input(inputFile): | |||||
process(line) | |||||
outVCF.close() | |||||
OUTname.close() | |||||
from __future__ import division | |||||
from glob import glob | |||||
import sys, argparse, os | |||||
import fileinput | |||||
import re | |||||
import pandas as pd | |||||
from operator import itemgetter | |||||
from collections import Counter | |||||
from itertools import islice | |||||
from numpy import * | |||||
import statistics | |||||
# input arguments | |||||
parser = argparse.ArgumentParser(description="this script is to merge mendelian and vcfinfo, and extract high_confidence_calls") | |||||
parser.add_argument('-folder', '--folder', type=str, help='directory that holds all the mendelian info', required=True) | |||||
parser.add_argument('-vcf', '--vcf', type=str, help='merged multiple sample vcf', required=True) | |||||
args = parser.parse_args() | |||||
folder = args.folder | |||||
vcf = args.vcf | |||||
# input files | |||||
folder = folder + '/*.txt' | |||||
filenames = glob(folder) | |||||
dataframes = [] | |||||
for filename in filenames: | |||||
dataframes.append(pd.read_table(filename,header=None)) | |||||
dfs = [df.set_index([0, 1]) for df in dataframes] | |||||
merged_mendelian = pd.concat(dfs, axis=1).reset_index() | |||||
family_name = [i.split('/')[-1].replace('.txt','') for i in filenames] | |||||
columns = ['CHROM','POS'] + family_name | |||||
merged_mendelian.columns = columns | |||||
vcf_dat = pd.read_table(vcf) | |||||
merged_df = pd.merge(merged_mendelian, vcf_dat, how='outer', left_on=['CHROM','POS'], right_on = ['#CHROM','POS']) | |||||
merged_df = merged_df.fillna('nan') | |||||
vcf_header = '''##fileformat=VCFv4.2 | |||||
##fileDate=20200501 | |||||
##source=high_confidence_calls_intergration(choppy app) | |||||
##reference=GRCh38.d1.vd1 | |||||
##INFO=<ID=VOTED,Number=1,Type=Integer,Description="Number mendelian consisitent votes"> | |||||
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype"> | |||||
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Sum depth of all samples"> | |||||
##FORMAT=<ID=ALT,Number=1,Type=Integer,Description="Sum alternative depth of all samples"> | |||||
##contig=<ID=chr1,length=248956422> | |||||
##contig=<ID=chr2,length=242193529> | |||||
##contig=<ID=chr3,length=198295559> | |||||
##contig=<ID=chr4,length=190214555> | |||||
##contig=<ID=chr5,length=181538259> | |||||
##contig=<ID=chr6,length=170805979> | |||||
##contig=<ID=chr7,length=159345973> | |||||
##contig=<ID=chr8,length=145138636> | |||||
##contig=<ID=chr9,length=138394717> | |||||
##contig=<ID=chr10,length=133797422> | |||||
##contig=<ID=chr11,length=135086622> | |||||
##contig=<ID=chr12,length=133275309> | |||||
##contig=<ID=chr13,length=114364328> | |||||
##contig=<ID=chr14,length=107043718> | |||||
##contig=<ID=chr15,length=101991189> | |||||
##contig=<ID=chr16,length=90338345> | |||||
##contig=<ID=chr17,length=83257441> | |||||
##contig=<ID=chr18,length=80373285> | |||||
##contig=<ID=chr19,length=58617616> | |||||
##contig=<ID=chr20,length=64444167> | |||||
##contig=<ID=chr21,length=46709983> | |||||
##contig=<ID=chr22,length=50818468> | |||||
##contig=<ID=chrX,length=156040895> | |||||
''' | |||||
# output files | |||||
benchmark_LCL5 = open('LCL5_voted.vcf','w') | |||||
benchmark_LCL6 = open('LCL6_voted.vcf','w') | |||||
benchmark_LCL7 = open('LCL7_voted.vcf','w') | |||||
benchmark_LCL8 = open('LCL8_voted.vcf','w') | |||||
all_sample_outfile = open('all_sample_information.txt','w') | |||||
# write VCF | |||||
LCL5_col = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tLCL5_benchmark_calls\n' | |||||
LCL6_col = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tLCL6_benchmark_calls\n' | |||||
LCL7_col = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tLCL7_benchmark_calls\n' | |||||
LCL8_col = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tLCL8_benchmark_calls\n' | |||||
benchmark_LCL5.write(vcf_header) | |||||
benchmark_LCL5.write(LCL5_col) | |||||
benchmark_LCL6.write(vcf_header) | |||||
benchmark_LCL6.write(LCL6_col) | |||||
benchmark_LCL7.write(vcf_header) | |||||
benchmark_LCL7.write(LCL7_col) | |||||
benchmark_LCL8.write(vcf_header) | |||||
benchmark_LCL8.write(LCL8_col) | |||||
# all info | |||||
all_info_col = 'CHROM\tPOS\tLCL5_pcr_consensus\tLCL5_pcr_free_consensus\tLCL5_mendelian_num\tLCL5_consensus_call\tLCL5_consensus_alt_seq\tLCL5_alt\tLCL5_dp\tLCL5_detected_num\tLCL6_pcr_consensus\tLCL6_pcr_free_consensus\tLCL6_mendelian_num\tLCL6_consensus_call\tLCL6_consensus_alt_seq\tLCL6_alt\tLCL6_dp\tLCL6_detected_num\tLCL7_pcr_consensus\tLCL7_pcr_free_consensus\tLCL7_mendelian_num\t LCL7_consensus_call\tLCL7_consensus_alt_seq\tLCL7_alt\tLCL7_dp\tLCL7_detected_num\tLCL8_pcr_consensus\tLCL8_pcr_free_consensus\tLCL8_mendelian_num\tLCL8_consensus_call\tLCL8_consensus_alt_seq\tLCL8_alt\tLCL8_dp\tLCL8_detected_num\n' | |||||
all_sample_outfile.write(all_info_col) | |||||
# function | |||||
def decide_by_rep(vcf_list,mendelian_list): | |||||
consensus_rep = '' | |||||
gt = [x.split(':')[0] for x in vcf_list] | |||||
# mendelian consistent? | |||||
mendelian_dict = Counter(mendelian_list) | |||||
highest_mendelian = mendelian_dict.most_common(1) | |||||
candidate_mendelian = highest_mendelian[0][0] | |||||
freq_mendelian = highest_mendelian[0][1] | |||||
if (candidate_mendelian == '1:1:1') and (freq_mendelian >= 2): | |||||
con_loc = [i for i in range(len(mendelian_list)) if mendelian_list[i] == '1:1:1'] | |||||
gt_con = itemgetter(*con_loc)(gt) | |||||
gt_num_dict = Counter(gt_con) | |||||
highest_gt = gt_num_dict.most_common(1) | |||||
candidate_gt = highest_gt[0][0] | |||||
freq_gt = highest_gt[0][1] | |||||
if (candidate_gt != './.') and (freq_gt >= 2): | |||||
consensus_rep = candidate_gt | |||||
elif (candidate_gt == './.') and (freq_gt >= 2): | |||||
consensus_rep = 'noGTInfo' | |||||
else: | |||||
consensus_rep = 'inconGT' | |||||
elif (candidate_mendelian == 'nan') and (freq_mendelian >= 2): | |||||
consensus_rep = 'noMenInfo' | |||||
else: | |||||
consensus_rep = 'inconMen' | |||||
return consensus_rep | |||||
def consensus_call(vcf_info_list,mendelian_list,alt_seq): | |||||
pcr_consensus = '.' | |||||
pcr_free_consensus = '.' | |||||
mendelian_num = '.' | |||||
consensus_call = '.' | |||||
consensus_alt_seq = '.' | |||||
# pcr | |||||
SEQ2000 = decide_by_rep(vcf_info_list[0:3],mendelian_list[0:3]) | |||||
XTen_ARD = decide_by_rep(vcf_info_list[18:21],mendelian_list[18:21]) | |||||
XTen_NVG = decide_by_rep(vcf_info_list[21:24],mendelian_list[21:24]) | |||||
XTen_WUX = decide_by_rep(vcf_info_list[24:27],mendelian_list[24:27]) | |||||
pcr_sequence_site = [SEQ2000,XTen_ARD,XTen_NVG,XTen_WUX] | |||||
pcr_sequence_dict = Counter(pcr_sequence_site) | |||||
pcr_highest_sequence = pcr_sequence_dict.most_common(1) | |||||
pcr_candidate_sequence = pcr_highest_sequence[0][0] | |||||
pcr_freq_sequence = pcr_highest_sequence[0][1] | |||||
if pcr_freq_sequence > 2: | |||||
pcr_consensus = pcr_candidate_sequence | |||||
else: | |||||
pcr_consensus = 'inconSequenceSite' | |||||
# pcr-free | |||||
T7_WGE = decide_by_rep(vcf_info_list[3:6],mendelian_list[3:6]) | |||||
Nova_ARD_1 = decide_by_rep(vcf_info_list[6:9],mendelian_list[6:9]) | |||||
Nova_ARD_2 = decide_by_rep(vcf_info_list[9:12],mendelian_list[9:12]) | |||||
Nova_BRG = decide_by_rep(vcf_info_list[12:15],mendelian_list[12:15]) | |||||
Nova_WUX = decide_by_rep(vcf_info_list[15:18],mendelian_list[15:18]) | |||||
sequence_site = [T7_WGE,Nova_ARD_1,Nova_ARD_2,Nova_BRG,Nova_WUX] | |||||
sequence_dict = Counter(sequence_site) | |||||
highest_sequence = sequence_dict.most_common(1) | |||||
candidate_sequence = highest_sequence[0][0] | |||||
freq_sequence = highest_sequence[0][1] | |||||
if freq_sequence > 3: | |||||
pcr_free_consensus = candidate_sequence | |||||
else: | |||||
pcr_free_consensus = 'inconSequenceSite' | |||||
# net alt, dp | |||||
# alt | |||||
AD = [x.split(':')[1] for x in vcf_info_list] | |||||
ALT = [x.split(',')[1] for x in AD] | |||||
ALT = [int(x) for x in ALT] | |||||
ALL_ALT = sum(ALT) | |||||
# dp | |||||
DP = [x.split(':')[2] for x in vcf_info_list] | |||||
DP = [int(x) for x in DP] | |||||
ALL_DP = sum(DP) | |||||
# detected number | |||||
gt = [x.split(':')[0] for x in vcf_info_list] | |||||
gt = [x.replace('0/0','.') for x in gt] | |||||
gt = [x.replace('./.','.') for x in gt] | |||||
detected_num = 27 - gt.count('.') | |||||
# decide consensus calls | |||||
tag = ['inconGT','noMenInfo','inconMen','inconSequenceSite','noGTInfo'] | |||||
if (pcr_consensus != '0/0') and (pcr_consensus == pcr_free_consensus) and (pcr_consensus not in tag): | |||||
consensus_call = pcr_consensus | |||||
gt = [x.split(':')[0] for x in vcf_info_list] | |||||
indices = [i for i, x in enumerate(gt) if x == consensus_call] | |||||
matched_mendelian = itemgetter(*indices)(mendelian_list) | |||||
mendelian_num = matched_mendelian.count('1:1:1') | |||||
# Delete multiple alternative genotype to necessary expression | |||||
alt_gt = alt_seq.split(',') | |||||
if len(alt_gt) > 1: | |||||
allele1 = consensus_call.split('/')[0] | |||||
allele2 = consensus_call.split('/')[1] | |||||
if allele1 == '0': | |||||
allele2_seq = alt_gt[int(allele2) - 1] | |||||
consensus_alt_seq = allele2_seq | |||||
consensus_call = '0/1' | |||||
else: | |||||
allele1_seq = alt_gt[int(allele1) - 1] | |||||
allele2_seq = alt_gt[int(allele2) - 1] | |||||
if int(allele1) > int(allele2): | |||||
consensus_alt_seq = allele2_seq + ',' + allele1_seq | |||||
consensus_call = '1/2' | |||||
elif int(allele1) < int(allele2): | |||||
consensus_alt_seq = allele1_seq + ',' + allele2_seq | |||||
consensus_call = '1/2' | |||||
else: | |||||
consensus_alt_seq = allele1_seq | |||||
consensus_call = '1/1' | |||||
else: | |||||
consensus_alt_seq = alt_seq | |||||
elif (pcr_consensus in tag) and (pcr_free_consensus in tag): | |||||
consensus_call = 'filtered' | |||||
elif ((pcr_consensus == './.') or (pcr_consensus in tag)) and ((pcr_free_consensus not in tag) and (pcr_free_consensus != './.')): | |||||
consensus_call = 'pcr-free-speicifc' | |||||
elif ((pcr_consensus != './.') or (pcr_consensus not in tag)) and ((pcr_free_consensus in tag) and (pcr_free_consensus == './.')): | |||||
consensus_call = 'pcr-speicifc' | |||||
elif (pcr_consensus == '0/0') and (pcr_free_consensus == '0/0'): | |||||
consensus_call = '0/0' | |||||
else: | |||||
consensus_call = 'filtered' | |||||
return pcr_consensus, pcr_free_consensus, mendelian_num, consensus_call, consensus_alt_seq, ALL_ALT, ALL_DP, detected_num | |||||
for row in merged_df.itertuples(): | |||||
mendelian_list = [row.Quartet_DNA_BGI_SEQ2000_BGI_1_20180518,row.Quartet_DNA_BGI_SEQ2000_BGI_2_20180530,row.Quartet_DNA_BGI_SEQ2000_BGI_3_20180530, \ | |||||
row.Quartet_DNA_BGI_T7_WGE_1_20191105,row.Quartet_DNA_BGI_T7_WGE_2_20191105,row.Quartet_DNA_BGI_T7_WGE_3_20191105, \ | |||||
row.Quartet_DNA_ILM_Nova_ARD_1_20181108,row.Quartet_DNA_ILM_Nova_ARD_2_20181108,row.Quartet_DNA_ILM_Nova_ARD_3_20181108, \ | |||||
row.Quartet_DNA_ILM_Nova_ARD_4_20190111,row.Quartet_DNA_ILM_Nova_ARD_5_20190111,row.Quartet_DNA_ILM_Nova_ARD_6_20190111, \ | |||||
row.Quartet_DNA_ILM_Nova_BRG_1_20180930,row.Quartet_DNA_ILM_Nova_BRG_2_20180930,row.Quartet_DNA_ILM_Nova_BRG_3_20180930, \ | |||||
row.Quartet_DNA_ILM_Nova_WUX_1_20190917,row.Quartet_DNA_ILM_Nova_WUX_2_20190917,row.Quartet_DNA_ILM_Nova_WUX_3_20190917, \ | |||||
row.Quartet_DNA_ILM_XTen_ARD_1_20170403,row.Quartet_DNA_ILM_XTen_ARD_2_20170403,row.Quartet_DNA_ILM_XTen_ARD_3_20170403, \ | |||||
row.Quartet_DNA_ILM_XTen_NVG_1_20170329,row.Quartet_DNA_ILM_XTen_NVG_2_20170329,row.Quartet_DNA_ILM_XTen_NVG_3_20170329, \ | |||||
row.Quartet_DNA_ILM_XTen_WUX_1_20170216,row.Quartet_DNA_ILM_XTen_WUX_2_20170216,row.Quartet_DNA_ILM_XTen_WUX_3_20170216] | |||||
lcl5_list = [row.Quartet_DNA_BGI_SEQ2000_BGI_LCL5_1_20180518,row.Quartet_DNA_BGI_SEQ2000_BGI_LCL5_2_20180530,row.Quartet_DNA_BGI_SEQ2000_BGI_LCL5_3_20180530, \ | |||||
row.Quartet_DNA_BGI_T7_WGE_LCL5_1_20191105,row.Quartet_DNA_BGI_T7_WGE_LCL5_2_20191105,row.Quartet_DNA_BGI_T7_WGE_LCL5_3_20191105, \ | |||||
row.Quartet_DNA_ILM_Nova_ARD_LCL5_1_20181108,row.Quartet_DNA_ILM_Nova_ARD_LCL5_2_20181108,row.Quartet_DNA_ILM_Nova_ARD_LCL5_3_20181108, \ | |||||
row.Quartet_DNA_ILM_Nova_ARD_LCL5_4_20190111,row.Quartet_DNA_ILM_Nova_ARD_LCL5_5_20190111,row.Quartet_DNA_ILM_Nova_ARD_LCL5_6_20190111, \ | |||||
row.Quartet_DNA_ILM_Nova_BRG_LCL5_1_20180930,row.Quartet_DNA_ILM_Nova_BRG_LCL5_2_20180930,row.Quartet_DNA_ILM_Nova_BRG_LCL5_3_20180930, \ | |||||
row.Quartet_DNA_ILM_Nova_WUX_LCL5_1_20190917,row.Quartet_DNA_ILM_Nova_WUX_LCL5_2_20190917,row.Quartet_DNA_ILM_Nova_WUX_LCL5_3_20190917, \ | |||||
row.Quartet_DNA_ILM_XTen_ARD_LCL5_1_20170403,row.Quartet_DNA_ILM_XTen_ARD_LCL5_2_20170403,row.Quartet_DNA_ILM_XTen_ARD_LCL5_3_20170403, \ | |||||
row.Quartet_DNA_ILM_XTen_NVG_LCL5_1_20170329,row.Quartet_DNA_ILM_XTen_NVG_LCL5_2_20170329,row.Quartet_DNA_ILM_XTen_NVG_LCL5_3_20170329, \ | |||||
row.Quartet_DNA_ILM_XTen_WUX_LCL5_1_20170216,row.Quartet_DNA_ILM_XTen_WUX_LCL5_2_20170216,row.Quartet_DNA_ILM_XTen_WUX_LCL5_3_20170216] | |||||
lcl6_list = [row.Quartet_DNA_BGI_SEQ2000_BGI_LCL6_1_20180518,row.Quartet_DNA_BGI_SEQ2000_BGI_LCL6_2_20180530,row.Quartet_DNA_BGI_SEQ2000_BGI_LCL6_3_20180530, \ | |||||
row.Quartet_DNA_BGI_T7_WGE_LCL6_1_20191105,row.Quartet_DNA_BGI_T7_WGE_LCL6_2_20191105,row.Quartet_DNA_BGI_T7_WGE_LCL6_3_20191105, \ | |||||
row.Quartet_DNA_ILM_Nova_ARD_LCL6_1_20181108,row.Quartet_DNA_ILM_Nova_ARD_LCL6_2_20181108,row.Quartet_DNA_ILM_Nova_ARD_LCL6_3_20181108, \ | |||||
row.Quartet_DNA_ILM_Nova_ARD_LCL6_4_20190111,row.Quartet_DNA_ILM_Nova_ARD_LCL6_5_20190111,row.Quartet_DNA_ILM_Nova_ARD_LCL6_6_20190111, \ | |||||
row.Quartet_DNA_ILM_Nova_BRG_LCL6_1_20180930,row.Quartet_DNA_ILM_Nova_BRG_LCL6_2_20180930,row.Quartet_DNA_ILM_Nova_BRG_LCL6_3_20180930, \ | |||||
row.Quartet_DNA_ILM_Nova_WUX_LCL6_1_20190917,row.Quartet_DNA_ILM_Nova_WUX_LCL6_2_20190917,row.Quartet_DNA_ILM_Nova_WUX_LCL6_3_20190917, \ | |||||
row.Quartet_DNA_ILM_XTen_ARD_LCL6_1_20170403,row.Quartet_DNA_ILM_XTen_ARD_LCL6_2_20170403,row.Quartet_DNA_ILM_XTen_ARD_LCL6_3_20170403, \ | |||||
row.Quartet_DNA_ILM_XTen_NVG_LCL6_1_20170329,row.Quartet_DNA_ILM_XTen_NVG_LCL6_2_20170329,row.Quartet_DNA_ILM_XTen_NVG_LCL6_3_20170329, \ | |||||
row.Quartet_DNA_ILM_XTen_WUX_LCL6_1_20170216,row.Quartet_DNA_ILM_XTen_WUX_LCL6_2_20170216,row.Quartet_DNA_ILM_XTen_WUX_LCL6_3_20170216] | |||||
lcl7_list = [row.Quartet_DNA_BGI_SEQ2000_BGI_LCL7_1_20180518,row.Quartet_DNA_BGI_SEQ2000_BGI_LCL7_2_20180530,row.Quartet_DNA_BGI_SEQ2000_BGI_LCL7_3_20180530, \ | |||||
row.Quartet_DNA_BGI_T7_WGE_LCL7_1_20191105,row.Quartet_DNA_BGI_T7_WGE_LCL7_2_20191105,row.Quartet_DNA_BGI_T7_WGE_LCL7_3_20191105, \ | |||||
row.Quartet_DNA_ILM_Nova_ARD_LCL7_1_20181108,row.Quartet_DNA_ILM_Nova_ARD_LCL7_2_20181108,row.Quartet_DNA_ILM_Nova_ARD_LCL7_3_20181108, \ | |||||
row.Quartet_DNA_ILM_Nova_ARD_LCL7_4_20190111,row.Quartet_DNA_ILM_Nova_ARD_LCL7_5_20190111,row.Quartet_DNA_ILM_Nova_ARD_LCL7_6_20190111, \ | |||||
row.Quartet_DNA_ILM_Nova_BRG_LCL7_1_20180930,row.Quartet_DNA_ILM_Nova_BRG_LCL7_2_20180930,row.Quartet_DNA_ILM_Nova_BRG_LCL7_3_20180930, \ | |||||
row.Quartet_DNA_ILM_Nova_WUX_LCL7_1_20190917,row.Quartet_DNA_ILM_Nova_WUX_LCL7_2_20190917,row.Quartet_DNA_ILM_Nova_WUX_LCL7_3_20190917, \ | |||||
row.Quartet_DNA_ILM_XTen_ARD_LCL7_1_20170403,row.Quartet_DNA_ILM_XTen_ARD_LCL7_2_20170403,row.Quartet_DNA_ILM_XTen_ARD_LCL7_3_20170403, \ | |||||
row.Quartet_DNA_ILM_XTen_NVG_LCL7_1_20170329,row.Quartet_DNA_ILM_XTen_NVG_LCL7_2_20170329,row.Quartet_DNA_ILM_XTen_NVG_LCL7_3_20170329, \ | |||||
row.Quartet_DNA_ILM_XTen_WUX_LCL7_1_20170216,row.Quartet_DNA_ILM_XTen_WUX_LCL7_2_20170216,row.Quartet_DNA_ILM_XTen_WUX_LCL7_3_20170216] | |||||
lcl8_list = [row.Quartet_DNA_BGI_SEQ2000_BGI_LCL8_1_20180518,row.Quartet_DNA_BGI_SEQ2000_BGI_LCL8_2_20180530,row.Quartet_DNA_BGI_SEQ2000_BGI_LCL8_3_20180530, \ | |||||
row.Quartet_DNA_BGI_T7_WGE_LCL8_1_20191105,row.Quartet_DNA_BGI_T7_WGE_LCL8_2_20191105,row.Quartet_DNA_BGI_T7_WGE_LCL8_3_20191105, \ | |||||
row.Quartet_DNA_ILM_Nova_ARD_LCL8_1_20181108,row.Quartet_DNA_ILM_Nova_ARD_LCL8_2_20181108,row.Quartet_DNA_ILM_Nova_ARD_LCL8_3_20181108, \ | |||||
row.Quartet_DNA_ILM_Nova_ARD_LCL8_4_20190111,row.Quartet_DNA_ILM_Nova_ARD_LCL8_5_20190111,row.Quartet_DNA_ILM_Nova_ARD_LCL8_6_20190111, \ | |||||
row.Quartet_DNA_ILM_Nova_BRG_LCL8_1_20180930,row.Quartet_DNA_ILM_Nova_BRG_LCL8_2_20180930,row.Quartet_DNA_ILM_Nova_BRG_LCL8_3_20180930, \ | |||||
row.Quartet_DNA_ILM_Nova_WUX_LCL8_1_20190917,row.Quartet_DNA_ILM_Nova_WUX_LCL8_2_20190917,row.Quartet_DNA_ILM_Nova_WUX_LCL8_3_20190917, \ | |||||
row.Quartet_DNA_ILM_XTen_ARD_LCL8_1_20170403,row.Quartet_DNA_ILM_XTen_ARD_LCL8_2_20170403,row.Quartet_DNA_ILM_XTen_ARD_LCL8_3_20170403, \ | |||||
row.Quartet_DNA_ILM_XTen_NVG_LCL8_1_20170329,row.Quartet_DNA_ILM_XTen_NVG_LCL8_2_20170329,row.Quartet_DNA_ILM_XTen_NVG_LCL8_3_20170329, \ | |||||
row.Quartet_DNA_ILM_XTen_WUX_LCL8_1_20170216,row.Quartet_DNA_ILM_XTen_WUX_LCL8_2_20170216,row.Quartet_DNA_ILM_XTen_WUX_LCL8_3_20170216] | |||||
# LCL5 | |||||
LCL5_pcr_consensus, LCL5_pcr_free_consensus, LCL5_mendelian_num, LCL5_consensus_call, LCL5_consensus_alt_seq, LCL5_alt, LCL5_dp, LCL5_detected_num = consensus_call(lcl5_list,mendelian_list,row.ALT) | |||||
if LCL5_mendelian_num != '.': | |||||
LCL5_output = row.CHROM + '\t' + str(row.POS) + '\t' + '.' + '\t' + row.REF + '\t' + LCL5_consensus_alt_seq + '\t' + '.' + '\t' + '.' + '\t' +'VOTED=' + str(LCL5_mendelian_num) + '\t' + 'GT:ALT:DP' + '\t' + LCL5_consensus_call + ':' + str(LCL5_alt) + ':' + str(LCL5_dp) + '\n' | |||||
benchmark_LCL5.write(LCL5_output) | |||||
# LCL6 | |||||
LCL6_pcr_consensus, LCL6_pcr_free_consensus, LCL6_mendelian_num, LCL6_consensus_call, LCL6_consensus_alt_seq, LCL6_alt, LCL6_dp, LCL6_detected_num = consensus_call(lcl6_list,mendelian_list,row.ALT) | |||||
if LCL6_mendelian_num != '.': | |||||
LCL6_output = row.CHROM + '\t' + str(row.POS) + '\t' + '.' + '\t' + row.REF + '\t' + LCL6_consensus_alt_seq + '\t' + '.' + '\t' + '.' + '\t' +'VOTED=' + str(LCL6_mendelian_num) + '\t' + 'GT:ALT:DP' + '\t' + LCL6_consensus_call + ':' + str(LCL6_alt) + ':' + str(LCL6_dp) + '\n' | |||||
benchmark_LCL6.write(LCL6_output) | |||||
# LCL7 | |||||
LCL7_pcr_consensus, LCL7_pcr_free_consensus, LCL7_mendelian_num, LCL7_consensus_call, LCL7_consensus_alt_seq, LCL7_alt, LCL7_dp, LCL7_detected_num = consensus_call(lcl7_list,mendelian_list,row.ALT) | |||||
if LCL7_mendelian_num != '.': | |||||
LCL7_output = row.CHROM + '\t' + str(row.POS) + '\t' + '.' + '\t' + row.REF + '\t' + LCL7_consensus_alt_seq + '\t' + '.' + '\t' + '.' + '\t' +'VOTED=' + str(LCL7_mendelian_num) + '\t' + 'GT:ALT:DP' + '\t' + LCL7_consensus_call + ':' + str(LCL7_alt) + ':' + str(LCL7_dp) + '\n' | |||||
benchmark_LCL7.write(LCL7_output) | |||||
# LCL8 | |||||
LCL8_pcr_consensus, LCL8_pcr_free_consensus, LCL8_mendelian_num, LCL8_consensus_call, LCL8_consensus_alt_seq, LCL8_alt, LCL8_dp, LCL8_detected_num = consensus_call(lcl8_list,mendelian_list,row.ALT) | |||||
if LCL8_mendelian_num != '.': | |||||
LCL8_output = row.CHROM + '\t' + str(row.POS) + '\t' + '.' + '\t' + row.REF + '\t' + LCL8_consensus_alt_seq + '\t' + '.' + '\t' + '.' + '\t' +'VOTED=' + str(LCL8_mendelian_num) + '\t' + 'GT:ALT:DP' + '\t' + LCL8_consensus_call + ':' + str(LCL8_alt) + ':' + str(LCL8_dp) + '\n' | |||||
benchmark_LCL8.write(LCL8_output) | |||||
# all data | |||||
all_output = row.CHROM + '\t' + str(row.POS) + '\t' + LCL5_pcr_consensus + '\t' + LCL5_pcr_free_consensus + '\t' + str(LCL5_mendelian_num) + '\t' + LCL5_consensus_call + '\t' + LCL5_consensus_alt_seq + '\t' + str(LCL5_alt) + '\t' + str(LCL5_dp) + '\t' + str(LCL5_detected_num) + '\t' +\ | |||||
LCL6_pcr_consensus + '\t' + LCL6_pcr_free_consensus + '\t' + str(LCL6_mendelian_num) + '\t' + LCL6_consensus_call + '\t' + LCL6_consensus_alt_seq + '\t' + str(LCL6_alt) + '\t' + str(LCL6_dp) + '\t' + str(LCL6_detected_num) + '\t' +\ | |||||
LCL7_pcr_consensus + '\t' + LCL7_pcr_free_consensus + '\t' + str(LCL7_mendelian_num) + '\t' + LCL7_consensus_call + '\t' + LCL7_consensus_alt_seq + '\t' + str(LCL7_alt) + '\t' + str(LCL7_dp) + '\t' + str(LCL7_detected_num) + '\t' +\ | |||||
LCL8_pcr_consensus + '\t' + LCL8_pcr_free_consensus + '\t' + str(LCL8_mendelian_num) + '\t' + LCL8_consensus_call + '\t' + LCL8_consensus_alt_seq + '\t' + str(LCL8_alt) + '\t' + str(LCL8_dp) + '\t' + str(LCL8_detected_num) + '\n' | |||||
all_sample_outfile.write(all_output) | |||||
{ | { | ||||
"{{ project_name }}.fasta": "GRCh38.d1.vd1.fa", | "{{ project_name }}.fasta": "GRCh38.d1.vd1.fa", | ||||
"{{ project_name }}.LCL6familyzipIndex.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", | |||||
"{{ project_name }}.LCL5VCFrename.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", | |||||
"{{ project_name }}.LCL6mendelian.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/vbt:v1.1", | "{{ project_name }}.LCL6mendelian.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/vbt:v1.1", | ||||
"{{ project_name }}.mergeSister.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", | |||||
"{{ project_name }}.LCL5mendelian.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/vbt:v1.1", | "{{ project_name }}.LCL5mendelian.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/vbt:v1.1", | ||||
"{{ project_name }}.test_name": "{{ test_name }}", | |||||
"{{ project_name }}.disk_size": "150", | "{{ project_name }}.disk_size": "150", | ||||
"{{ project_name }}.chromo": "{{ chromo }}", | |||||
"{{ project_name }}.inputSamplesFile": "{{ inputSamplesFile }}", | "{{ project_name }}.inputSamplesFile": "{{ inputSamplesFile }}", | ||||
"{{ project_name }}.LCL6variantsNorm.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/bcftools:v1.9", | |||||
"{{ project_name }}.LCL6zipIndex.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", | |||||
"{{ project_name }}.LCL7familyzipIndex.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", | |||||
"{{ project_name }}.LCL5familyzipIndex.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", | |||||
"{{ project_name }}.LCL6VCFrename.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", | |||||
"{{ project_name }}.LCL5merge.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", | |||||
"{{ project_name }}.reformVCF.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/high_confidence_call:v1.1", | |||||
"{{ project_name }}.LCL5zipIndex.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", | |||||
"{{ project_name }}.cluster_config": "OnDemand bcs.a2.xlarge img-ubuntu-vpc", | |||||
"{{ project_name }}.LCL8familyzipIndex.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", | |||||
"{{ project_name }}.LCL7variantsNorm.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/bcftools:v1.9", | |||||
"{{ project_name }}.LCL5variantsNorm.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/bcftools:v1.9", | |||||
"{{ project_name }}.LCL8variantsNorm.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/bcftools:v1.9", | |||||
"{{ project_name }}.cluster_config": "OnDemand bcs.b4.xlarge img-ubuntu-vpc", | |||||
"{{ project_name }}.vcf": "{{ vcf }}", | |||||
"{{ project_name }}.ref_dir": "oss://chinese-quartet/quartet-storage-data/reference_data/" | "{{ project_name }}.ref_dir": "oss://chinese-quartet/quartet-storage-data/reference_data/" | ||||
} | } | ||||
task VCFinfo { | |||||
File repeat_annotated_vcf | |||||
String sample | |||||
String docker | |||||
String cluster_config | |||||
String disk_size | |||||
command <<< | |||||
python /opt/variants_quality_location_intergration.py -vcf ${repeat_annotated_vcf} -prefix ${sample} | |||||
cat ${sample}_variant_quality_location.txt | grep '#CHROM' > header | |||||
for i in chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chr20 chr21 chr22 chrX | |||||
do | |||||
cat ${sample}_variant_quality_location.txt | grep -w $i | cat header - > ${sample}.$i.vcfInfo.txt | |||||
done | |||||
>>> | |||||
runtime { | |||||
docker:docker | |||||
cluster: cluster_config | |||||
systemDisk: "cloud_ssd 40" | |||||
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/" | |||||
} | |||||
output { | |||||
File extracted_info = "${sample}_variant_quality_location.txt" | |||||
Array[File] chromo_vcfInfo = glob("*.vcfInfo.txt") | |||||
} | |||||
} |
task bed_annotation { | |||||
File merged_vcf_gz | |||||
File merged_vcf_idx | |||||
File repeat_bed | |||||
String sample | |||||
String docker | |||||
String cluster_config | |||||
String disk_size | |||||
command <<< | |||||
rtg vcfannotate --bed-info=${repeat_bed} -i ${merged_vcf_gz} -o ${sample}.mendelian.merged.repeatAnno.vcf.gz | |||||
gunzip ${sample}.mendelian.merged.repeatAnno.vcf.gz | |||||
>>> | |||||
runtime { | |||||
docker:docker | |||||
cluster: cluster_config | |||||
systemDisk: "cloud_ssd 40" | |||||
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/" | |||||
} | |||||
output { | |||||
File repeat_annotated_vcf = "${sample}.mendelian.merged.repeatAnno.vcf" | |||||
} | |||||
} |
task FinalResult { | |||||
File extracted_info | |||||
File annotated_txt | |||||
String prefix = basename(annotated_txt,".mendelian.txt") | |||||
String sample | |||||
String docker | |||||
String cluster_config | |||||
String disk_size | |||||
command <<< | |||||
python /opt/FinalResult2VCF.py -vcfInfo ${extracted_info} -mendelianInfo ${annotated_txt} -prefix ${prefix} -sample ${sample} | |||||
>>> | |||||
runtime { | |||||
docker:docker | |||||
cluster: cluster_config | |||||
systemDisk: "cloud_ssd 40" | |||||
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/" | |||||
} | |||||
output { | |||||
File benchmarking_calls = "${prefix}_benchmarking_calls.vcf" | |||||
File all_info = "${prefix}_all_sample_information.vcf" | |||||
} | |||||
} |
task indelNorm { | |||||
File vcf | |||||
File ref_dir | |||||
String fasta | |||||
String sampleName | |||||
String docker | |||||
String cluster_config | |||||
String disk_size | |||||
command <<< | |||||
cat ${vcf} | grep '#' > header | |||||
cat ${vcf} | grep -v '#' > body | |||||
cat body | grep -w '^chr1\|^chr2\|^chr3\|^chr4\|^chr5\|^chr6\|^chr7\|^chr8\|^chr9\|^chr10\|^chr11\|^chr12\|^chr13\|^chr14\|^chr15\|^chr16\|^chr17\|^chr18\|^chr19\|^chr20\|^chr21\|^chr22\|^chrX\' > body.filtered | |||||
cat header body.filtered > ${sampleName}.filtered.vcf | |||||
/opt/hall-lab/bcftools-1.9/bin/bcftools norm -f ${ref_dir}/${fasta} ${sampleName}.filtered.vcf > ${sampleName}.normed.vcf | |||||
>>> | |||||
runtime { | |||||
docker:docker | |||||
cluster: cluster_config | |||||
systemDisk: "cloud_ssd 40" | |||||
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/" | |||||
} | |||||
output { | |||||
File normed_vcf = "${sampleName}.normed.vcf" | |||||
} | |||||
} |
task merge { | task merge { | ||||
Array[File] family_vcf_gz | Array[File] family_vcf_gz | ||||
Array[File] family_vcf_idx | Array[File] family_vcf_idx | ||||
String test_name | |||||
String sample | String sample | ||||
String docker | String docker | ||||
String cluster_config | String cluster_config | ||||
command <<< | command <<< | ||||
rtg vcfmerge --force-merge-all -o ${sample}.merged.vcf.gz ${sep=" " family_vcf_gz} | rtg vcfmerge --force-merge-all -o ${sample}.merged.vcf.gz ${sep=" " family_vcf_gz} | ||||
rtg vcffilter -i ${sample}.merged.vcf.gz -o ${sample}.snv.merged.vcf.gz --snps-only --all-samples | |||||
rtg vcffilter -i ${sample}.merged.vcf.gz -o ${sample}.indel.merged.vcf.gz --non-snps-only --all-samples | |||||
zcat ${sample}.indel.merged.vcf.gz | grep '#CHROM' | cut -f10-12 > name | |||||
for i in {10..12}; do zcat ${sample}.snv.merged.vcf.gz | grep -v '#' | cut -f$i | cut -d ':' -f2-4 | grep -v '\.'| sort | uniq -c | awk '{print $1, substr($1,0,7)}' | sed 's/\s\+/\t/g' | cut -f1 > $i.snv.txt; done | |||||
paste *.snv.txt | cat name - > snv.txt | |||||
for i in {10..12}; do zcat ${sample}.indel.merged.vcf.gz | grep -v '#' | cut -f$i | cut -d ':' -f2-4 | grep -v '\.'| sort | uniq -c | awk '{print $1, substr($1,0,7)}' | sed 's/\s\+/\t/g' | cut -f1 > $i.indel.txt; done | |||||
paste *.indel.txt | cat name - > index.txt | |||||
echo 'type' > column | |||||
echo '0,0,0' >> column | |||||
echo '0,0,1' >> column | |||||
echo '0,1,0' >> column | |||||
echo '0,1,1' >> column | |||||
echo '1,0,0' >> column | |||||
echo '1,0,1' >> column | |||||
echo '1,1,0' >> column | |||||
echo '1,1,1' >> column | |||||
paste column snv.txt > ${test_name}.snv.txt | |||||
paste column indel.txt > ${test_name}.indel.txt | |||||
zcat ${sample}.merged.vcf.gz | grep -v '#' | cut -f1-2 | sed s'/\t/_/g' | sort | uniq -c | sed 's/\s\+/\t/g' | awk '{ if ($1 != 1) { print } }' | cut -f3 > ${sample}.vcf_dup.txt | |||||
>>> | >>> | ||||
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/" | dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/" | ||||
} | } | ||||
output { | output { | ||||
File merged_vcf = "${sample}.merged.vcf.gz" | |||||
File merged_vcf_gz = "${sample}.merged.vcf.gz" | |||||
File merged_vcf_idx = "${sample}.merged.vcf.gz.tbi" | File merged_vcf_idx = "${sample}.merged.vcf.gz.tbi" | ||||
File merged_snv = "${sample}.snv.merged.vcf.gz" | |||||
File merged_snv_idx = "${sample}.snv.merged.vcf.gz.tbi" | |||||
File merged_indel = "${sample}.indel.merged.vcf.gz" | |||||
File merged_indel_idx = "${sample}.indel.merged.vcf.gz.tbi" | |||||
File snv = "${test_name}.snv.txt" | |||||
File indel = "${test_name}.indel.txt" | |||||
File vcf_dup = "${sample}.vcf_dup.txt" | |||||
} | } | ||||
} | } |
task mergeVCFInfo { | |||||
Array[File] vcf_gz | |||||
Array[File] vcf_idx | |||||
String sample | |||||
String docker | |||||
String cluster_config | |||||
String disk_size | |||||
command <<< | |||||
rtg vcfmerge --force-merge-all -o ${sample}.merged.info.vcf.gz ${sep=" " vcf_gz} | |||||
>>> | |||||
runtime { | |||||
docker:docker | |||||
cluster: cluster_config | |||||
systemDisk: "cloud_ssd 40" | |||||
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/" | |||||
} | |||||
output { | |||||
File merged_vcf = "${sample}.merged.info.vcf.gz" | |||||
File merged_vcf_idx = "${sample}.merged.info.vcf.gz.tbi" | |||||
} | |||||
} |
task merge_info { | |||||
File vcfInfo | |||||
File mendelianInfo | |||||
String sample | |||||
String docker | |||||
String cluster_config | |||||
String disk_size | |||||
command <<< | |||||
python /opt/merge_mendelian_vcfinfo.py -vcfInfo ${vcfInfo} -mendelianInfo ${mendelianInfo} -sample ${sample} | |||||
>>> | |||||
runtime { | |||||
docker:docker | |||||
cluster: cluster_config | |||||
systemDisk: "cloud_ssd 40" | |||||
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/" | |||||
} | |||||
output { | |||||
File all_info = "${sample}_mendelian_vcfInfo.vcf" | |||||
} | |||||
} |
command <<< | command <<< | ||||
python /opt/reformVCF.py -vcf ${family_mendelian_info} -name ${family_name} | python /opt/reformVCF.py -vcf ${family_mendelian_info} -name ${family_name} | ||||
cat ${family_name}.LCL5.vcf | grep -v '##' | grep -v '0/0' | grep -v '\./\.' > ${family_name}.LCL5.txt | |||||
cat ${family_name}.LCL6.vcf | grep -v '##' | grep -v '0/0' | grep -v '\./\.' > ${family_name}.LCL6.txt | |||||
cat ${family_name}.LCL7.vcf | grep -v '##' | grep -v '0/0' | grep -v '\./\.' > ${family_name}.LCL7.txt | |||||
cat ${family_name}.LCL8.vcf | grep -v '##' | grep -v '0/0' | grep -v '\./\.' > ${family_name}.LCL8.txt | |||||
>>> | >>> | ||||
File LCL7_family_info = "${family_name}.LCL7.vcf" | File LCL7_family_info = "${family_name}.LCL7.vcf" | ||||
File LCL8_family_info = "${family_name}.LCL8.vcf" | File LCL8_family_info = "${family_name}.LCL8.vcf" | ||||
File family_info = "${family_name}.vcf" | File family_info = "${family_name}.vcf" | ||||
File LCL5_family_info_txt = "${family_name}.LCL5.txt" | |||||
File LCL6_family_info_txt = "${family_name}.LCL6.txt" | |||||
File LCL7_family_info_txt = "${family_name}.LCL7.txt" | |||||
File LCL8_family_info_txt = "${family_name}.LCL8.txt" | |||||
} | } | ||||
} | } | ||||
task two_family_merge { | |||||
File LCL5_trio_vcf | |||||
File LCL6_trio_vcf | |||||
String family_name | |||||
String docker | |||||
String cluster_config | |||||
String disk_size | |||||
command <<< | |||||
cat ${LCL5_trio_vcf} | grep -v '##' > ${family_name}.LCL5.txt | |||||
cat ${LCL6_trio_vcf} | grep -v '##' > ${family_name}.LCL6.txt | |||||
python /opt/merge_two_family.py -LCL5 ${family_name}.LCL5.txt -LCL6 ${family_name}.LCL6.txt -family ${family_name} | |||||
>>> | |||||
runtime { | |||||
docker:docker | |||||
cluster: cluster_config | |||||
systemDisk: "cloud_ssd 40" | |||||
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/" | |||||
} | |||||
output { | |||||
File family_mendelian_info = "${family_name}.txt" | |||||
} | |||||
} |
/opt/hall-lab/bcftools-1.9/bin/bcftools norm -f ${ref_dir}/${fasta} ${sampleName}.filtered.vcf > ${sampleName}.normed.vcf | /opt/hall-lab/bcftools-1.9/bin/bcftools norm -f ${ref_dir}/${fasta} ${sampleName}.filtered.vcf > ${sampleName}.normed.vcf | ||||
cat ${sampleName}.normed.vcf | grep -v '##' > ${sampleName}.normed.txt | |||||
>>> | >>> | ||||
runtime { | runtime { | ||||
} | } | ||||
output { | output { | ||||
File normed_vcf = "${sampleName}.normed.vcf" | File normed_vcf = "${sampleName}.normed.vcf" | ||||
File normed_txt = "${sampleName}.normed.txt" | |||||
} | } | ||||
} | } |
task votes { | task votes { | ||||
File merged_vcf | |||||
File vcf_dup | |||||
String sample | |||||
String prefix | |||||
Array[File] family_mendelian_info | |||||
File vcf | |||||
String chromo | |||||
String docker | String docker | ||||
String cluster_config | String cluster_config | ||||
String disk_size | String disk_size | ||||
command <<< | command <<< | ||||
python /opt/high_confidence_call_vote.py -vcf ${merged_vcf} -dup ${vcf_dup} -sample ${sample} -prefix ${prefix} | |||||
cat ${prefix}_annotated.vcf | cut -f1-9,45 | grep -v 'filtered' | grep -v 'confirm for parents' | grep -v 'pcr-free-speicifc' | grep -v 'pcr-speicifc' | grep -v 'dupVar' > ${prefix}_bechmarking_calls.vcf | |||||
mkdir temp | |||||
for i in ${sep=" " family_mendelian_info} | |||||
do | |||||
cp $i temp | |||||
done | |||||
cat ${vcf} | grep -v '##' > vcf_info.txt | |||||
python /opt/voted_by_vcfinfo_mendelianinfo.py -folder ./temp -vcf vcf_info.txt | |||||
cp LCL5_voted.vcf LCL5.${chromo}.voted.vcf | |||||
cp LCL6_voted.vcf LCL6.${chromo}.voted.vcf | |||||
cp LCL7_voted.vcf LCL7.${chromo}.voted.vcf | |||||
cp LCL8_voted.vcf LCL8.${chromo}.voted.vcf | |||||
>>> | >>> | ||||
runtime { | runtime { | ||||
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/" | dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/" | ||||
} | } | ||||
output { | output { | ||||
File annotated_vcf = "${prefix}_annotated.vcf" | |||||
File benchmark_call = "${prefix}_bechmarking_calls.vcf" | |||||
File LCL5_voted_vcf = "LCL5.${chromo}.voted.vcf" | |||||
File LCL6_voted_vcf = "LCL6.${chromo}.voted.vcf" | |||||
File LCL7_voted_vcf = "LCL7.${chromo}.voted.vcf" | |||||
File LCL8_voted_vcf = "LCL8.${chromo}.voted.vcf" | |||||
File all_sample_info = "all_sample_information.txt" | |||||
} | } | ||||
} | } | ||||
task zipIndex { | task zipIndex { | ||||
File vcf | File vcf | ||||
String sample | |||||
String family_name | |||||
String vcf_name = basename(vcf,".vcf") | |||||
String docker | String docker | ||||
String cluster_config | String cluster_config | ||||
String disk_size | String disk_size | ||||
command <<< | command <<< | ||||
rtg bgzip ${vcf} -c > ${family_name}.${sample}.vcf.gz | |||||
rtg index -f vcf ${family_name}.${sample}.vcf.gz | |||||
rtg bgzip ${vcf} -c > ${vcf_name}.vcf.gz | |||||
rtg index -f vcf ${vcf_name}.vcf.gz | |||||
>>> | >>> | ||||
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/" | dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/" | ||||
} | } | ||||
output { | output { | ||||
File vcf_gz = "${family_name}.${sample}.vcf.gz" | |||||
File vcf_idx = "${family_name}.${sample}.vcf.gz.tbi" | |||||
File vcf_gz = "${vcf_name}.vcf.gz" | |||||
File vcf_idx = "${vcf_name}.vcf.gz.tbi" | |||||
} | } | ||||
} | } |
import "./tasks/variantsNorm.wdl" as variantsNorm | |||||
import "./tasks/mendelian.wdl" as mendelian | import "./tasks/mendelian.wdl" as mendelian | ||||
import "./tasks/zipIndex.wdl" as zipIndex | |||||
import "./tasks/VCFrename.wdl" as VCFrename | |||||
import "./tasks/mergeSister.wdl" as mergeSister | |||||
import "./tasks/reformVCF.wdl" as reformVCF | |||||
import "./tasks/merge.wdl" as merge | |||||
import "./tasks/votes.wdl" as votes | |||||
workflow {{ project_name }} { | workflow {{ project_name }} { | ||||
File inputSamplesFile | File inputSamplesFile | ||||
Array[Array[File]] inputSamples = read_tsv(inputSamplesFile) | Array[Array[File]] inputSamples = read_tsv(inputSamplesFile) | ||||
File ref_dir | File ref_dir | ||||
File vcf | |||||
String chromo | |||||
String fasta | String fasta | ||||
String test_name | |||||
String cluster_config | String cluster_config | ||||
String disk_size | String disk_size | ||||
scatter (quartet in inputSamples){ | scatter (quartet in inputSamples){ | ||||
call variantsNorm.variantsNorm as LCL5variantsNorm{ | |||||
input: | |||||
vcf=quartet[0], | |||||
ref_dir=ref_dir, | |||||
fasta=fasta, | |||||
sampleName=quartet[4], | |||||
cluster_config=cluster_config, | |||||
disk_size=disk_size | |||||
} | |||||
call variantsNorm.variantsNorm as LCL6variantsNorm{ | |||||
input: | |||||
vcf=quartet[1], | |||||
ref_dir=ref_dir, | |||||
fasta=fasta, | |||||
sampleName=quartet[5], | |||||
cluster_config=cluster_config, | |||||
disk_size=disk_size | |||||
} | |||||
call variantsNorm.variantsNorm as LCL7variantsNorm{ | |||||
input: | |||||
vcf=quartet[2], | |||||
ref_dir=ref_dir, | |||||
fasta=fasta, | |||||
sampleName=quartet[6], | |||||
cluster_config=cluster_config, | |||||
disk_size=disk_size | |||||
} | |||||
call variantsNorm.variantsNorm as LCL8variantsNorm{ | |||||
input: | |||||
vcf=quartet[3], | |||||
ref_dir=ref_dir, | |||||
fasta=fasta, | |||||
sampleName=quartet[7], | |||||
cluster_config=cluster_config, | |||||
disk_size=disk_size | |||||
} | |||||
call mendelian.mendelian as LCL5mendelian { | call mendelian.mendelian as LCL5mendelian { | ||||
input: | input: | ||||
child_vcf=LCL5variantsNorm.normed_vcf, | |||||
LCL7_vcf=LCL7variantsNorm.normed_vcf, | |||||
LCL8_vcf=LCL8variantsNorm.normed_vcf, | |||||
child_vcf=quartet[0], | |||||
LCL7_vcf=quartet[2], | |||||
LCL8_vcf=quartet[3], | |||||
LCL7_name=quartet[6], | LCL7_name=quartet[6], | ||||
LCL8_name=quartet[7], | LCL8_name=quartet[7], | ||||
child_name=quartet[4], | child_name=quartet[4], | ||||
} | } | ||||
call mendelian.mendelian as LCL6mendelian { | call mendelian.mendelian as LCL6mendelian { | ||||
input: | input: | ||||
child_vcf=LCL6variantsNorm.normed_vcf, | |||||
LCL7_vcf=LCL7variantsNorm.normed_vcf, | |||||
LCL8_vcf=LCL8variantsNorm.normed_vcf, | |||||
child_vcf=quartet[1], | |||||
LCL7_vcf=quartet[2], | |||||
LCL8_vcf=quartet[3], | |||||
LCL7_name=quartet[6], | LCL7_name=quartet[6], | ||||
LCL8_name=quartet[7], | LCL8_name=quartet[7], | ||||
child_name=quartet[5], | child_name=quartet[5], | ||||
cluster_config=cluster_config, | cluster_config=cluster_config, | ||||
disk_size=disk_size | disk_size=disk_size | ||||
} | } | ||||
call zipIndex.zipIndex as LCL5zipIndex { | |||||
input: | |||||
vcf=LCL5mendelian.trio_vcf, | |||||
sample="LCL5", | |||||
family_name=quartet[8], | |||||
cluster_config=cluster_config, | |||||
disk_size=disk_size | |||||
} | |||||
call zipIndex.zipIndex as LCL6zipIndex { | |||||
input: | |||||
vcf=LCL6mendelian.trio_vcf, | |||||
sample="LCL6", | |||||
family_name=quartet[8], | |||||
cluster_config=cluster_config, | |||||
disk_size=disk_size | |||||
} | |||||
call VCFrename.VCFrename as LCL5VCFrename { | |||||
input: | |||||
trio_vcf_gz=LCL5zipIndex.vcf_gz, | |||||
trio_vcf_idx=LCL5zipIndex.vcf_idx, | |||||
mother_name=quartet[7], | |||||
father_name=quartet[6], | |||||
child_name=quartet[4], | |||||
family_name=quartet[8], | |||||
child="LCL5", | |||||
cluster_config=cluster_config, | |||||
disk_size=disk_size | |||||
} | |||||
call VCFrename.VCFrename as LCL6VCFrename { | |||||
input: | |||||
trio_vcf_gz=LCL6zipIndex.vcf_gz, | |||||
trio_vcf_idx=LCL6zipIndex.vcf_idx, | |||||
mother_name=quartet[7], | |||||
father_name=quartet[6], | |||||
child_name=quartet[5], | |||||
family_name=quartet[8], | |||||
child="LCL6", | |||||
cluster_config=cluster_config, | |||||
disk_size=disk_size | |||||
} | |||||
call mergeSister.mergeSister as mergeSister { | |||||
input: | |||||
LCL5_trio_vcf_gz=LCL5VCFrename.rename_trio_vcf_gz, | |||||
LCL5_trio_vcf_idx=LCL5VCFrename.rename_trio_vcf_idx, | |||||
LCL6_trio_vcf_gz=LCL6VCFrename.rename_trio_vcf_gz, | |||||
LCL6_trio_vcf_idx=LCL6VCFrename.rename_trio_vcf_idx, | |||||
family_name=quartet[8], | |||||
cluster_config=cluster_config, | |||||
disk_size=disk_size | |||||
} | |||||
call reformVCF.reformVCF as reformVCF { | |||||
input: | |||||
family_mendelian_info=mergeSister.family_mendelian_info, | |||||
family_name=quartet[8], | |||||
cluster_config=cluster_config, | |||||
disk_size=disk_size | |||||
} | |||||
call zipIndex.zipIndex as LCL5familyzipIndex { | |||||
input: | |||||
vcf=reformVCF.LCL5_family_info, | |||||
sample='LCL5', | |||||
family_name=quartet[8], | |||||
cluster_config=cluster_config, | |||||
disk_size=disk_size | |||||
} | |||||
call zipIndex.zipIndex as LCL6familyzipIndex { | |||||
input: | |||||
vcf=reformVCF.LCL6_family_info, | |||||
sample='LCL6', | |||||
family_name=quartet[8], | |||||
cluster_config=cluster_config, | |||||
disk_size=disk_size | |||||
} | |||||
call zipIndex.zipIndex as LCL7familyzipIndex { | |||||
input: | |||||
vcf=reformVCF.LCL7_family_info, | |||||
sample='LCL7', | |||||
family_name=quartet[8], | |||||
cluster_config=cluster_config, | |||||
disk_size=disk_size | |||||
} | |||||
call zipIndex.zipIndex as LCL8familyzipIndex { | |||||
input: | |||||
vcf=reformVCF.LCL8_family_info, | |||||
sample='LCL8', | |||||
family_name=quartet[8], | |||||
cluster_config=cluster_config, | |||||
disk_size=disk_size | |||||
} | |||||
} | } | ||||
call merge.merge as LCL5merge { | |||||
input: | |||||
family_vcf_gz=LCL5familyzipIndex.vcf_gz, | |||||
family_vcf_idx=LCL5familyzipIndex.vcf_idx, | |||||
sample="LCL5", | |||||
test_name=test_name, | |||||
cluster_config=cluster_config, | |||||
disk_size=disk_size | |||||
} | |||||
} | } |