Explorar el Código

first commit

master
LUYAO REN hace 5 años
commit
3c47bf15dd
Se han modificado 37 ficheros con 2176 adiciones y 0 borrados
  1. +103
    -0
      README.md
  2. BIN
      codescripts/.DS_Store
  3. +37
    -0
      codescripts/Indel_bed.py
  4. +72
    -0
      codescripts/bed_region.py
  5. +67
    -0
      codescripts/cluster.sh
  6. +92
    -0
      codescripts/extract_vcf_information.py
  7. +47
    -0
      codescripts/filter_indel_over_50.py
  8. +416
    -0
      codescripts/high_confidence_call_vote.py
  9. +5
    -0
      codescripts/linux_command.sh
  10. +129
    -0
      codescripts/merge_mendelian_vcfinfo.py
  11. +71
    -0
      codescripts/merge_two_family.py
  12. +109
    -0
      codescripts/oneClass.py
  13. +144
    -0
      codescripts/reformVCF.py
  14. +36
    -0
      codescripts/vcf2bed.py
  15. +295
    -0
      codescripts/voted_by_vcfinfo_mendelianinfo.py
  16. +1
    -0
      family.tsv
  17. +8
    -0
      inputs
  18. BIN
      pictures/.DS_Store
  19. BIN
      pictures/workflow.png
  20. BIN
      tasks/.DS_Store
  21. +32
    -0
      tasks/VCFinfo.wdl
  22. +31
    -0
      tasks/VCFrename.wdl
  23. +27
    -0
      tasks/bed_annotation.wdl
  24. +26
    -0
      tasks/final_result.wdl
  25. +34
    -0
      tasks/mendelian.wdl
  26. +28
    -0
      tasks/merge.wdl
  27. +34
    -0
      tasks/mergeSister.wdl
  28. +25
    -0
      tasks/mergeVCFInfo.wdl
  29. +24
    -0
      tasks/merge_info.wdl
  30. +39
    -0
      tasks/oneClass.wdl
  31. +38
    -0
      tasks/reformVCF.wdl
  32. +51
    -0
      tasks/sister.wdl
  33. +26
    -0
      tasks/two_family_merge.wdl
  34. +33
    -0
      tasks/variantsNorm.wdl
  35. +41
    -0
      tasks/votes.wdl
  36. +24
    -0
      tasks/zipIndex.wdl
  37. +31
    -0
      workflow.wdl

+ 103
- 0
README.md Ver fichero

#高置信突变位点的整合

> Author: Run Luyao
>
> E-mail:18110700050@fudan.edu.cn
>
> Git:http://choppy.3steps.cn/renluyao/high_confidence_calls_manuscript.git
>
> Last Updates: 18/03/2020

## 安装指南

```bash
# 激活choppy环境
source activate choppy
# 安装app
choppy install renluyao/high_confidence_calls_manuscript
```

## App概述

中华家系1号全基因组高置信small variants(SNVs和Indels)的整合流程。



## 流程与参数

####1. variantsNorm

保留chr1-22,X上的突变

用bcftools norm进行突变格式的统一

####2. mendelian

LCL5、LCL7和LCL8为三口之家,进行trio-analysis,分析符合孟德尔和不符合孟德尔遗传规律的突变位点

LCL6、LCL7和LCL8为三口之家,进行trio-analysis,分析符合孟德尔和不符合孟德尔遗传规律的突变位点

得到LCL5和LCL6两个家系合并的vcf文件

####3. zipIndex

对LCL5和LCL6两个家系合并的文件压缩和检索引

####4. VCFrename

将VBT的输出结果,VCF文件中的MOTHER FATHER CHILD改成对应的样本名

####5. mergeSister

将LCL5和LCL6修改过名字后的家系VCF文件合并

####6. reformVCF

根据两个三口之家和姐妹的孟德尔遗传的信息,将之前和合并的VCF分解成4个人的vcf,并且包含了家系遗传的信息

####7. familyzipIndex

将上一步输出的4个文件进行压缩和检索引

####8. merge

将所有注释后的LCL5 vcf文件合并

将所有注释后的LCL6 vcf文件合并

将所有注释后的LCL7 vcf文件合并

将所有注释后的LCL8 vcf文件合并

####9. vote

投票选择高置信的突变位点

t

## App输入变量与输入文件

inputSamplesFile的格式如下

```bash
#LCL5_VCF #LCL6_VCF #LCL7_VCF #LCL8_VCF #LCL5_sampleName #LCL6_sampleName #LCL7_sampleName #LCL8_sampleName #familyName
```

最终版的整合文件包括:



## App输出文件



## 结果展示与解读



## CHANGELOG



## FAQ


BIN
codescripts/.DS_Store Ver fichero


+ 37
- 0
codescripts/Indel_bed.py Ver fichero

import pandas as pd
import sys, argparse, os
mut = pd.read_table('/mnt/pgx_src_data_pool_4/home/renluyao/manuscript/MIE/vcf/mutation_type',header=None)
outIndel = open(sys.argv[1],'w')
for row in mut.itertuples():
if ',' in row._4:
alt_seq = row._4.split(',')
alt_len = [len(i) for i in alt_seq]
alt = max(alt_len)
else:
alt = len(row._4)
ref = row._3
pos = row._2
if len(ref) == 1 and alt == 1:
pass
elif len(ref) > alt:
StartPos = int(pos) - 1
EndPos = int(pos) + (len(ref) - 1)
outline_indel = row._1 + '\t' + str(StartPos) + '\t' + str(EndPos) + '\n'
outIndel.write(outline_indel)
elif alt > len(ref):
StartPos = int(pos) - 1
EndPos = int(pos) + (alt - 1)
outline_indel = row._1 + '\t' + str(StartPos) + '\t' + str(EndPos) + '\n'
outIndel.write(outline_indel)
elif len(ref) == alt:
StartPos = int(pos) - 1
EndPos = int(pos) + (alt - 1)
outline_indel = row._1 + '\t' + str(StartPos) + '\t' + str(EndPos) + '\n'
outIndel.write(outline_indel)








+ 72
- 0
codescripts/bed_region.py Ver fichero

import pandas as pd
import sys, argparse, os
mut = mut = pd.read_table('/mnt/pgx_src_data_pool_4/home/renluyao/manuscript/benchmark_calls/vcf/mutation_type',header=None)
vote = pd.read_table('/mnt/pgx_src_data_pool_4/home/renluyao/manuscript/benchmark_calls/all_info/benchmark.vote.mendelian.txt',header=None)
merged_df = pd.merge(vote, mut, how='inner', left_on=[0,1], right_on = [0,1])
outFile = open(sys.argv[1],'w')
outIndel = open(sys.argv[2],'w')
for row in merged_df.itertuples():
#d5
if ',' in row._7:
d5 = row._7.split(',')
d5_len = [len(i) for i in d5]
d5_alt = max(d5_len)
else:
d5_alt = len(row._7)
#d6
if ',' in row._15:
d6 = row._15.split(',')
d6_len = [len(i) for i in d6]
d6_alt = max(d6_len)
else:
d6_alt = len(row._15)
#f7
if ',' in row._23:
f7 = row._23.split(',')
f7_len = [len(i) for i in f7]
f7_alt = max(f7_len)
else:
f7_alt = len(row._23)
#m8
if ',' in row._31:
m8 = row._31.split(',')
m8_len = [len(i) for i in m8]
m8_alt = max(m8_len)
else:
m8_alt = len(row._31)
all_length = [d5_alt,d6_alt,f7_alt,m8_alt]
alt = max(all_length)
ref = row._35
pos = int(row._2)
if len(ref) == 1 and alt == 1:
StartPos = int(pos) -1
EndPos = int(pos)
cate = 'SNV'
elif len(ref) > alt:
StartPos = int(pos) - 1
EndPos = int(pos) + (len(ref) - 1)
cate = 'INDEL'
outline_indel = row._1 + '\t' + str(StartPos) + '\t' + str(EndPos) + '\n'
outIndel.write(outline_indel)
elif alt > len(ref):
StartPos = int(pos) - 1
EndPos = int(pos) + (alt - 1)
cate = 'INDEL'
outline_indel = row._1 + '\t' + str(StartPos) + '\t' + str(EndPos) + '\n'
outIndel.write(outline_indel)
elif len(ref) == alt:
StartPos = int(pos) - 1
EndPos = int(pos) + (alt - 1)
cate = 'INDEL'
outline_indel = row._1 + '\t' + str(StartPos) + '\t' + str(EndPos) + '\n'
outIndel.write(outline_indel)
outline = row._1 + '\t' + str(StartPos) + '\t' + str(EndPos) + '\t' + str(row._2) + '\t' + cate + '\n'
outFile.write(outline)









+ 67
- 0
codescripts/cluster.sh Ver fichero

cat benchmark.men.vote.diffbed.lengthlessthan50.txt | awk '{print $1"\t"$2"\t"".""\t"$35"\t"$7"\t.\t.\t.\tGT\t"$6}' | grep -v '0/0' > LCL5.body
cat benchmark.men.vote.diffbed.lengthlessthan50.txt | awk '{print $1"\t"$2"\t"".""\t"$35"\t"$15"\t.\t.\t.\tGT\t"$14}' | grep -v '0/0' > LCL6.body
cat benchmark.men.vote.diffbed.lengthlessthan50.txt | awk '{print $1"\t"$2"\t"".""\t"$35"\t"$23"\t.\t.\t.\tGT\t"$22}' | grep -v '0/0'> LCL7.body
cat benchmark.men.vote.diffbed.lengthlessthan50.txt | awk '{print $1"\t"$2"\t"".""\t"$35"\t"$31"\t.\t.\t.\tGT\t"$30}'| grep -v '0/0' > LCL8.body
cat header5 LCL5.body > LCL5.beforediffbed.vcf
cat header6 LCL6.body > LCL6.beforediffbed.vcf
cat header7 LCL7.body > LCL7.beforediffbed.vcf
cat header8 LCL8.body > LCL8.beforediffbed.vcf
rtg bgzip *beforediffbed.vcf
rtg index *beforediffbed.vcf.gz

rtg vcffilter -i LCL5.beforediffbed.vcf.gz --exclude-bed=/mnt/pgx_src_data_pool_4/home/renluyao/manuscript/benchmark_calls/MIE/diff.merged.bed -o LCL5.afterfilterdiffbed.vcf.gz
rtg vcffilter -i LCL6.beforediffbed.vcf.gz --exclude-bed=/mnt/pgx_src_data_pool_4/home/renluyao/manuscript/benchmark_calls/MIE/diff.merged.bed -o LCL6.afterfilterdiffbed.vcf.gz
rtg vcffilter -i LCL7.beforediffbed.vcf.gz --exclude-bed=/mnt/pgx_src_data_pool_4/home/renluyao/manuscript/benchmark_calls/MIE/diff.merged.bed -o LCL7.afterfilterdiffbed.vcf.gz
rtg vcffilter -i LCL8.beforediffbed.vcf.gz --exclude-bed=/mnt/pgx_src_data_pool_4/home/renluyao/manuscript/benchmark_calls/MIE/diff.merged.bed -o LCL8.afterfilterdiffbed.vcf.gz

/mnt/pgx_src_data_pool_4/home/renluyao/softwares/annovar/table_annovar.pl LCL5.beforediffbed.vcf.gz /mnt/pgx_src_data_pool_4/home/renluyao/softwares/annovar/humandb \
-buildver hg38 \
-out LCL5 \
-remove \
-protocol 1000g2015aug_all,1000g2015aug_afr,1000g2015aug_amr,1000g2015aug_eas,1000g2015aug_eur,1000g2015aug_sas,clinvar_20190305,gnomad211_genome \
-operation f,f,f,f,f,f,f,f \
-nastring . \
-vcfinput \
--thread 8

rtg vcfeval -b /mnt/pgx_src_data_pool_4/home/renluyao/Quartet/GIAB/NA12878_HG001/HG001_GRCh38_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_highconf_PGandRTGphasetransfer.vcf.gz -c LCL5.afterfilterdiffbed.vcf.gz -o LCL5_NIST -t /mnt/pgx_src_data_pool_4/home/renluyao/annotation/hg38/GRCh38.d1.vd1.sdf/
rtg vcfeval -b /mnt/pgx_src_data_pool_4/home/renluyao/Quartet/GIAB/NA12878_HG001/HG001_GRCh38_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_highconf_PGandRTGphasetransfer.vcf.gz -c LCL6.afterfilterdiffbed.vcf.gz -o LCL6_NIST -t /mnt/pgx_src_data_pool_4/home/renluyao/annotation/hg38/GRCh38.d1.vd1.sdf/
rtg vcfeval -b /mnt/pgx_src_data_pool_4/home/renluyao/Quartet/GIAB/NA12878_HG001/HG001_GRCh38_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_highconf_PGandRTGphasetransfer.vcf.gz -c LCL7.afterfilterdiffbed.vcf.gz -o LCL7_NIST -t /mnt/pgx_src_data_pool_4/home/renluyao/annotation/hg38/GRCh38.d1.vd1.sdf/
rtg vcfeval -b /mnt/pgx_src_data_pool_4/home/renluyao/Quartet/GIAB/NA12878_HG001/HG001_GRCh38_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_highconf_PGandRTGphasetransfer.vcf.gz -c LCL8.afterfilterdiffbed.vcf.gz -o LCL8_NIST -t /mnt/pgx_src_data_pool_4/home/renluyao/annotation/hg38/GRCh38.d1.vd1.sdf/


zcat LCL5.afterfilterdiffbed.vcf.gz | grep -v '#' | awk '{ if ((length($4) == 1) && (length($5) == 1)) { print } }' | wc -l
zcat LCL6.afterfilterdiffbed.vcf.gz | grep -v '#' | awk '{ if ((length($4) == 1) && (length($5) == 1)) { print } }' | wc -l
zcat LCL7.afterfilterdiffbed.vcf.gz | grep -v '#' | awk '{ if ((length($4) == 1) && (length($5) == 1)) { print } }' | wc -l
zcat LCL8.afterfilterdiffbed.vcf.gz | grep -v '#' | awk '{ if ((length($4) == 1) && (length($5) == 1)) { print } }' | wc -l


zcat LCL5.afterfilterdiffbed.vcf.gz | grep -v '#' | awk '{ if ((length($4) == 1) && (length($5) < 11) && (length($5) > 1)) { print } }' | wc -l
zcat LCL6.afterfilterdiffbed.vcf.gz | grep -v '#' | awk '{ if ((length($4) == 1) && (length($5) < 11) && (length($5) > 1)) { print } }' | wc -l
zcat LCL7.afterfilterdiffbed.vcf.gz | grep -v '#' | awk '{ if ((length($4) == 1) && (length($5) < 11) && (length($5) > 1)) { print } }' | wc -l
zcat LCL8.afterfilterdiffbed.vcf.gz | grep -v '#' | awk '{ if ((length($4) == 1) && (length($5) < 11) && (length($5) > 1)) { print } }' | wc -l


zcat LCL5.afterfilterdiffbed.vcf.gz | grep -v '#' | awk '{ if ((length($4) == 1) && (length($5) > 10)) { print } }' | wc -l
zcat LCL6.afterfilterdiffbed.vcf.gz | grep -v '#' | awk '{ if ((length($4) == 1) && (length($5) > 10)) { print } }' | wc -l
zcat LCL7.afterfilterdiffbed.vcf.gz | grep -v '#' | awk '{ if ((length($4) == 1) && (length($5) > 10)) { print } }' | wc -l
zcat LCL8.afterfilterdiffbed.vcf.gz | grep -v '#' | awk '{ if ((length($4) == 1) && (length($5) > 10)) { print } }' | wc -l

bedtools subtract -a LCL5.27.homo_ref.consensus.bed -b /mnt/pgx_src_data_pool_4/home/renluyao/manuscript/benchmark_calls/MIE/diff.merged.bed > LCL5.27.homo_ref.consensus.filtereddiffbed.bed
bedtools subtract -a LCL6.27.homo_ref.consensus.bed -b /mnt/pgx_src_data_pool_4/home/renluyao/manuscript/benchmark_calls/MIE/diff.merged.bed > LCL6.27.homo_ref.consensus.filtereddiffbed.bed
bedtools subtract -a LCL7.27.homo_ref.consensus.bed -b /mnt/pgx_src_data_pool_4/home/renluyao/manuscript/benchmark_calls/MIE/diff.merged.bed > LCL7.27.homo_ref.consensus.filtereddiffbed.bed
bedtools subtract -a LCL8.27.homo_ref.consensus.bed -b /mnt/pgx_src_data_pool_4/home/renluyao/manuscript/benchmark_calls/MIE/diff.merged.bed > LCL8.27.homo_ref.consensus.filtereddiffbed.bed


python vcf2bed.py LCL5.body LCL5.variants.bed
python vcf2bed.py LCL6.body LCL6.variants.bed
python vcf2bed.py LCL7.body LCL7.variants.bed
python vcf2bed.py LCL8.body LCL8.variants.bed



cat /mnt/pgx_src_data_pool_4/home/renluyao/manuscript/benchmark_calls/all_info/LCL5.variants.bed | cut -f1,11,12 | cat - LCL5.27.homo_ref.consensus.filtereddiffbed.bed | sort -k1,1 -k2,2n > LCL5.high.confidence.bed





+ 92
- 0
codescripts/extract_vcf_information.py Ver fichero

import sys,getopt
import os
import re
import fileinput
import pandas as pd

def usage():
print(
"""
Usage: python extract_vcf_information.py -i input_merged_vcf_file -o parsed_file

This script will extract SNVs and Indels information from the vcf files and output a tab-delimited files.

Input:
-i the selected vcf file

Output:
-o tab-delimited parsed file
""")

# select supported small variants
def process(oneLine):
line = oneLine.rstrip()
strings = line.strip().split('\t')
infoParsed = parse_INFO(strings[7])
formatKeys = strings[8].split(':')
formatValues = strings[9].split(':')
for i in range(0,len(formatKeys) -1) :
if formatKeys[i] == 'AD':
ra = formatValues[i].split(',')
infoParsed['RefDP'] = ra[0]
infoParsed['AltDP'] = ra[1]
if (int(ra[1]) + int(ra[0])) != 0:
infoParsed['af'] = float(int(ra[1])/(int(ra[1]) + int(ra[0])))
else:
pass
else:
infoParsed[formatKeys[i]] = formatValues[i]
infoParsed['chromo'] = strings[0]
infoParsed['pos'] = strings[1]
infoParsed['id'] = strings[2]
infoParsed['ref'] = strings[3]
infoParsed['alt'] = strings[4]
infoParsed['qual'] = strings[5]
return infoParsed


def parse_INFO(info):
strings = info.strip().split(';')
keys = []
values = []
for i in strings:
kv = i.split('=')
if kv[0] == 'DB':
keys.append('DB')
values.append('1')
elif kv[0] == 'AF':
pass
else:
keys.append(kv[0])
values.append(kv[1])
infoDict = dict(zip(keys, values))
return infoDict

opts,args = getopt.getopt(sys.argv[1:],"hi:o:")
for op,value in opts:
if op == "-i":
inputFile=value
elif op == "-o":
outputFile=value
elif op == "-h":
usage()
sys.exit()

if len(sys.argv[1:]) < 3:
usage()
sys.exit()

allDict = []
for line in fileinput.input(inputFile):
m = re.match('^\#',line)
if m is not None:
pass
else:
oneDict = process(line)
allDict.append(oneDict)

allTable = pd.DataFrame(allDict)

allTable.to_csv(outputFile,sep='\t',index=False)


+ 47
- 0
codescripts/filter_indel_over_50.py Ver fichero

import sys,getopt
from itertools import islice

over_50_outfile = open("indel_lenth_over_50.txt",'w')
less_50_outfile = open("benchmark.men.vote.diffbed.lengthlessthan50.txt","w")

def process(line):
strings = line.strip().split('\t')
#d5
if ',' in strings[6]:
d5 = strings[6].split(',')
d5_len = [len(i) for i in d5]
d5_alt = max(d5_len)
else:
d5_alt = len(strings[6])
#d6
if ',' in strings[14]:
d6 = strings[14].split(',')
d6_len = [len(i) for i in d6]
d6_alt = max(d6_len)
else:
d6_alt = len(strings[14])
#f7
if ',' in strings[22]:
f7 = strings[22].split(',')
f7_len = [len(i) for i in f7]
f7_alt = max(f7_len)
else:
f7_alt = len(strings[22])
#m8
if ',' in strings[30]:
m8 = strings[30].split(',')
m8_len = [len(i) for i in m8]
m8_alt = max(m8_len)
else:
m8_alt = len(strings[30])
#ref
ref_len = len(strings[34])
if (d5_alt > 50) or (d6_alt > 50) or (f7_alt > 50) or (m8_alt > 50) or (ref_len > 50):
over_50_outfile.write(line)
else:
less_50_outfile.write(line)


input_file = open(sys.argv[1])
for line in islice(input_file, 1, None):
process(line)

+ 416
- 0
codescripts/high_confidence_call_vote.py Ver fichero

from __future__ import division
import sys, argparse, os
import fileinput
import re
import pandas as pd
from operator import itemgetter
from collections import Counter
from itertools import islice
from numpy import *
import statistics

# input arguments
parser = argparse.ArgumentParser(description="this script is to count voting number")

parser.add_argument('-vcf', '--multi_sample_vcf', type=str, help='The VCF file you want to count the voting number', required=True)
parser.add_argument('-dup', '--dup_list', type=str, help='Duplication list', required=True)
parser.add_argument('-sample', '--sample_name', type=str, help='which sample of quartet', required=True)
parser.add_argument('-prefix', '--prefix', type=str, help='Prefix of output file name', required=True)

args = parser.parse_args()
multi_sample_vcf = args.multi_sample_vcf
dup_list = args.dup_list
prefix = args.prefix
sample_name = args.sample_name

vcf_header = '''##fileformat=VCFv4.2
##fileDate=20200331
##source=high_confidence_calls_intergration(choppy app)
##reference=GRCh38.d1.vd1
##INFO=<ID=location,Number=1,Type=String,Description="Repeat region">
##INFO=<ID=DETECTED,Number=1,Type=Integer,Description="Number of detected votes">
##INFO=<ID=VOTED,Number=1,Type=Integer,Description="Number of consnesus votes">
##INFO=<ID=FAM,Number=1,Type=Integer,Description="Number mendelian consisitent votes">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Sum depth of all samples">
##FORMAT=<ID=ALT,Number=1,Type=Integer,Description="Sum alternative depth of all samples">
##FORMAT=<ID=AF,Number=1,Type=Float,Description="Allele frequency, sum alternative depth / sum depth">
##FORMAT=<ID=GQ,Number=1,Type=Float,Description="Average genotype quality">
##FORMAT=<ID=QD,Number=1,Type=Float,Description="Average Variant Confidence/Quality by Depth">
##FORMAT=<ID=MQ,Number=1,Type=Float,Description="Average mapping quality">
##FORMAT=<ID=FS,Number=1,Type=Float,Description="Average Phred-scaled p-value using Fisher's exact test to detect strand bias">
##FORMAT=<ID=QUALI,Number=1,Type=Float,Description="Average variant quality">
##contig=<ID=chr1,length=248956422>
##contig=<ID=chr2,length=242193529>
##contig=<ID=chr3,length=198295559>
##contig=<ID=chr4,length=190214555>
##contig=<ID=chr5,length=181538259>
##contig=<ID=chr6,length=170805979>
##contig=<ID=chr7,length=159345973>
##contig=<ID=chr8,length=145138636>
##contig=<ID=chr9,length=138394717>
##contig=<ID=chr10,length=133797422>
##contig=<ID=chr11,length=135086622>
##contig=<ID=chr12,length=133275309>
##contig=<ID=chr13,length=114364328>
##contig=<ID=chr14,length=107043718>
##contig=<ID=chr15,length=101991189>
##contig=<ID=chr16,length=90338345>
##contig=<ID=chr17,length=83257441>
##contig=<ID=chr18,length=80373285>
##contig=<ID=chr19,length=58617616>
##contig=<ID=chr20,length=64444167>
##contig=<ID=chr21,length=46709983>
##contig=<ID=chr22,length=50818468>
##contig=<ID=chrX,length=156040895>
'''

vcf_header_all_sample = '''##fileformat=VCFv4.2
##fileDate=20200331
##reference=GRCh38.d1.vd1
##INFO=<ID=location,Number=1,Type=String,Description="Repeat region">
##INFO=<ID=DUP,Number=1,Type=Flag,Description="Duplicated variant records">
##INFO=<ID=DETECTED,Number=1,Type=Integer,Description="Number of detected votes">
##INFO=<ID=VOTED,Number=1,Type=Integer,Description="Number of consnesus votes">
##INFO=<ID=FAM,Number=1,Type=Integer,Description="Number mendelian consisitent votes">
##INFO=<ID=ALL_ALT,Number=1,Type=Float,Description="Sum of alternative reads of all samples">
##INFO=<ID=ALL_DP,Number=1,Type=Float,Description="Sum of depth of all samples">
##INFO=<ID=ALL_AF,Number=1,Type=Float,Description="Allele frequency of net alternatice reads and net depth">
##INFO=<ID=GQ_MEAN,Number=1,Type=Float,Description="Mean of genotype quality of all samples">
##INFO=<ID=QD_MEAN,Number=1,Type=Float,Description="Average Variant Confidence/Quality by Depth">
##INFO=<ID=MQ_MEAN,Number=1,Type=Float,Description="Mean of mapping quality of all samples">
##INFO=<ID=FS_MEAN,Number=1,Type=Float,Description="Average Phred-scaled p-value using Fisher's exact test to detect strand bias">
##INFO=<ID=QUAL_MEAN,Number=1,Type=Float,Description="Average variant quality">
##INFO=<ID=PCR,Number=1,Type=String,Description="Consensus of PCR votes">
##INFO=<ID=PCR_FREE,Number=1,Type=String,Description="Consensus of PCR-free votes">
##INFO=<ID=CONSENSUS,Number=1,Type=String,Description="Consensus calls">
##INFO=<ID=CONSENSUS_SEQ,Number=1,Type=String,Description="Consensus sequence">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=DP,Number=1,Type=String,Description="Depth">
##FORMAT=<ID=ALT,Number=1,Type=Integer,Description="Alternative Depth">
##FORMAT=<ID=AF,Number=1,Type=String,Description="Allele frequency">
##FORMAT=<ID=GQ,Number=1,Type=String,Description="Genotype quality">
##FORMAT=<ID=MQ,Number=1,Type=String,Description="Mapping quality">
##FORMAT=<ID=TWINS,Number=1,Type=String,Description="1 is twins shared, 0 is twins discordant ">
##FORMAT=<ID=TRIO5,Number=1,Type=String,Description="1 is LCL7, LCL8 and LCL5 mendelian consistent, 0 is mendelian vioaltion">
##FORMAT=<ID=TRIO6,Number=1,Type=String,Description="1 is LCL7, LCL8 and LCL6 mendelian consistent, 0 is mendelian vioaltion">
##contig=<ID=chr1,length=248956422>
##contig=<ID=chr2,length=242193529>
##contig=<ID=chr3,length=198295559>
##contig=<ID=chr4,length=190214555>
##contig=<ID=chr5,length=181538259>
##contig=<ID=chr6,length=170805979>
##contig=<ID=chr7,length=159345973>
##contig=<ID=chr8,length=145138636>
##contig=<ID=chr9,length=138394717>
##contig=<ID=chr10,length=133797422>
##contig=<ID=chr11,length=135086622>
##contig=<ID=chr12,length=133275309>
##contig=<ID=chr13,length=114364328>
##contig=<ID=chr14,length=107043718>
##contig=<ID=chr15,length=101991189>
##contig=<ID=chr16,length=90338345>
##contig=<ID=chr17,length=83257441>
##contig=<ID=chr18,length=80373285>
##contig=<ID=chr19,length=58617616>
##contig=<ID=chr20,length=64444167>
##contig=<ID=chr21,length=46709983>
##contig=<ID=chr22,length=50818468>
##contig=<ID=chrX,length=156040895>
'''
# read in duplication list
dup = pd.read_table(dup_list,header=None)
var_dup = dup[0].tolist()

# output file
benchmark_file_name = prefix + '_voted.vcf'
benchmark_outfile = open(benchmark_file_name,'w')

all_sample_file_name = prefix + '_all_sample_information.vcf'
all_sample_outfile = open(all_sample_file_name,'w')

# write VCF
outputcolumn = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t' + sample_name + '_benchmark_calls\n'
benchmark_outfile.write(vcf_header)
benchmark_outfile.write(outputcolumn)

outputcolumn_all_sample = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t'+ \
'Quartet_DNA_BGI_SEQ2000_BGI_1_20180518\tQuartet_DNA_BGI_SEQ2000_BGI_2_20180530\tQuartet_DNA_BGI_SEQ2000_BGI_3_20180530\t' + \
'Quartet_DNA_BGI_T7_WGE_1_20191105\tQuartet_DNA_BGI_T7_WGE_2_20191105\tQuartet_DNA_BGI_T7_WGE_3_20191105\t' + \
'Quartet_DNA_ILM_Nova_ARD_1_20181108\tQuartet_DNA_ILM_Nova_ARD_2_20181108\tQuartet_DNA_ILM_Nova_ARD_3_20181108\t' + \
'Quartet_DNA_ILM_Nova_ARD_4_20190111\tQuartet_DNA_ILM_Nova_ARD_5_20190111\tQuartet_DNA_ILM_Nova_ARD_6_20190111\t' + \
'Quartet_DNA_ILM_Nova_BRG_1_20180930\tQuartet_DNA_ILM_Nova_BRG_2_20180930\tQuartet_DNA_ILM_Nova_BRG_3_20180930\t' + \
'Quartet_DNA_ILM_Nova_WUX_1_20190917\tQuartet_DNA_ILM_Nova_WUX_2_20190917\tQuartet_DNA_ILM_Nova_WUX_3_20190917\t' + \
'Quartet_DNA_ILM_XTen_ARD_1_20170403\tQuartet_DNA_ILM_XTen_ARD_2_20170403\tQuartet_DNA_ILM_XTen_ARD_3_20170403\t' + \
'Quartet_DNA_ILM_XTen_NVG_1_20170329\tQuartet_DNA_ILM_XTen_NVG_2_20170329\tQuartet_DNA_ILM_XTen_NVG_3_20170329\t' + \
'Quartet_DNA_ILM_XTen_WUX_1_20170216\tQuartet_DNA_ILM_XTen_WUX_2_20170216\tQuartet_DNA_ILM_XTen_WUX_3_20170216\n'
all_sample_outfile.write(vcf_header_all_sample)
all_sample_outfile.write(outputcolumn_all_sample)



#function
def replace_nan(strings_list):
updated_list = []
for i in strings_list:
if i == '.':
updated_list.append('.:.:.:.:.:.:.:.:.:.:.:.')
else:
updated_list.append(i)
return updated_list

def remove_dot(strings_list):
updated_list = []
for i in strings_list:
if i == '.':
pass
else:
updated_list.append(i)
return updated_list

def detected_number(strings):
gt = [x.split(':')[0] for x in strings]
percentage = 27 - gt.count('.')
return(str(percentage))

def vote_number(strings,consensus_call):
gt = [x.split(':')[0] for x in strings]
gt = [x.replace('.','0/0') for x in gt]
gt = list(map(gt_uniform,[i for i in gt]))
vote_num = gt.count(consensus_call)
return(str(vote_num))

def family_vote(strings,consensus_call):
gt = [x.split(':')[0] for x in strings]
gt = [x.replace('.','0/0') for x in gt]
gt = list(map(gt_uniform,[i for i in gt]))
mendelian = [':'.join(x.split(':')[1:4]) for x in strings]
indices = [i for i, x in enumerate(gt) if x == consensus_call]
matched_mendelian = itemgetter(*indices)(mendelian)
mendelian_num = matched_mendelian.count('1:1:1')
return(str(mendelian_num))

def gt_uniform(strings):
uniformed_gt = ''
allele1 = strings.split('/')[0]
allele2 = strings.split('/')[1]
if int(allele1) > int(allele2):
uniformed_gt = allele2 + '/' + allele1
else:
uniformed_gt = allele1 + '/' + allele2
return uniformed_gt

def decide_by_rep(strings):
consensus_rep = ''
mendelian = [':'.join(x.split(':')[1:4]) for x in strings]
gt = [x.split(':')[0] for x in strings]
gt = [x.replace('.','0/0') for x in gt]
# modified gt turn 2/1 to 1/2
gt = list(map(gt_uniform,[i for i in gt]))
# mendelian consistent?
mendelian_dict = Counter(mendelian)
highest_mendelian = mendelian_dict.most_common(1)
candidate_mendelian = highest_mendelian[0][0]
freq_mendelian = highest_mendelian[0][1]
if (candidate_mendelian == '1:1:1') and (freq_mendelian >= 2):
gt_num_dict = Counter(gt)
highest_gt = gt_num_dict.most_common(1)
candidate_gt = highest_gt[0][0]
freq_gt = highest_gt[0][1]
if (candidate_gt != '0/0') and (freq_gt >= 2):
consensus_rep = candidate_gt
elif (candidate_gt == '0/0') and (freq_gt >= 2):
consensus_rep = '0/0'
else:
consensus_rep = 'inconGT'
elif (candidate_mendelian == '') and (freq_mendelian >= 2):
consensus_rep = 'noInfo'
else:
consensus_rep = 'inconMen'
return consensus_rep


def main():
for line in fileinput.input(multi_sample_vcf):
headline = re.match('^\#',line)
if headline is not None:
pass
else:
line = line.strip()
strings = line.split('\t')
variant_id = '_'.join([strings[0],strings[1]])
# check if the variants location is duplicated
if variant_id in var_dup:
strings[7] = strings[7] + ';DUP'
outLine = '\t'.join(strings) + '\n'
all_sample_outfile.write(outLine)
else:
# pre-define
pcr_consensus = '.'
pcr_free_consensus = '.'
consensus_call = '.'
consensus_alt_seq = '.'
# pcr
strings[9:] = replace_nan(strings[9:])
pcr = itemgetter(*[9,10,11,27,28,29,30,31,32,33,34,35])(strings)
SEQ2000 = decide_by_rep(pcr[0:3])
XTen_ARD = decide_by_rep(pcr[3:6])
XTen_NVG = decide_by_rep(pcr[6:9])
XTen_WUX = decide_by_rep(pcr[9:12])
sequence_site = [SEQ2000,XTen_ARD,XTen_NVG,XTen_WUX]
sequence_dict = Counter(sequence_site)
highest_sequence = sequence_dict.most_common(1)
candidate_sequence = highest_sequence[0][0]
freq_sequence = highest_sequence[0][1]
if freq_sequence > 2:
pcr_consensus = candidate_sequence
else:
pcr_consensus = 'inconSequenceSite'
# pcr-free
pcr_free = itemgetter(*[12,13,14,15,16,17,18,19,20,21,22,23,24,25,26])(strings)
T7_WGE = decide_by_rep(pcr_free[0:3])
Nova_ARD_1 = decide_by_rep(pcr_free[3:6])
Nova_ARD_2 = decide_by_rep(pcr_free[6:9])
Nova_BRG = decide_by_rep(pcr_free[9:12])
Nova_WUX = decide_by_rep(pcr_free[12:15])
sequence_site = [T7_WGE,Nova_ARD_1,Nova_ARD_2,Nova_BRG,Nova_WUX]
highest_sequence = sequence_dict.most_common(1)
candidate_sequence = highest_sequence[0][0]
freq_sequence = highest_sequence[0][1]
if freq_sequence > 3:
pcr_free_consensus = candidate_sequence
else:
pcr_free_consensus = 'inconSequenceSite'
# pcr and pcr-free
tag = ['inconGT','noInfo','inconMen','inconSequenceSite']
if (pcr_consensus == pcr_free_consensus) and (pcr_consensus not in tag) and (pcr_consensus != '0/0'):
consensus_call = pcr_consensus
VOTED = vote_number(strings[9:],consensus_call)
strings[7] = strings[7] + ';VOTED=' + VOTED
DETECTED = detected_number(strings[9:])
strings[7] = strings[7] + ';DETECTED=' + DETECTED
FAM = family_vote(strings[9:],consensus_call)
strings[7] = strings[7] + ';FAM=' + FAM
# Delete multiple alternative genotype to necessary expression
alt = strings[4]
alt_gt = alt.split(',')
if len(alt_gt) > 1:
allele1 = consensus_call.split('/')[0]
allele2 = consensus_call.split('/')[1]
if allele1 == '0':
allele2_seq = alt_gt[int(allele2) - 1]
consensus_alt_seq = allele2_seq
consensus_call = '0/1'
else:
allele1_seq = alt_gt[int(allele1) - 1]
allele2_seq = alt_gt[int(allele2) - 1]
if int(allele1) > int(allele2):
consensus_alt_seq = allele2_seq + ',' + allele1_seq
consensus_call = '1/2'
elif int(allele1) < int(allele2):
consensus_alt_seq = allele1_seq + ',' + allele2_seq
consensus_call = '1/2'
else:
consensus_alt_seq = allele1_seq
consensus_call = '1/1'
else:
consensus_alt_seq = alt
# GT:DP:ALT:AF:GQ:QD:MQ:FS:QUAL
# GT:TWINS:TRIO5:TRIO6:DP:ALT:AF:GQ:QD:MQ:FS:QUAL:rawGT
# DP
DP = [x.split(':')[4] for x in strings[9:]]
DP = remove_dot(DP)
DP = [int(x) for x in DP]
ALL_DP = sum(DP)
# AF
ALT = [x.split(':')[5] for x in strings[9:]]
ALT = remove_dot(ALT)
ALT = [int(x) for x in ALT]
ALL_ALT = sum(ALT)
ALL_AF = round(ALL_ALT/ALL_DP,2)
# GQ
GQ = [x.split(':')[7] for x in strings[9:]]
GQ = remove_dot(GQ)
GQ = [int(x) for x in GQ]
GQ_MEAN = round(mean(GQ),2)
# QD
QD = [x.split(':')[8] for x in strings[9:]]
QD = remove_dot(QD)
QD = [float(x) for x in QD]
QD_MEAN = round(mean(QD),2)
# MQ
MQ = [x.split(':')[9] for x in strings[9:]]
MQ = remove_dot(MQ)
MQ = [float(x) for x in MQ]
MQ_MEAN = round(mean(MQ),2)
# FS
FS = [x.split(':')[10] for x in strings[9:]]
FS = remove_dot(FS)
FS = [float(x) for x in FS]
FS_MEAN = round(mean(FS),2)
# QUAL
QUAL = [x.split(':')[11] for x in strings[9:]]
QUAL = remove_dot(QUAL)
QUAL = [float(x) for x in QUAL]
QUAL_MEAN = round(mean(QUAL),2)
# benchmark output
output_format = consensus_call + ':' + str(ALL_DP) + ':' + str(ALL_ALT) + ':' + str(ALL_AF) + ':' + str(GQ_MEAN) + ':' + str(QD_MEAN) + ':' + str(MQ_MEAN) + ':' + str(FS_MEAN) + ':' + str(QUAL_MEAN)
outLine = strings[0] + '\t' + strings[1] + '\t' + strings[2] + '\t' + strings[3] + '\t' + consensus_alt_seq + '\t' + '.' + '\t' + '.' + '\t' + strings[7] + '\t' + 'GT:DP:ALT:AF:GQ:QD:MQ:FS:QUAL' + '\t' + output_format + '\n'
benchmark_outfile.write(outLine)
# all sample output
strings[7] = strings[7] + ';ALL_ALT=' + str(ALL_ALT) + ';ALL_DP=' + str(ALL_DP) + ';ALL_AF=' + str(ALL_AF) \
+ ';GQ_MEAN=' + str(GQ_MEAN) + ';QD_MEAN=' + str(QD_MEAN) + ';MQ_MEAN=' + str(MQ_MEAN) + ';FS_MEAN=' + str(FS_MEAN) \
+ ';QUAL_MEAN=' + str(QUAL_MEAN) + ';PCR=' + consensus_call + ';PCR_FREE=' + consensus_call + ';CONSENSUS=' + consensus_call \
+ ';CONSENSUS_SEQ=' + consensus_alt_seq
all_sample_outLine = '\t'.join(strings) + '\n'
all_sample_outfile.write(all_sample_outLine)
elif (pcr_consensus in tag) and (pcr_free_consensus in tag):
consensus_call = 'filtered'
DETECTED = detected_number(strings[9:])
strings[7] = strings[7] + ';DETECTED=' + DETECTED
strings[7] = strings[7] + ';CONSENSUS=' + consensus_call
all_sample_outLine = '\t'.join(strings) + '\n'
all_sample_outfile.write(all_sample_outLine)
elif ((pcr_consensus == '0/0') or (pcr_consensus in tag)) and ((pcr_free_consensus not in tag) and (pcr_free_consensus != '0/0')):
consensus_call = 'pcr-free-speicifc'
DETECTED = detected_number(strings[9:])
strings[7] = strings[7] + ';DETECTED=' + DETECTED
strings[7] = strings[7] + ';CONSENSUS=' + consensus_call
all_sample_outLine = '\t'.join(strings) + '\n'
all_sample_outfile.write(all_sample_outLine)
elif ((pcr_consensus != '0/0') or (pcr_consensus not in tag)) and ((pcr_free_consensus in tag) and (pcr_free_consensus == '0/0')):
consensus_call = 'pcr-speicifc'
DETECTED = detected_number(strings[9:])
strings[7] = strings[7] + ';DETECTED=' + DETECTED
strings[7] = strings[7] + ';CONSENSUS=' + consensus_call + ';PCR=' + pcr_consensus + ';PCR_FREE=' + pcr_free_consensus
all_sample_outLine = '\t'.join(strings) + '\n'
all_sample_outfile.write(all_sample_outLine)
elif (pcr_consensus == '0/0') and (pcr_free_consensus == '0/0'):
consensus_call = 'confirm for parents'
DETECTED = detected_number(strings[9:])
strings[7] = strings[7] + ';DETECTED=' + DETECTED
strings[7] = strings[7] + ';CONSENSUS=' + consensus_call
all_sample_outLine = '\t'.join(strings) + '\n'
all_sample_outfile.write(all_sample_outLine)
else:
consensus_call = 'filtered'
DETECTED = detected_number(strings[9:])
strings[7] = strings[7] + ';DETECTED=' + DETECTED
strings[7] = strings[7] + ';CONSENSUS=' + consensus_call
all_sample_outLine = '\t'.join(strings) + '\n'
all_sample_outfile.write(all_sample_outLine)

if __name__ == '__main__':
main()













+ 5
- 0
codescripts/linux_command.sh Ver fichero

cat benchmark.men.vote.diffbed.filtered | awk '{print $1"\t"$2"\t"".""\t"$35"\t"$7"\t.\t.\t.\tGT\t"$6}' | grep -v '2_y' > LCL5.body
cat benchmark.men.vote.diffbed.filtered | awk '{print $1"\t"$2"\t"".""\t"$35"\t"$15"\t.\t.\t.\tGT\t"$14}' | grep -v '2_y' > LCL6.body
cat benchmark.men.vote.diffbed.filtered | awk '{print $1"\t"$2"\t"".""\t"$35"\t"$23"\t.\t.\t.\tGT\t"$22}' | grep -v '2_y' > LCL7.body
cat benchmark.men.vote.diffbed.filtered | awk '{print $1"\t"$2"\t"".""\t"$35"\t"$31"\t.\t.\t.\tGT\t"$30}' | grep -v '2_y' > LCL8.body


+ 129
- 0
codescripts/merge_mendelian_vcfinfo.py Ver fichero

from __future__ import division
import pandas as pd
import sys, argparse, os
import fileinput
import re

# input arguments
parser = argparse.ArgumentParser(description="this script is to get final high confidence calls and information of all replicates")

parser.add_argument('-vcfInfo', '--vcfInfo', type=str, help='The txt file of variants information, this file is named as prefix__variant_quality_location.txt', required=True)
parser.add_argument('-mendelianInfo', '--mendelianInfo', type=str, help='The merged mendelian information of all samples', required=True)
parser.add_argument('-sample', '--sample_name', type=str, help='which sample of quartet', required=True)


args = parser.parse_args()
vcfInfo = args.vcfInfo
mendelianInfo = args.mendelianInfo
sample_name = args.sample_name


#GT:TWINS:TRIO5:TRIO6:DP:AF:GQ:QD:MQ:FS:QUAL

vcf_header = '''##fileformat=VCFv4.2
##fileDate=20200331
##source=high_confidence_calls_intergration(choppy app)
##reference=GRCh38.d1.vd1
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=TWINS,Number=1,Type=Flag,Description="1 for sister consistent, 0 for sister different">
##FORMAT=<ID=TRIO5,Number=1,Type=Flag,Description="1 for LCL7, LCL8 and LCL5 mendelian consistent, 0 for family violation">
##FORMAT=<ID=TRIO6,Number=1,Type=Flag,Description="1 for LCL7, LCL8 and LCL6 mendelian consistent, 0 for family violation">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Depth">
##FORMAT=<ID=ALT,Number=1,Type=Integer,Description="Alternative Depth">
##FORMAT=<ID=AF,Number=1,Type=Float,Description="Allele frequency">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype quality">
##FORMAT=<ID=QD,Number=1,Type=Float,Description="Variant Confidence/Quality by Depth">
##FORMAT=<ID=MQ,Number=1,Type=Float,Description="Mapping quality">
##FORMAT=<ID=FS,Number=1,Type=Float,Description="Phred-scaled p-value using Fisher's exact test to detect strand bias">
##FORMAT=<ID=QUAL,Number=1,Type=Float,Description="variant quality">
##contig=<ID=chr1,length=248956422>
##contig=<ID=chr2,length=242193529>
##contig=<ID=chr3,length=198295559>
##contig=<ID=chr4,length=190214555>
##contig=<ID=chr5,length=181538259>
##contig=<ID=chr6,length=170805979>
##contig=<ID=chr7,length=159345973>
##contig=<ID=chr8,length=145138636>
##contig=<ID=chr9,length=138394717>
##contig=<ID=chr10,length=133797422>
##contig=<ID=chr11,length=135086622>
##contig=<ID=chr12,length=133275309>
##contig=<ID=chr13,length=114364328>
##contig=<ID=chr14,length=107043718>
##contig=<ID=chr15,length=101991189>
##contig=<ID=chr16,length=90338345>
##contig=<ID=chr17,length=83257441>
##contig=<ID=chr18,length=80373285>
##contig=<ID=chr19,length=58617616>
##contig=<ID=chr20,length=64444167>
##contig=<ID=chr21,length=46709983>
##contig=<ID=chr22,length=50818468>
##contig=<ID=chrX,length=156040895>
'''

# output file
file_name = sample_name + '_mendelian_vcfInfo.vcf'
outfile = open(file_name,'w')

outputcolumn = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t' + sample_name + '\n'
outfile.write(vcf_header)
outfile.write(outputcolumn)

# input files
vcf_info = pd.read_table(vcfInfo)
mendelian_info = pd.read_table(mendelianInfo)

merged_df = pd.merge(vcf_info, mendelian_info, how='outer', left_on=['#CHROM','POS'], right_on = ['#CHROM','POS'])
merged_df = merged_df.fillna('.')

#
def parse_INFO(info):
strings = info.strip().split(';')
keys = []
values = []
for i in strings:
kv = i.split('=')
if kv[0] == 'DB':
keys.append('DB')
values.append('1')
else:
keys.append(kv[0])
values.append(kv[1])
infoDict = dict(zip(keys, values))
return infoDict
#
for row in merged_df.itertuples():
if row[18] != '.':
# format
# GT:TWINS:TRIO5:TRIO6:DP:AF:GQ:QD:MQ:FS:QUAL
FORMAT_x = row[10].split(':')
ALT = int(FORMAT_x[1].split(',')[1])
if int(FORMAT_x[2]) != 0:
AF = round(ALT/int(FORMAT_x[2]),2)
else:
AF = '.'
INFO_x = parse_INFO(row.INFO_x)
if FORMAT_x[2] == '0':
INFO_x['QD'] = '.'
else:
pass
FORMAT = row[18] + ':' + FORMAT_x[2] + ':' + str(ALT) + ':' + str(AF) + ':' + FORMAT_x[3] + ':' + INFO_x['QD'] + ':' + INFO_x['MQ'] + ':' + INFO_x['FS'] + ':' + str(row.QUAL_x)
# outline
outline = row._1 + '\t' + str(row.POS) + '\t' + row.ID_x + '\t' + row.REF_y + '\t' + row.ALT_y + '\t' + '.' + '\t' + '.' + '\t' + '.' + '\t' + 'GT:TWINS:TRIO5:TRIO6:DP:ALT:AF:GQ:QD:MQ:FS:QUAL' + '\t' + FORMAT + '\n'
else:
rawGT = row[10].split(':')
FORMAT_x = row[10].split(':')
ALT = int(FORMAT_x[1].split(',')[1])
if int(FORMAT_x[2]) != 0:
AF = round(ALT/int(FORMAT_x[2]),2)
else:
AF = '.'
INFO_x = parse_INFO(row.INFO_x)
if FORMAT_x[2] == '0':
INFO_x['QD'] = '.'
else:
pass
FORMAT = '.:.:.:.' + ':' + FORMAT_x[2] + ':' + str(ALT) + ':' + str(AF) + ':' + FORMAT_x[3] + ':' + INFO_x['QD'] + ':' + INFO_x['MQ'] + ':' + INFO_x['FS'] + ':' + str(row.QUAL_x) + ':' + rawGT[0]
# outline
outline = row._1 + '\t' + str(row.POS) + '\t' + row.ID_x + '\t' + row.REF_x + '\t' + row.ALT_x + '\t' + '.' + '\t' + '.' + '\t' + '.' + '\t' + 'GT:TWINS:TRIO5:TRIO6:DP:ALT:AF:GQ:QD:MQ:FS:QUAL:rawGT' + '\t' + FORMAT + '\n'
outfile.write(outline)

+ 71
- 0
codescripts/merge_two_family.py Ver fichero

from __future__ import division
import pandas as pd
import sys, argparse, os
import fileinput
import re

# input arguments
parser = argparse.ArgumentParser(description="this script is to extract mendelian concordance information")

parser.add_argument('-LCL5', '--LCL5', type=str, help='LCL5 family info', required=True)
parser.add_argument('-LCL6', '--LCL6', type=str, help='LCL6 family info', required=True)
parser.add_argument('-family', '--family', type=str, help='family name', required=True)


args = parser.parse_args()
lcl5 = args.LCL5
lcl6 = args.LCL6
family = args.family


# output file
family_name = family + '.txt'

family_file = open(family_name,'w')

# input files
lcl5_dat = pd.read_table(lcl5)
lcl6_dat = pd.read_table(lcl6)

merged_df = pd.merge(lcl5_dat, lcl6_dat, how='outer', left_on=['#CHROM','POS'], right_on = ['#CHROM','POS'])

def alt_seq(alt, genotype):
if genotype == './.':
seq = './.'
elif genotype == '0/0':
seq = '0/0'
else:
alt = alt.split(',')
genotype = genotype.split('/')
if genotype[0] == '0':
allele2 = alt[int(genotype[1]) - 1]
seq = '0/' + allele2
else:
allele1 = alt[int(genotype[0]) - 1]
allele2 = alt[int(genotype[1]) - 1]
seq = allele1 + '/' + allele2
return seq

for row in merged_df.itertuples():
# correction of multiallele
if pd.isnull(row.INFO_x) == True or pd.isnull(row.INFO_y) == True:
mendelian = '.'
else:
lcl5_seq = alt_seq(row.ALT_x, row.CHILD_x)
lcl6_seq = alt_seq(row.ALT_y, row.CHILD_y)
if lcl5_seq == lcl6_seq:
mendelian = '1'
else:
mendelian = '0'
if pd.isnull(row.INFO_x) == True:
mendelian = mendelian + ':.'
else:
mendelian = mendelian + ':' + row.INFO_x.split('=')[1]
if pd.isnull(row.INFO_y) == True:
mendelian = mendelian + ':.'
else:
mendelian = mendelian + ':' + row.INFO_y.split('=')[1]


outline = row._1 + '\t' + str(row.POS) + '\t' + mendelian + '\n'
family_file.write(outline)

+ 109
- 0
codescripts/oneClass.py Ver fichero

# import modules
import numpy as np
import pandas as pd
from sklearn import svm
from sklearn import preprocessing
import sys, argparse, os
from vcf2bed import position_to_bed,padding_region



parser = argparse.ArgumentParser(description="this script is to preform one calss svm on each chromosome")

parser.add_argument('-train', '--trainDataset', type=str, help='training dataset generated from extracting vcf information part, with mutaitons supported by callsets', required=True)
parser.add_argument('-test', '--testDataset', type=str, help='testing dataset generated from extracting vcf information part, with mutaitons not called by all callsets', required=True)
parser.add_argument('-name', '--sampleName', type=str, help='sample name for output file name', required=True)

args = parser.parse_args()

# Rename input:
train_input = args.trainDataset
test_input = args.testDataset
sample_name = args.sampleName

# default columns, which will be included in the included in the calssifier
chromosome = ['chr1','chr2','chr3','chr4','chr5','chr6','chr7','chr8','chr9','chr10','chr11','chr12','chr13','chr14','chr15' ,'chr16','chr17','chr18','chr19','chr20','chr21','chr22','chrX','chrY']
feature_heter_cols = ['AltDP','BaseQRankSum','DB','DP','FS','GQ','MQ','MQRankSum','QD','ReadPosRankSum','RefDP','SOR','af']
feature_homo_cols = ['AltDP','DB','DP','FS','GQ','MQ','QD','RefDP','SOR','af']


# import datasets sepearate the records with or without BaseQRankSum annotation, etc.
def load_dat(dat_file_name):
dat = pd.read_table(dat_file_name)
dat['DB'] = dat['DB'].fillna(0)
dat = dat[dat['DP'] != 0]
dat['af'] = dat['AltDP']/(dat['AltDP'] + dat['RefDP'])
homo_rows = dat[dat['BaseQRankSum'].isnull()]
heter_rows = dat[dat['BaseQRankSum'].notnull()]
return homo_rows,heter_rows


train_homo,train_heter = load_dat(train_input)
test_homo,test_heter = load_dat(test_input)
clf = svm.OneClassSVM(nu=0.05,kernel='rbf', gamma='auto_deprecated',cache_size=500)

def prepare_dat(train_dat,test_dat,feature_cols,chromo):
chr_train = train_dat[train_dat['chromo'] == chromo]
chr_test = test_dat[test_dat['chromo'] == chromo]
train_dat = chr_train.loc[:,feature_cols]
test_dat = chr_test.loc[:,feature_cols]
train_dat_scaled = preprocessing.scale(train_dat)
test_dat_scaled = preprocessing.scale(test_dat)
return chr_test,train_dat_scaled,test_dat_scaled

def oneclass(X_train,X_test,chr_test):
clf.fit(X_train)
y_pred_test = clf.predict(X_test)
test_true_dat = chr_test[y_pred_test == 1]
test_false_dat = chr_test[y_pred_test == -1]
return test_true_dat,test_false_dat

predicted_true = pd.DataFrame(columns=train_homo.columns)
predicted_false = pd.DataFrame(columns=train_homo.columns)

for chromo in chromosome:
# homo datasets
chr_test_homo,X_train_homo,X_test_homo = prepare_dat(train_homo,test_homo,feature_homo_cols,chromo)
test_true_homo,test_false_homo = oneclass(X_train_homo,X_test_homo,chr_test_homo)
predicted_true = predicted_true.append(test_true_homo)
predicted_false = predicted_false.append(test_false_homo)
# heter datasets
chr_test_heter,X_train_heter,X_test_heter = prepare_dat(train_heter,test_heter,feature_heter_cols,chromo)
test_true_heter,test_false_heter = oneclass(X_train_heter,X_test_heter,chr_test_heter)
predicted_true = predicted_true.append(test_true_heter)
predicted_false = predicted_false.append(test_false_heter)

predicted_true_filename = sample_name + '_predicted_true.txt'
predicted_false_filename = sample_name + '_predicted_false.txt'

predicted_true.to_csv(predicted_true_filename,sep='\t',index=False)
predicted_false.to_csv(predicted_false_filename,sep='\t',index=False)

# output the bed file and padding bed region 50bp

predicted_true_bed_filename = sample_name + '_predicted_true.bed'
predicted_false_bed_filename = sample_name + '_predicted_false.bed'
padding_filename = sample_name + '_padding.bed'

predicted_true_bed = open(predicted_true_bed_filename,'w')
predicted_false_bed = open(predicted_false_bed_filename,'w')
padding = open(padding_filename,'w')

#
for index,row in predicted_false.iterrows():
chromo,pos1,pos2 = position_to_bed(row['chromo'],row['pos'],row['ref'],row['alt'])
outline_pos = chromo + '\t' + str(pos1) + '\t' + str(pos2) + '\n'
predicted_false_bed.write(outline_pos)
chromo,pad_pos1,pad_pos2,pad_pos3,pad_pos4 = padding_region(chromo,pos1,pos2,50)
outline_pad_1 = chromo + '\t' + str(pad_pos1) + '\t' + str(pad_pos2) + '\n'
outline_pad_2 = chromo + '\t' + str(pad_pos3) + '\t' + str(pad_pos4) + '\n'
padding.write(outline_pad_1)
padding.write(outline_pad_2)

for index,row in predicted_true.iterrows():
chromo,pos1,pos2 = position_to_bed(row['chromo'],row['pos'],row['ref'],row['alt'])
outline_pos = chromo + '\t' + str(pos1) + '\t' + str(pos2) + '\n'
predicted_true_bed.write(outline_pos)



+ 144
- 0
codescripts/reformVCF.py Ver fichero

# import modules
import sys, argparse, os
import fileinput
import re

parser = argparse.ArgumentParser(description="This script is to split samples in VCF files and rewrite to the right style")

parser.add_argument('-vcf', '--familyVCF', type=str, help='VCF with sister and mendelian infomation', required=True)
parser.add_argument('-name', '--familyName', type=str, help='Family name of the VCF file', required=True)

args = parser.parse_args()

# Rename input:
inputFile = args.familyVCF
family_name = args.familyName

# output filename
LCL5_name = family_name + '.LCL5.vcf'
LCL5file = open(LCL5_name,'w')
LCL6_name = family_name + '.LCL6.vcf'
LCL6file = open(LCL6_name,'w')
LCL7_name = family_name + '.LCL7.vcf'
LCL7file = open(LCL7_name,'w')
LCL8_name = family_name + '.LCL8.vcf'
LCL8file = open(LCL8_name,'w')
family_filename = family_name + '.vcf'
familyfile = open(family_filename,'w')

# default columns, which will be included in the included in the calssifier
vcfheader = '''##fileformat=VCFv4.2
##FILTER=<ID=PASS,Description="the same genotype between twin sister and mendelian consistent in 578 and 678">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=TWINS,Number=0,Type=Flag,Description="0 for sister consistent, 1 for sister inconsistent">
##FORMAT=<ID=TRIO5,Number=0,Type=Flag,Description="0 for trio consistent, 1 for trio inconsistent">
##FORMAT=<ID=TRIO6,Number=0,Type=Flag,Description="0 for trio consistent, 1 for trio inconsistent">
##contig=<ID=chr1,length=248956422>
##contig=<ID=chr2,length=242193529>
##contig=<ID=chr3,length=198295559>
##contig=<ID=chr4,length=190214555>
##contig=<ID=chr5,length=181538259>
##contig=<ID=chr6,length=170805979>
##contig=<ID=chr7,length=159345973>
##contig=<ID=chr8,length=145138636>
##contig=<ID=chr9,length=138394717>
##contig=<ID=chr10,length=133797422>
##contig=<ID=chr11,length=135086622>
##contig=<ID=chr12,length=133275309>
##contig=<ID=chr13,length=114364328>
##contig=<ID=chr14,length=107043718>
##contig=<ID=chr15,length=101991189>
##contig=<ID=chr16,length=90338345>
##contig=<ID=chr17,length=83257441>
##contig=<ID=chr18,length=80373285>
##contig=<ID=chr19,length=58617616>
##contig=<ID=chr20,length=64444167>
##contig=<ID=chr21,length=46709983>
##contig=<ID=chr22,length=50818468>
##contig=<ID=chrX,length=156040895>
'''
# write VCF
LCL5colname = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t'+family_name+'_LCL5'+'\n'
LCL5file.write(vcfheader)
LCL5file.write(LCL5colname)

LCL6colname = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t'+family_name+'_LCL6'+'\n'
LCL6file.write(vcfheader)
LCL6file.write(LCL6colname)

LCL7colname = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t'+family_name+'_LCL7'+'\n'
LCL7file.write(vcfheader)
LCL7file.write(LCL7colname)

LCL8colname = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t'+family_name+'_LCL8'+'\n'
LCL8file.write(vcfheader)
LCL8file.write(LCL8colname)

familycolname = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t'+'LCL5\t'+'LCL6\t'+'LCL7\t'+'LCL8'+'\n'
familyfile.write(vcfheader)
familyfile.write(familycolname)

# reform VCF
def process(oneLine):
line = oneLine.rstrip()
strings = line.strip().split('\t')
# replace .
# LCL6 uniq
if strings[11] == '.':
strings[11] = '0/0'
strings[9] = strings[12]
strings[10] = strings[13]
else:
pass
# LCL5 uniq
if strings[14] == '.':
strings[14] = '0/0'
strings[12] = strings[9]
strings[13] = strings[10]
else:
pass
# sister
if strings[11] == strings[14]:
add_format = ":1"
else:
add_format = ":0"
# trioLCL5
if strings[15] == 'MD=1':
add_format = add_format + ":1"
else:
add_format = add_format + ":0"
# trioLCL6
if strings[7] == 'MD=1':
add_format = add_format + ":1"
else:
add_format = add_format + ":0"
# filter
if (strings[11] == strings[14]) and (strings[15] == 'MD=1') and (strings[7] == 'MD=1'):
strings[6] = 'PASS'
else:
strings[6] = '.'
# output LCL5
LCL5outLine = strings[0]+'\t'+strings[1]+'\t'+strings[2]+'\t'+strings[3]+'\t'+strings[4]+'\t'+'.'+'\t'+strings[6]+'\t'+ '.' +'\t'+ 'GT:TWINS:TRIO5:TRIO6' + '\t' + strings[14] + add_format + '\n'
LCL5file.write(LCL5outLine)
# output LCL6
LCL6outLine = strings[0]+'\t'+strings[1]+'\t'+strings[2]+'\t'+strings[3]+'\t'+strings[4]+'\t'+'.'+'\t'+strings[6]+'\t'+ '.' +'\t'+ 'GT:TWINS:TRIO5:TRIO6' + '\t' + strings[11] + add_format + '\n'
LCL6file.write(LCL6outLine)
# output LCL7
LCL7outLine = strings[0]+'\t'+strings[1]+'\t'+strings[2]+'\t'+strings[3]+'\t'+strings[4]+'\t'+'.'+'\t'+strings[6]+'\t'+ '.' +'\t'+ 'GT:TWINS:TRIO5:TRIO6' + '\t' + strings[10] + add_format + '\n'
LCL7file.write(LCL7outLine)
# output LCL8
LCL8outLine = strings[0]+'\t'+strings[1]+'\t'+strings[2]+'\t'+strings[3]+'\t'+strings[4]+'\t'+'.'+'\t'+strings[6]+'\t'+ '.' +'\t'+ 'GT:TWINS:TRIO5:TRIO6' + '\t' + strings[9] + add_format + '\n'
LCL8file.write(LCL8outLine)
# output family
familyoutLine = strings[0]+'\t'+strings[1]+'\t'+strings[2]+'\t'+strings[3]+'\t'+strings[4]+'\t'+ '.'+'\t'+strings[6]+'\t'+ '.' +'\t'+ 'GT:TWINS:TRIO5:TRIO6' + '\t' + strings[14] + add_format +'\t' + strings[11] + add_format + '\t' + strings[10] + add_format +'\t' + strings[9] + add_format + '\n'
familyfile.write(familyoutLine)


for line in fileinput.input(inputFile):
m = re.match('^\#',line)
if m is not None:
pass
else:
process(line)


+ 36
- 0
codescripts/vcf2bed.py Ver fichero

import re

def position_to_bed(chromo,pos,ref,alt):
# snv
# Start cooridinate BED = start coordinate VCF - 1
# End cooridinate BED = start coordinate VCF

if len(ref) == 1 and len(alt) == 1:
StartPos = int(pos) -1
EndPos = int(pos)
# deletions
# Start cooridinate BED = start coordinate VCF - 1
# End cooridinate BED = start coordinate VCF + (reference length - alternate length)

elif len(ref) > 1 and len(alt) == 1:
StartPos = int(pos) - 1
EndPos = int(pos) + (len(ref) - 1)
#insertions
# For insertions:
# Start cooridinate BED = start coordinate VCF - 1
# End cooridinate BED = start coordinate VCF + (alternate length - reference length)

else:
StartPos = int(pos) - 1
EndPos = int(pos) + (len(alt) - 1)

return chromo,StartPos,EndPos

def padding_region(chromo,pos1,pos2,padding):
StartPos1 = pos1 - padding
EndPos1 = pos1
StartPos2 = pos2
EndPos2 = pos2 + padding
return chromo,StartPos1,EndPos1,StartPos2,EndPos2

+ 295
- 0
codescripts/voted_by_vcfinfo_mendelianinfo.py Ver fichero

from __future__ import division
from glob import glob
import sys, argparse, os
import fileinput
import re
import pandas as pd
from operator import itemgetter
from collections import Counter
from itertools import islice
from numpy import *
import statistics

# input arguments
parser = argparse.ArgumentParser(description="this script is to merge mendelian and vcfinfo, and extract high_confidence_calls")

parser.add_argument('-folder', '--folder', type=str, help='directory that holds all the mendelian info', required=True)
parser.add_argument('-vcf', '--vcf', type=str, help='merged multiple sample vcf', required=True)


args = parser.parse_args()
folder = args.folder
vcf = args.vcf

# input files
folder = folder + '/*.txt'
filenames = glob(folder)
dataframes = []
for filename in filenames:
dataframes.append(pd.read_table(filename,header=None))

dfs = [df.set_index([0, 1]) for df in dataframes]
merged_mendelian = pd.concat(dfs, axis=1).reset_index()
family_name = [i.split('/')[-1].replace('.txt','') for i in filenames]
columns = ['CHROM','POS'] + family_name
merged_mendelian.columns = columns

vcf_dat = pd.read_table(vcf)

merged_df = pd.merge(merged_mendelian, vcf_dat, how='outer', left_on=['CHROM','POS'], right_on = ['#CHROM','POS'])
merged_df = merged_df.fillna('nan')

vcf_header = '''##fileformat=VCFv4.2
##fileDate=20200501
##source=high_confidence_calls_intergration(choppy app)
##reference=GRCh38.d1.vd1
##INFO=<ID=VOTED,Number=1,Type=Integer,Description="Number mendelian consisitent votes">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Sum depth of all samples">
##FORMAT=<ID=ALT,Number=1,Type=Integer,Description="Sum alternative depth of all samples">
##contig=<ID=chr1,length=248956422>
##contig=<ID=chr2,length=242193529>
##contig=<ID=chr3,length=198295559>
##contig=<ID=chr4,length=190214555>
##contig=<ID=chr5,length=181538259>
##contig=<ID=chr6,length=170805979>
##contig=<ID=chr7,length=159345973>
##contig=<ID=chr8,length=145138636>
##contig=<ID=chr9,length=138394717>
##contig=<ID=chr10,length=133797422>
##contig=<ID=chr11,length=135086622>
##contig=<ID=chr12,length=133275309>
##contig=<ID=chr13,length=114364328>
##contig=<ID=chr14,length=107043718>
##contig=<ID=chr15,length=101991189>
##contig=<ID=chr16,length=90338345>
##contig=<ID=chr17,length=83257441>
##contig=<ID=chr18,length=80373285>
##contig=<ID=chr19,length=58617616>
##contig=<ID=chr20,length=64444167>
##contig=<ID=chr21,length=46709983>
##contig=<ID=chr22,length=50818468>
##contig=<ID=chrX,length=156040895>
'''
# output files
benchmark_LCL5 = open('LCL5_voted.vcf','w')
benchmark_LCL6 = open('LCL6_voted.vcf','w')
benchmark_LCL7 = open('LCL7_voted.vcf','w')
benchmark_LCL8 = open('LCL8_voted.vcf','w')

all_sample_outfile = open('all_sample_information.txt','w')

# write VCF
LCL5_col = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tLCL5_benchmark_calls\n'
LCL6_col = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tLCL6_benchmark_calls\n'
LCL7_col = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tLCL7_benchmark_calls\n'
LCL8_col = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tLCL8_benchmark_calls\n'

benchmark_LCL5.write(vcf_header)
benchmark_LCL5.write(LCL5_col)
benchmark_LCL6.write(vcf_header)
benchmark_LCL6.write(LCL6_col)
benchmark_LCL7.write(vcf_header)
benchmark_LCL7.write(LCL7_col)
benchmark_LCL8.write(vcf_header)
benchmark_LCL8.write(LCL8_col)

# all info
all_info_col = 'CHROM\tPOS\tLCL5_pcr_consensus\tLCL5_pcr_free_consensus\tLCL5_mendelian_num\tLCL5_consensus_call\tLCL5_consensus_alt_seq\tLCL5_alt\tLCL5_dp\tLCL5_detected_num\tLCL6_pcr_consensus\tLCL6_pcr_free_consensus\tLCL6_mendelian_num\tLCL6_consensus_call\tLCL6_consensus_alt_seq\tLCL6_alt\tLCL6_dp\tLCL6_detected_num\tLCL7_pcr_consensus\tLCL7_pcr_free_consensus\tLCL7_mendelian_num\t LCL7_consensus_call\tLCL7_consensus_alt_seq\tLCL7_alt\tLCL7_dp\tLCL7_detected_num\tLCL8_pcr_consensus\tLCL8_pcr_free_consensus\tLCL8_mendelian_num\tLCL8_consensus_call\tLCL8_consensus_alt_seq\tLCL8_alt\tLCL8_dp\tLCL8_detected_num\n'
all_sample_outfile.write(all_info_col)

# function
def decide_by_rep(vcf_list,mendelian_list):
consensus_rep = ''
gt = [x.split(':')[0] for x in vcf_list]
# mendelian consistent?
mendelian_dict = Counter(mendelian_list)
highest_mendelian = mendelian_dict.most_common(1)
candidate_mendelian = highest_mendelian[0][0]
freq_mendelian = highest_mendelian[0][1]
if (candidate_mendelian == '1:1:1') and (freq_mendelian >= 2):
con_loc = [i for i in range(len(mendelian_list)) if mendelian_list[i] == '1:1:1']
gt_con = itemgetter(*con_loc)(gt)
gt_num_dict = Counter(gt_con)
highest_gt = gt_num_dict.most_common(1)
candidate_gt = highest_gt[0][0]
freq_gt = highest_gt[0][1]
if (candidate_gt != './.') and (freq_gt >= 2):
consensus_rep = candidate_gt
elif (candidate_gt == './.') and (freq_gt >= 2):
consensus_rep = 'noGTInfo'
else:
consensus_rep = 'inconGT'
elif (candidate_mendelian == 'nan') and (freq_mendelian >= 2):
consensus_rep = 'noMenInfo'
else:
consensus_rep = 'inconMen'
return consensus_rep

def consensus_call(vcf_info_list,mendelian_list,alt_seq):
pcr_consensus = '.'
pcr_free_consensus = '.'
mendelian_num = '.'
consensus_call = '.'
consensus_alt_seq = '.'
# pcr
SEQ2000 = decide_by_rep(vcf_info_list[0:3],mendelian_list[0:3])
XTen_ARD = decide_by_rep(vcf_info_list[18:21],mendelian_list[18:21])
XTen_NVG = decide_by_rep(vcf_info_list[21:24],mendelian_list[21:24])
XTen_WUX = decide_by_rep(vcf_info_list[24:27],mendelian_list[24:27])
pcr_sequence_site = [SEQ2000,XTen_ARD,XTen_NVG,XTen_WUX]
pcr_sequence_dict = Counter(pcr_sequence_site)
pcr_highest_sequence = pcr_sequence_dict.most_common(1)
pcr_candidate_sequence = pcr_highest_sequence[0][0]
pcr_freq_sequence = pcr_highest_sequence[0][1]
if pcr_freq_sequence > 2:
pcr_consensus = pcr_candidate_sequence
else:
pcr_consensus = 'inconSequenceSite'
# pcr-free
T7_WGE = decide_by_rep(vcf_info_list[3:6],mendelian_list[3:6])
Nova_ARD_1 = decide_by_rep(vcf_info_list[6:9],mendelian_list[6:9])
Nova_ARD_2 = decide_by_rep(vcf_info_list[9:12],mendelian_list[9:12])
Nova_BRG = decide_by_rep(vcf_info_list[12:15],mendelian_list[12:15])
Nova_WUX = decide_by_rep(vcf_info_list[15:18],mendelian_list[15:18])
sequence_site = [T7_WGE,Nova_ARD_1,Nova_ARD_2,Nova_BRG,Nova_WUX]
sequence_dict = Counter(sequence_site)
highest_sequence = sequence_dict.most_common(1)
candidate_sequence = highest_sequence[0][0]
freq_sequence = highest_sequence[0][1]
if freq_sequence > 3:
pcr_free_consensus = candidate_sequence
else:
pcr_free_consensus = 'inconSequenceSite'
# net alt, dp
# alt
AD = [x.split(':')[1] for x in vcf_info_list]
ALT = [x.split(',')[1] for x in AD]
ALT = [int(x) for x in ALT]
ALL_ALT = sum(ALT)
# dp
DP = [x.split(':')[2] for x in vcf_info_list]
DP = [int(x) for x in DP]
ALL_DP = sum(DP)
# detected number
gt = [x.split(':')[0] for x in vcf_info_list]
gt = [x.replace('0/0','.') for x in gt]
gt = [x.replace('./.','.') for x in gt]
detected_num = 27 - gt.count('.')
# decide consensus calls
tag = ['inconGT','noMenInfo','inconMen','inconSequenceSite','noGTInfo']
if (pcr_consensus != '0/0') and (pcr_consensus == pcr_free_consensus) and (pcr_consensus not in tag):
consensus_call = pcr_consensus
gt = [x.split(':')[0] for x in vcf_info_list]
indices = [i for i, x in enumerate(gt) if x == consensus_call]
matched_mendelian = itemgetter(*indices)(mendelian_list)
mendelian_num = matched_mendelian.count('1:1:1')
# Delete multiple alternative genotype to necessary expression
alt_gt = alt_seq.split(',')
if len(alt_gt) > 1:
allele1 = consensus_call.split('/')[0]
allele2 = consensus_call.split('/')[1]
if allele1 == '0':
allele2_seq = alt_gt[int(allele2) - 1]
consensus_alt_seq = allele2_seq
consensus_call = '0/1'
else:
allele1_seq = alt_gt[int(allele1) - 1]
allele2_seq = alt_gt[int(allele2) - 1]
if int(allele1) > int(allele2):
consensus_alt_seq = allele2_seq + ',' + allele1_seq
consensus_call = '1/2'
elif int(allele1) < int(allele2):
consensus_alt_seq = allele1_seq + ',' + allele2_seq
consensus_call = '1/2'
else:
consensus_alt_seq = allele1_seq
consensus_call = '1/1'
else:
consensus_alt_seq = alt_seq
elif (pcr_consensus in tag) and (pcr_free_consensus in tag):
consensus_call = 'filtered'
elif ((pcr_consensus == './.') or (pcr_consensus in tag)) and ((pcr_free_consensus not in tag) and (pcr_free_consensus != './.')):
consensus_call = 'pcr-free-speicifc'
elif ((pcr_consensus != './.') or (pcr_consensus not in tag)) and ((pcr_free_consensus in tag) and (pcr_free_consensus == './.')):
consensus_call = 'pcr-speicifc'
elif (pcr_consensus == '0/0') and (pcr_free_consensus == '0/0'):
consensus_call = '0/0'
else:
consensus_call = 'filtered'
return pcr_consensus, pcr_free_consensus, mendelian_num, consensus_call, consensus_alt_seq, ALL_ALT, ALL_DP, detected_num


for row in merged_df.itertuples():
mendelian_list = [row.Quartet_DNA_BGI_SEQ2000_BGI_1_20180518,row.Quartet_DNA_BGI_SEQ2000_BGI_2_20180530,row.Quartet_DNA_BGI_SEQ2000_BGI_3_20180530, \
row.Quartet_DNA_BGI_T7_WGE_1_20191105,row.Quartet_DNA_BGI_T7_WGE_2_20191105,row.Quartet_DNA_BGI_T7_WGE_3_20191105, \
row.Quartet_DNA_ILM_Nova_ARD_1_20181108,row.Quartet_DNA_ILM_Nova_ARD_2_20181108,row.Quartet_DNA_ILM_Nova_ARD_3_20181108, \
row.Quartet_DNA_ILM_Nova_ARD_4_20190111,row.Quartet_DNA_ILM_Nova_ARD_5_20190111,row.Quartet_DNA_ILM_Nova_ARD_6_20190111, \
row.Quartet_DNA_ILM_Nova_BRG_1_20180930,row.Quartet_DNA_ILM_Nova_BRG_2_20180930,row.Quartet_DNA_ILM_Nova_BRG_3_20180930, \
row.Quartet_DNA_ILM_Nova_WUX_1_20190917,row.Quartet_DNA_ILM_Nova_WUX_2_20190917,row.Quartet_DNA_ILM_Nova_WUX_3_20190917, \
row.Quartet_DNA_ILM_XTen_ARD_1_20170403,row.Quartet_DNA_ILM_XTen_ARD_2_20170403,row.Quartet_DNA_ILM_XTen_ARD_3_20170403, \
row.Quartet_DNA_ILM_XTen_NVG_1_20170329,row.Quartet_DNA_ILM_XTen_NVG_2_20170329,row.Quartet_DNA_ILM_XTen_NVG_3_20170329, \
row.Quartet_DNA_ILM_XTen_WUX_1_20170216,row.Quartet_DNA_ILM_XTen_WUX_2_20170216,row.Quartet_DNA_ILM_XTen_WUX_3_20170216]
lcl5_list = [row.Quartet_DNA_BGI_SEQ2000_BGI_LCL5_1_20180518,row.Quartet_DNA_BGI_SEQ2000_BGI_LCL5_2_20180530,row.Quartet_DNA_BGI_SEQ2000_BGI_LCL5_3_20180530, \
row.Quartet_DNA_BGI_T7_WGE_LCL5_1_20191105,row.Quartet_DNA_BGI_T7_WGE_LCL5_2_20191105,row.Quartet_DNA_BGI_T7_WGE_LCL5_3_20191105, \
row.Quartet_DNA_ILM_Nova_ARD_LCL5_1_20181108,row.Quartet_DNA_ILM_Nova_ARD_LCL5_2_20181108,row.Quartet_DNA_ILM_Nova_ARD_LCL5_3_20181108, \
row.Quartet_DNA_ILM_Nova_ARD_LCL5_4_20190111,row.Quartet_DNA_ILM_Nova_ARD_LCL5_5_20190111,row.Quartet_DNA_ILM_Nova_ARD_LCL5_6_20190111, \
row.Quartet_DNA_ILM_Nova_BRG_LCL5_1_20180930,row.Quartet_DNA_ILM_Nova_BRG_LCL5_2_20180930,row.Quartet_DNA_ILM_Nova_BRG_LCL5_3_20180930, \
row.Quartet_DNA_ILM_Nova_WUX_LCL5_1_20190917,row.Quartet_DNA_ILM_Nova_WUX_LCL5_2_20190917,row.Quartet_DNA_ILM_Nova_WUX_LCL5_3_20190917, \
row.Quartet_DNA_ILM_XTen_ARD_LCL5_1_20170403,row.Quartet_DNA_ILM_XTen_ARD_LCL5_2_20170403,row.Quartet_DNA_ILM_XTen_ARD_LCL5_3_20170403, \
row.Quartet_DNA_ILM_XTen_NVG_LCL5_1_20170329,row.Quartet_DNA_ILM_XTen_NVG_LCL5_2_20170329,row.Quartet_DNA_ILM_XTen_NVG_LCL5_3_20170329, \
row.Quartet_DNA_ILM_XTen_WUX_LCL5_1_20170216,row.Quartet_DNA_ILM_XTen_WUX_LCL5_2_20170216,row.Quartet_DNA_ILM_XTen_WUX_LCL5_3_20170216]
lcl6_list = [row.Quartet_DNA_BGI_SEQ2000_BGI_LCL6_1_20180518,row.Quartet_DNA_BGI_SEQ2000_BGI_LCL6_2_20180530,row.Quartet_DNA_BGI_SEQ2000_BGI_LCL6_3_20180530, \
row.Quartet_DNA_BGI_T7_WGE_LCL6_1_20191105,row.Quartet_DNA_BGI_T7_WGE_LCL6_2_20191105,row.Quartet_DNA_BGI_T7_WGE_LCL6_3_20191105, \
row.Quartet_DNA_ILM_Nova_ARD_LCL6_1_20181108,row.Quartet_DNA_ILM_Nova_ARD_LCL6_2_20181108,row.Quartet_DNA_ILM_Nova_ARD_LCL6_3_20181108, \
row.Quartet_DNA_ILM_Nova_ARD_LCL6_4_20190111,row.Quartet_DNA_ILM_Nova_ARD_LCL6_5_20190111,row.Quartet_DNA_ILM_Nova_ARD_LCL6_6_20190111, \
row.Quartet_DNA_ILM_Nova_BRG_LCL6_1_20180930,row.Quartet_DNA_ILM_Nova_BRG_LCL6_2_20180930,row.Quartet_DNA_ILM_Nova_BRG_LCL6_3_20180930, \
row.Quartet_DNA_ILM_Nova_WUX_LCL6_1_20190917,row.Quartet_DNA_ILM_Nova_WUX_LCL6_2_20190917,row.Quartet_DNA_ILM_Nova_WUX_LCL6_3_20190917, \
row.Quartet_DNA_ILM_XTen_ARD_LCL6_1_20170403,row.Quartet_DNA_ILM_XTen_ARD_LCL6_2_20170403,row.Quartet_DNA_ILM_XTen_ARD_LCL6_3_20170403, \
row.Quartet_DNA_ILM_XTen_NVG_LCL6_1_20170329,row.Quartet_DNA_ILM_XTen_NVG_LCL6_2_20170329,row.Quartet_DNA_ILM_XTen_NVG_LCL6_3_20170329, \
row.Quartet_DNA_ILM_XTen_WUX_LCL6_1_20170216,row.Quartet_DNA_ILM_XTen_WUX_LCL6_2_20170216,row.Quartet_DNA_ILM_XTen_WUX_LCL6_3_20170216]
lcl7_list = [row.Quartet_DNA_BGI_SEQ2000_BGI_LCL7_1_20180518,row.Quartet_DNA_BGI_SEQ2000_BGI_LCL7_2_20180530,row.Quartet_DNA_BGI_SEQ2000_BGI_LCL7_3_20180530, \
row.Quartet_DNA_BGI_T7_WGE_LCL7_1_20191105,row.Quartet_DNA_BGI_T7_WGE_LCL7_2_20191105,row.Quartet_DNA_BGI_T7_WGE_LCL7_3_20191105, \
row.Quartet_DNA_ILM_Nova_ARD_LCL7_1_20181108,row.Quartet_DNA_ILM_Nova_ARD_LCL7_2_20181108,row.Quartet_DNA_ILM_Nova_ARD_LCL7_3_20181108, \
row.Quartet_DNA_ILM_Nova_ARD_LCL7_4_20190111,row.Quartet_DNA_ILM_Nova_ARD_LCL7_5_20190111,row.Quartet_DNA_ILM_Nova_ARD_LCL7_6_20190111, \
row.Quartet_DNA_ILM_Nova_BRG_LCL7_1_20180930,row.Quartet_DNA_ILM_Nova_BRG_LCL7_2_20180930,row.Quartet_DNA_ILM_Nova_BRG_LCL7_3_20180930, \
row.Quartet_DNA_ILM_Nova_WUX_LCL7_1_20190917,row.Quartet_DNA_ILM_Nova_WUX_LCL7_2_20190917,row.Quartet_DNA_ILM_Nova_WUX_LCL7_3_20190917, \
row.Quartet_DNA_ILM_XTen_ARD_LCL7_1_20170403,row.Quartet_DNA_ILM_XTen_ARD_LCL7_2_20170403,row.Quartet_DNA_ILM_XTen_ARD_LCL7_3_20170403, \
row.Quartet_DNA_ILM_XTen_NVG_LCL7_1_20170329,row.Quartet_DNA_ILM_XTen_NVG_LCL7_2_20170329,row.Quartet_DNA_ILM_XTen_NVG_LCL7_3_20170329, \
row.Quartet_DNA_ILM_XTen_WUX_LCL7_1_20170216,row.Quartet_DNA_ILM_XTen_WUX_LCL7_2_20170216,row.Quartet_DNA_ILM_XTen_WUX_LCL7_3_20170216]
lcl8_list = [row.Quartet_DNA_BGI_SEQ2000_BGI_LCL8_1_20180518,row.Quartet_DNA_BGI_SEQ2000_BGI_LCL8_2_20180530,row.Quartet_DNA_BGI_SEQ2000_BGI_LCL8_3_20180530, \
row.Quartet_DNA_BGI_T7_WGE_LCL8_1_20191105,row.Quartet_DNA_BGI_T7_WGE_LCL8_2_20191105,row.Quartet_DNA_BGI_T7_WGE_LCL8_3_20191105, \
row.Quartet_DNA_ILM_Nova_ARD_LCL8_1_20181108,row.Quartet_DNA_ILM_Nova_ARD_LCL8_2_20181108,row.Quartet_DNA_ILM_Nova_ARD_LCL8_3_20181108, \
row.Quartet_DNA_ILM_Nova_ARD_LCL8_4_20190111,row.Quartet_DNA_ILM_Nova_ARD_LCL8_5_20190111,row.Quartet_DNA_ILM_Nova_ARD_LCL8_6_20190111, \
row.Quartet_DNA_ILM_Nova_BRG_LCL8_1_20180930,row.Quartet_DNA_ILM_Nova_BRG_LCL8_2_20180930,row.Quartet_DNA_ILM_Nova_BRG_LCL8_3_20180930, \
row.Quartet_DNA_ILM_Nova_WUX_LCL8_1_20190917,row.Quartet_DNA_ILM_Nova_WUX_LCL8_2_20190917,row.Quartet_DNA_ILM_Nova_WUX_LCL8_3_20190917, \
row.Quartet_DNA_ILM_XTen_ARD_LCL8_1_20170403,row.Quartet_DNA_ILM_XTen_ARD_LCL8_2_20170403,row.Quartet_DNA_ILM_XTen_ARD_LCL8_3_20170403, \
row.Quartet_DNA_ILM_XTen_NVG_LCL8_1_20170329,row.Quartet_DNA_ILM_XTen_NVG_LCL8_2_20170329,row.Quartet_DNA_ILM_XTen_NVG_LCL8_3_20170329, \
row.Quartet_DNA_ILM_XTen_WUX_LCL8_1_20170216,row.Quartet_DNA_ILM_XTen_WUX_LCL8_2_20170216,row.Quartet_DNA_ILM_XTen_WUX_LCL8_3_20170216]
# LCL5
LCL5_pcr_consensus, LCL5_pcr_free_consensus, LCL5_mendelian_num, LCL5_consensus_call, LCL5_consensus_alt_seq, LCL5_alt, LCL5_dp, LCL5_detected_num = consensus_call(lcl5_list,mendelian_list,row.ALT)
if LCL5_mendelian_num != '.':
LCL5_output = row.CHROM + '\t' + str(row.POS) + '\t' + '.' + '\t' + row.REF + '\t' + LCL5_consensus_alt_seq + '\t' + '.' + '\t' + '.' + '\t' +'VOTED=' + str(LCL5_mendelian_num) + '\t' + 'GT:ALT:DP' + '\t' + LCL5_consensus_call + ':' + str(LCL5_alt) + ':' + str(LCL5_dp) + '\n'
benchmark_LCL5.write(LCL5_output)
# LCL6
LCL6_pcr_consensus, LCL6_pcr_free_consensus, LCL6_mendelian_num, LCL6_consensus_call, LCL6_consensus_alt_seq, LCL6_alt, LCL6_dp, LCL6_detected_num = consensus_call(lcl6_list,mendelian_list,row.ALT)
if LCL6_mendelian_num != '.':
LCL6_output = row.CHROM + '\t' + str(row.POS) + '\t' + '.' + '\t' + row.REF + '\t' + LCL6_consensus_alt_seq + '\t' + '.' + '\t' + '.' + '\t' +'VOTED=' + str(LCL6_mendelian_num) + '\t' + 'GT:ALT:DP' + '\t' + LCL6_consensus_call + ':' + str(LCL6_alt) + ':' + str(LCL6_dp) + '\n'
benchmark_LCL6.write(LCL6_output)
# LCL7
LCL7_pcr_consensus, LCL7_pcr_free_consensus, LCL7_mendelian_num, LCL7_consensus_call, LCL7_consensus_alt_seq, LCL7_alt, LCL7_dp, LCL7_detected_num = consensus_call(lcl7_list,mendelian_list,row.ALT)
if LCL7_mendelian_num != '.':
LCL7_output = row.CHROM + '\t' + str(row.POS) + '\t' + '.' + '\t' + row.REF + '\t' + LCL7_consensus_alt_seq + '\t' + '.' + '\t' + '.' + '\t' +'VOTED=' + str(LCL7_mendelian_num) + '\t' + 'GT:ALT:DP' + '\t' + LCL7_consensus_call + ':' + str(LCL7_alt) + ':' + str(LCL7_dp) + '\n'
benchmark_LCL7.write(LCL7_output)
# LCL8
LCL8_pcr_consensus, LCL8_pcr_free_consensus, LCL8_mendelian_num, LCL8_consensus_call, LCL8_consensus_alt_seq, LCL8_alt, LCL8_dp, LCL8_detected_num = consensus_call(lcl8_list,mendelian_list,row.ALT)
if LCL8_mendelian_num != '.':
LCL8_output = row.CHROM + '\t' + str(row.POS) + '\t' + '.' + '\t' + row.REF + '\t' + LCL8_consensus_alt_seq + '\t' + '.' + '\t' + '.' + '\t' +'VOTED=' + str(LCL8_mendelian_num) + '\t' + 'GT:ALT:DP' + '\t' + LCL8_consensus_call + ':' + str(LCL8_alt) + ':' + str(LCL8_dp) + '\n'
benchmark_LCL8.write(LCL8_output)
# all data
all_output = row.CHROM + '\t' + str(row.POS) + '\t' + LCL5_pcr_consensus + '\t' + LCL5_pcr_free_consensus + '\t' + str(LCL5_mendelian_num) + '\t' + LCL5_consensus_call + '\t' + LCL5_consensus_alt_seq + '\t' + str(LCL5_alt) + '\t' + str(LCL5_dp) + '\t' + str(LCL5_detected_num) + '\t' +\
LCL6_pcr_consensus + '\t' + LCL6_pcr_free_consensus + '\t' + str(LCL6_mendelian_num) + '\t' + LCL6_consensus_call + '\t' + LCL6_consensus_alt_seq + '\t' + str(LCL6_alt) + '\t' + str(LCL6_dp) + '\t' + str(LCL6_detected_num) + '\t' +\
LCL7_pcr_consensus + '\t' + LCL7_pcr_free_consensus + '\t' + str(LCL7_mendelian_num) + '\t' + LCL7_consensus_call + '\t' + LCL7_consensus_alt_seq + '\t' + str(LCL7_alt) + '\t' + str(LCL7_dp) + '\t' + str(LCL7_detected_num) + '\t' +\
LCL8_pcr_consensus + '\t' + LCL8_pcr_free_consensus + '\t' + str(LCL8_mendelian_num) + '\t' + LCL8_consensus_call + '\t' + LCL8_consensus_alt_seq + '\t' + str(LCL8_alt) + '\t' + str(LCL8_dp) + '\t' + str(LCL8_detected_num) + '\n'
all_sample_outfile.write(all_output)


+ 1
- 0
family.tsv Ver fichero

#LCL5vcf LCL6vcf LCL7vcf LCL8vcf LCL5name LCL6name LCL7name LCL8name familyname

+ 8
- 0
inputs Ver fichero

{
"{{ project_name }}.fasta": "GRCh38.d1.vd1.fa",
"{{ project_name }}.disk_size": "150",
"{{ project_name }}.inputSamplesFile": "{{ inputSamplesFile }}",
"{{ project_name }}.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/vbt:v1.1",
"{{ project_name }}.cluster_config": "OnDemand bcs.b4.xlarge img-ubuntu-vpc",
"{{ project_name }}.ref_dir": "oss://chinese-quartet/quartet-storage-data/reference_data/"
}

BIN
pictures/.DS_Store Ver fichero


BIN
pictures/workflow.png Ver fichero

Antes Después
Anchura: 2178  |  Altura: 1290  |  Tamaño: 180KB

BIN
tasks/.DS_Store Ver fichero


+ 32
- 0
tasks/VCFinfo.wdl Ver fichero

task VCFinfo {
File repeat_annotated_vcf
String sample
String docker
String cluster_config
String disk_size
command <<<

python /opt/variants_quality_location_intergration.py -vcf ${repeat_annotated_vcf} -prefix ${sample}

cat ${sample}_variant_quality_location.txt | grep '#CHROM' > header

for i in chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chr20 chr21 chr22 chrX
do
cat ${sample}_variant_quality_location.txt | grep -w $i | cat header - > ${sample}.$i.vcfInfo.txt
done


>>>

runtime {
docker:docker
cluster: cluster_config
systemDisk: "cloud_ssd 40"
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
}
output {
File extracted_info = "${sample}_variant_quality_location.txt"
Array[File] chromo_vcfInfo = glob("*.vcfInfo.txt")
}
}

+ 31
- 0
tasks/VCFrename.wdl Ver fichero

task VCFrename {
File trio_vcf_gz
File trio_vcf_idx
String mother_name
String father_name
String child_name
String family_name
String child
String docker
String cluster_config
String disk_size
command <<<
echo "MOTHER ${mother_name}.${child}
FATHER ${father_name}.${child}
CHILD ${child_name}" > rename.txt

rtg vcfannotate -i ${trio_vcf_gz} -o ${family_name}.${child}.rename.vcf.gz --relabel=rename.txt
>>>

runtime {
docker:docker
cluster: cluster_config
systemDisk: "cloud_ssd 40"
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
}
output {
File rename_trio_vcf_gz = "${family_name}.${child}.rename.vcf.gz"
File rename_trio_vcf_idx = "${family_name}.${child}.rename.vcf.gz.tbi"
}
}

+ 27
- 0
tasks/bed_annotation.wdl Ver fichero

task bed_annotation {
File merged_vcf_gz
File merged_vcf_idx
File repeat_bed
String sample
String docker
String cluster_config
String disk_size
command <<<

rtg vcfannotate --bed-info=${repeat_bed} -i ${merged_vcf_gz} -o ${sample}.mendelian.merged.repeatAnno.vcf.gz

gunzip ${sample}.mendelian.merged.repeatAnno.vcf.gz

>>>

runtime {
docker:docker
cluster: cluster_config
systemDisk: "cloud_ssd 40"
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
}
output {
File repeat_annotated_vcf = "${sample}.mendelian.merged.repeatAnno.vcf"
}
}

+ 26
- 0
tasks/final_result.wdl Ver fichero

task FinalResult {
File extracted_info
File annotated_txt
String prefix = basename(annotated_txt,".mendelian.txt")
String sample
String docker
String cluster_config
String disk_size
command <<<

python /opt/FinalResult2VCF.py -vcfInfo ${extracted_info} -mendelianInfo ${annotated_txt} -prefix ${prefix} -sample ${sample}

>>>

runtime {
docker:docker
cluster: cluster_config
systemDisk: "cloud_ssd 40"
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
}
output {
File benchmarking_calls = "${prefix}_benchmarking_calls.vcf"
File all_info = "${prefix}_all_sample_information.vcf"
}
}

+ 34
- 0
tasks/mendelian.wdl Ver fichero

task mendelian {
File child_vcf
File LCL7_vcf
File LCL8_vcf
String LCL7_name
String LCL8_name
String child_name
File ref_dir
String fasta
String docker
String cluster_config
String disk_size
command <<<
export LD_LIBRARY_PATH=/opt/htslib-1.9
nt=$(nproc)
mkdir VBT

/opt/VBT-TrioAnalysis/vbt mendelian -ref ${ref_dir}/${fasta} -mother ${LCL8_vcf} -father ${LCL7_vcf} -child ${child_vcf} -outDir VBT -out-prefix ${child_name}.family --output-violation-regions -thread-count $nt

cat VBT/${child_name}.family_trio.vcf > ${child_name}.family.vcf
>>>

runtime {
docker:docker
cluster: cluster_config
systemDisk: "cloud_ssd 40"
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
}
output {
Array[File] vbt_mendelian = glob("VBT/*")
File trio_vcf = "${child_name}.family.vcf"
}
}

+ 28
- 0
tasks/merge.wdl Ver fichero

task merge {
Array[File] family_vcf_gz
Array[File] family_vcf_idx
String sample
String docker
String cluster_config
String disk_size
command <<<

rtg vcfmerge --force-merge-all -o ${sample}.merged.vcf.gz ${sep=" " family_vcf_gz}

zcat ${sample}.merged.vcf.gz | grep -v '#' | cut -f1-2 | sed s'/\t/_/g' | sort | uniq -c | sed 's/\s\+/\t/g' | awk '{ if ($1 != 1) { print } }' | cut -f3 > ${sample}.vcf_dup.txt

>>>

runtime {
docker:docker
cluster: cluster_config
systemDisk: "cloud_ssd 40"
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
}
output {
File merged_vcf_gz = "${sample}.merged.vcf.gz"
File merged_vcf_idx = "${sample}.merged.vcf.gz.tbi"
File vcf_dup = "${sample}.vcf_dup.txt"
}
}

+ 34
- 0
tasks/mergeSister.wdl Ver fichero

task mergeSister {
File LCL5_trio_vcf_gz
File LCL5_trio_vcf_idx
File LCL6_trio_vcf_gz
File LCL6_trio_vcf_idx
String family_name
String docker
String cluster_config
String disk_size
command <<<
rtg vcfmerge -o LCL5.LCL6.merged.vcf.gz ${LCL5_trio_vcf_gz} ${LCL6_trio_vcf_gz}

rtg vcfmerge -o LCL6.LCL5.merged.vcf.gz ${LCL6_trio_vcf_gz} ${LCL5_trio_vcf_gz}

zcat LCL5.LCL6.merged.vcf.gz | grep '##' > header
zcat LCL5.LCL6.merged.vcf.gz | grep -v '##' | cut -f8 > LCL5.mendelian
zcat LCL6.LCL5.merged.vcf.gz | grep -v '##' | paste - LCL5.mendelian > body

cat header body > ${family_name}.trio.info.vcf
>>>

runtime {
docker:docker
cluster: cluster_config
systemDisk: "cloud_ssd 40"
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
}

output {
File family_mendelian_info = "${family_name}.trio.info.vcf"
}

}

+ 25
- 0
tasks/mergeVCFInfo.wdl Ver fichero

task mergeVCFInfo {
Array[File] vcf_gz
Array[File] vcf_idx
String sample
String docker
String cluster_config
String disk_size
command <<<

rtg vcfmerge --force-merge-all -o ${sample}.merged.info.vcf.gz ${sep=" " vcf_gz}
>>>

runtime {
docker:docker
cluster: cluster_config
systemDisk: "cloud_ssd 40"
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
}
output {
File merged_vcf = "${sample}.merged.info.vcf.gz"
File merged_vcf_idx = "${sample}.merged.info.vcf.gz.tbi"
}
}

+ 24
- 0
tasks/merge_info.wdl Ver fichero

task merge_info {
File vcfInfo
File mendelianInfo
String sample
String docker
String cluster_config
String disk_size
command <<<

python /opt/merge_mendelian_vcfinfo.py -vcfInfo ${vcfInfo} -mendelianInfo ${mendelianInfo} -sample ${sample}

>>>

runtime {
docker:docker
cluster: cluster_config
systemDisk: "cloud_ssd 40"
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
}
output {
File all_info = "${sample}_mendelian_vcfInfo.vcf"
}
}

+ 39
- 0
tasks/oneClass.wdl Ver fichero

task oneClass {
File snv_train_vcf
File snv_test_vcf
File indel_train_vcf
File indel_test_vcf
String sampleName = basename(snv_train_vcf,".normed.snv.train.txt")
String docker
String cluster_config
String disk_size
command <<<

python /opt/oneClass.py -train ${snv_train_vcf} -test ${snv_test_vcf} -name ${sampleName}_snv

python /opt/oneClass.py -train ${indel_train_vcf} -test ${indel_test_vcf} -name ${sampleName}_indel

>>>

runtime {
docker:docker
cluster: cluster_config
systemDisk: "cloud_ssd 40"
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
}

output {
File snv_true_txt = "${sampleName}_snv_predicted_true.txt"
File snv_false_txt = "${sampleName}_snv_predicted_false.txt"
File snv_true_bed = "${sampleName}_snv_predicted_true.bed"
File snv_false_bed = "${sampleName}_snv_predicted_false.bed"
File snv_padding = "${sampleName}_snv_padding.bed"
File indel_true_txt = "${sampleName}_indel_predicted_true.txt"
File indel_false_txt = "${sampleName}_indel_predicted_false.txt"
File indel_true_bed = "${sampleName}_indel_predicted_true.bed"
File indel_false_bed = "${sampleName}_indel_predicted_false.bed"
File indel_padding = "${sampleName}_indel_padding.bed"
}
}


+ 38
- 0
tasks/reformVCF.wdl Ver fichero

task reformVCF {
File family_mendelian_info
File family_name
String docker
String cluster_config
String disk_size
command <<<

python /opt/reformVCF.py -vcf ${family_mendelian_info} -name ${family_name}

cat ${family_name}.LCL5.vcf | grep -v '##' | grep -v '0/0' | grep -v '\./\.' > ${family_name}.LCL5.txt
cat ${family_name}.LCL6.vcf | grep -v '##' | grep -v '0/0' | grep -v '\./\.' > ${family_name}.LCL6.txt
cat ${family_name}.LCL7.vcf | grep -v '##' | grep -v '0/0' | grep -v '\./\.' > ${family_name}.LCL7.txt
cat ${family_name}.LCL8.vcf | grep -v '##' | grep -v '0/0' | grep -v '\./\.' > ${family_name}.LCL8.txt

>>>

runtime {
docker:docker
cluster: cluster_config
systemDisk: "cloud_ssd 40"
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
}

output {
File LCL5_family_info = "${family_name}.LCL5.vcf"
File LCL6_family_info = "${family_name}.LCL6.vcf"
File LCL7_family_info = "${family_name}.LCL7.vcf"
File LCL8_family_info = "${family_name}.LCL8.vcf"
File family_info = "${family_name}.vcf"
File LCL5_family_info_txt = "${family_name}.LCL5.txt"
File LCL6_family_info_txt = "${family_name}.LCL6.txt"
File LCL7_family_info_txt = "${family_name}.LCL7.txt"
File LCL8_family_info_txt = "${family_name}.LCL8.txt"
}
}


+ 51
- 0
tasks/sister.wdl Ver fichero

task sister {
File LCL5_vcf
File LCL6_vcf
File LCL7_vcf
File LCL8_vcf
File ref_dir
String LCL5_name
String LCL6_name
String LCL7_name
String LCL8_name
String fasta
String family_name
String docker
String cluster_config
String disk_size
command <<<
export LD_LIBRARY_PATH=/opt/htslib-1.9
nt=$(nproc)
cat ${LCL6_vcf} | grep -v '##' | cut -f10 > D6
cat ${LCL7_vcf} | grep -v '##' | cut -f10 > F7
cat ${LCL8_vcf} | grep -v '##' | cut -f10 > M8
cat ${LCL5_vcf} | grep -v '##' | paste - D6 F7 M8 > body
cat ${LCL5_vcf} | grep '##' | cat - body > ${family_name}.vcf
# prepare ped file, D5
echo "${family_name} ${LCL8_name} 0 0 2 -9
${family_name} ${LCL7_name} 0 0 1 -9
${family_name} ${LCL5_name} ${LCL7_name} ${LCL8_name} 2 -9" > ${family_name}.D5.ped

mkdir D5
/opt/VBT-TrioAnalysis/vbt mendelian -ref ${ref_dir}/${fasta} -mother ${family_name}.vcf -father ${family_name}.vcf -child ${family_name}.vcf -pedigree ${family_name}.D5.ped -outDir D5 -out-prefix ${family_name}.D5 --output-violation-regions -thread-count $nt
# prepare ped file, D6
echo "${family_name} ${LCL8_name} 0 0 2 -9
${family_name} ${LCL7_name} 0 0 1 -9
${family_name} ${LCL6_name} ${LCL7_name} ${LCL8_name} 2 -9" > ${family_name}.D6.ped

mkdir D6
/opt/VBT-TrioAnalysis/vbt mendelian -ref ${ref_dir}/${fasta} -mother ${family_name}.vcf -father ${family_name}.vcf -child ${family_name}.vcf -pedigree ${family_name}.D6.ped -outDir D6 -out-prefix ${family_name}.D6 --output-violation-regions -thread-count $nt
>>>

runtime {
docker:docker
cluster: cluster_config
systemDisk: "cloud_ssd 40"
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
}
output {
Array[File] D5_mendelian = glob("${family_name}.D5/*")
Array[File] D6_mendelian = glob("${family_name}.D6/*")
}
}

+ 26
- 0
tasks/two_family_merge.wdl Ver fichero

task two_family_merge {
File LCL5_trio_vcf
File LCL6_trio_vcf
String family_name
String docker
String cluster_config
String disk_size
command <<<
cat ${LCL5_trio_vcf} | grep -v '##' > ${family_name}.LCL5.txt
cat ${LCL6_trio_vcf} | grep -v '##' > ${family_name}.LCL6.txt
python /opt/merge_two_family.py -LCL5 ${family_name}.LCL5.txt -LCL6 ${family_name}.LCL6.txt -family ${family_name}
>>>

runtime {
docker:docker
cluster: cluster_config
systemDisk: "cloud_ssd 40"
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
}

output {
File family_mendelian_info = "${family_name}.txt"
}

}

+ 33
- 0
tasks/variantsNorm.wdl Ver fichero

task variantsNorm {
File vcf
File ref_dir
String fasta
String sampleName
String docker
String cluster_config
String disk_size
command <<<

cat ${vcf} | grep '#' > header
cat ${vcf} | grep -v '#' > body
cat body | grep -w '^chr1\|^chr2\|^chr3\|^chr4\|^chr5\|^chr6\|^chr7\|^chr8\|^chr9\|^chr10\|^chr11\|^chr12\|^chr13\|^chr14\|^chr15\|^chr16\|^chr17\|^chr18\|^chr19\|^chr20\|^chr21\|^chr22\|^chrX' > body.filtered
cat header body.filtered > ${sampleName}.filtered.vcf

/opt/hall-lab/bcftools-1.9/bin/bcftools norm -f ${ref_dir}/${fasta} ${sampleName}.filtered.vcf > ${sampleName}.normed.vcf

cat ${sampleName}.normed.vcf | grep -v '##' > ${sampleName}.normed.txt

>>>

runtime {
docker:docker
cluster: cluster_config
systemDisk: "cloud_ssd 40"
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
}
output {
File normed_vcf = "${sampleName}.normed.vcf"
File normed_txt = "${sampleName}.normed.txt"
}
}

+ 41
- 0
tasks/votes.wdl Ver fichero

task votes {
Array[File] family_mendelian_info
File vcf
String chromo
String docker
String cluster_config
String disk_size
command <<<
mkdir temp
for i in ${sep=" " family_mendelian_info}
do
cp $i temp
done

cat ${vcf} | grep -v '##' > vcf_info.txt

python /opt/voted_by_vcfinfo_mendelianinfo.py -folder ./temp -vcf vcf_info.txt

cp LCL5_voted.vcf LCL5.${chromo}.voted.vcf
cp LCL6_voted.vcf LCL6.${chromo}.voted.vcf
cp LCL7_voted.vcf LCL7.${chromo}.voted.vcf
cp LCL8_voted.vcf LCL8.${chromo}.voted.vcf

>>>

runtime {
docker:docker
cluster: cluster_config
systemDisk: "cloud_ssd 40"
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
}
output {
File LCL5_voted_vcf = "LCL5.${chromo}.voted.vcf"
File LCL6_voted_vcf = "LCL6.${chromo}.voted.vcf"
File LCL7_voted_vcf = "LCL7.${chromo}.voted.vcf"
File LCL8_voted_vcf = "LCL8.${chromo}.voted.vcf"
File all_sample_info = "all_sample_information.txt"
}
}


+ 24
- 0
tasks/zipIndex.wdl Ver fichero

task zipIndex {
File vcf
String vcf_name = basename(vcf,".vcf")
String docker
String cluster_config
String disk_size
command <<<
rtg bgzip ${vcf} -c > ${vcf_name}.vcf.gz
rtg index -f vcf ${vcf_name}.vcf.gz

>>>

runtime {
docker:docker
cluster: cluster_config
systemDisk: "cloud_ssd 40"
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
}
output {
File vcf_gz = "${vcf_name}.vcf.gz"
File vcf_idx = "${vcf_name}.vcf.gz.tbi"
}
}

+ 31
- 0
workflow.wdl Ver fichero

import "./tasks/sister.wdl" as sister

workflow {{ project_name }} {
File inputSamplesFile
Array[Array[File]] inputSamples = read_tsv(inputSamplesFile)
File ref_dir
String docker
String fasta
String cluster_config
String disk_size

scatter (quartet in inputSamples){
call sister.sister as sister {
input:
LCL5_vcf=quartet[0],
LCL6_vcf=quartet[1],
LCL7_vcf=quartet[2],
LCL8_vcf=quartet[3],
ref_dir=ref_dir,
LCL5_name=quartet[4],
LCL6_name=quartet[5],
LCL7_name=quartet[6],
LCL8_name=quartet[7],
fasta=fasta,
family_name=quartet[8],
docker=docker,
cluster_config=cluster_config,
disk_size=disk_size
}
}
}

Cargando…
Cancelar
Guardar