LUYAO REN před 5 roky
rodič
revize
bac3d92813
7 změnil soubory, kde provedl 210 přidání a 13 odebrání
  1. +1
    -2
      codescripts/high_confidence_call_vote.py
  2. +78
    -0
      codescripts/variants_quality_location_intergration.py
  3. +52
    -0
      codescripts/vcf_mq_af.py
  4. +4
    -0
      inputs
  5. +38
    -1
      tasks/bed_annotation.wdl
  6. +4
    -6
      tasks/extract_info.wdl
  7. +33
    -4
      workflow.wdl

+ 1
- 2
codescripts/high_confidence_call_vote.py Zobrazit soubor

@@ -1,4 +1,4 @@
# import modules
from __future__ import division
import sys, argparse, os
import fileinput
import re
@@ -6,7 +6,6 @@ import pandas as pd
from operator import itemgetter
from collections import Counter
from itertools import islice
from __future__ import division

# input arguments
parser = argparse.ArgumentParser(description="this script is to count voting number")

+ 78
- 0
codescripts/variants_quality_location_intergration.py Zobrazit soubor

@@ -0,0 +1,78 @@
import sys, argparse, os
import fileinput
import re
import statistics

# input arguments
parser = argparse.ArgumentParser(description="this script is to intergeate vcf information, variants quality and location")

parser.add_argument('-vcf', '--multi_sample_vcf', type=str, help='The VCF file you want to count the voting number', required=True)
parser.add_argument('-prefix', '--prefix', type=str, help='Prefix of output file name', required=True)

args = parser.parse_args()
multi_sample_vcf = args.multi_sample_vcf
prefix = args.prefix


def get_location(info):
repeat = ''
if 'ANN' in info:
strings = info.strip().split(';')
for element in strings:
m = re.match('ANN',element)
if m is not None:
repeat = element.split('=')[1]
else:
repeat = '.'
return repeat


def extract_info_normal(strings):
AF = []
GQ = []
MQ = []
DP = []
ALT = []
for element in strings:
if element == '.':
pass
else:
ad = element.split(':')[1]
ref = ad.split(',')[0]
alt = ad.split(',')[1]
af = float(int(alt)/(int(ref) + int(alt)))
gq = int(element.split(':')[3])
mq = float(element.split(':')[5])
dp = int(element.split(':')[2])
AF.append(af)
GQ.append(gq)
MQ.append(mq)
DP.append(dp)
ALT.append(int(alt))
AF_m = statistics.mean(AF)
GQ_m = statistics.mean(GQ)
MQ_m = statistics.mean(MQ)
DP_a = sum(DP)
ALT_a = sum(ALT)
return AF_m,GQ_m,MQ_m,DP_a,ALT_a


file_name = prefix + '_variant_quality_location.txt'
outfile = open(file_name,'w')
outputcolumn = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tQuartet_DNA_BGI_SEQ2000_BGI_1_20180518_LCL5\tQuartet_DNA_BGI_SEQ2000_BGI_2_20180530_LCL5\tQuartet_DNA_BGI_SEQ2000_BGI_3_20180530_LCL5\tQuartet_DNA_BGI_T7_WGE_1_20191105_LCL5\tQuartet_DNA_BGI_T7_WGE_2_20191105_LCL5\tQuartet_DNA_BGI_T7_WGE_3_20191105_LCL5\tQuartet_DNA_ILM_Nova_ARD_1_20181108_LCL5\tQuartet_DNA_ILM_Nova_ARD_2_20181108_LCL5\tQuartet_DNA_ILM_Nova_ARD_3_20181108_LCL5\tQuartet_DNA_ILM_Nova_ARD_4_20190111_LCL5\tQuartet_DNA_ILM_Nova_ARD_5_20190111_LCL5\tQuartet_DNA_ILM_Nova_ARD_6_20190111_LCL5\tQuartet_DNA_ILM_Nova_BRG_1_20180930_LCL5\tQuartet_DNA_ILM_Nova_BRG_2_20180930_LCL5\tQuartet_DNA_ILM_Nova_BRG_3_20180930_LCL5\tQuartet_DNA_ILM_Nova_WUX_1_20190917_LCL5\tQuartet_DNA_ILM_Nova_WUX_2_20190917_LCL5\tQuartet_DNA_ILM_Nova_WUX_3_20190917_LCL5\tQuartet_DNA_ILM_XTen_ARD_1_20170403_LCL5\tQuartet_DNA_ILM_XTen_ARD_2_20170403_LCL5\tQuartet_DNA_ILM_XTen_ARD_3_20170403_LCL5\tQuartet_DNA_ILM_XTen_NVG_1_20170329_LCL5\tQuartet_DNA_ILM_XTen_NVG_2_20170329_LCL5\tQuartet_DNA_ILM_XTen_NVG_3_20170329_LCL5\tQuartet_DNA_ILM_XTen_WUX_1_20170216_LCL5\tQuartet_DNA_ILM_XTen_WUX_2_20170216_LCL5\tQuartet_DNA_ILM_XTen_WUX_3_20170216_LCL5' +'\t'+ 'location' + '\t' + 'AF' + '\t' + 'GQ' + '\t' + 'MQ' + '\t' + 'DP' + '\t' + 'ALT' +'\n'
outfile.write(outputcolumn)
for line in fileinput.input(multi_sample_vcf):
m = re.match('^\#',line)
if m is not None:
pass
else:
line = line.strip()
strings = line.split('\t')
repeat = get_location(strings[7])
AF,GQ,MQ,DP,ALT = extract_info_normal(strings[9:])
outLine = '\t'.join(strings) + '\t' + repeat +'\t' + str(AF) + '\t' + str(GQ) + '\t' + str(MQ) + '\t' + str(DP) + '\t' + str(ALT) + '\n'
outfile.write(outLine)



+ 52
- 0
codescripts/vcf_mq_af.py Zobrazit soubor

@@ -0,0 +1,52 @@
from __future__ import division
import sys, argparse, os
import fileinput
import re
import statistics

# input arguments
parser = argparse.ArgumentParser(description="this script is to get mapping quality, allele frequency and alternative depth")

parser.add_argument('-vcf', '--normed_vcf', type=str, help='The VCF file you want to used', required=True)
parser.add_argument('-prefix', '--prefix', type=str, help='Prefix of output file name', required=True)

args = parser.parse_args()
normed_vcf = args.normed_vcf
prefix = args.prefix


file_name = prefix + '.variant_quality_location.vcf'
outfile = open(file_name,'w')

for line in fileinput.input(normed_vcf):
m = re.match('^\#',line)
if m is not None:
outfile.write(line)
else:
line = line.strip()
strings = line.split('\t')
strings[8] = strings[8] + ':MQ:ALT:AF'
infos = strings[7].strip().split(';')
## MQ
for element in infos:
m = re.match('MQ=',element)
if m is not None:
MQ = element.split('=')[1]
## ALT
ad = strings[9].split(':')[1]
ad_single = ad.split(',')
ad_single = [int(i) for i in ad_single]
DP = sum(ad_single)
if DP != 0:
ad_single.pop(0)
ALT = sum(ad_single)
AF = ALT/DP
else:
ALT = 0
AF = 'NA'
outLine = '\t'.join(strings) + ':' + MQ + ':' + str(ALT) + ':' + str(AF) + '\n'
outfile.write(outLine)



+ 4
- 0
inputs Zobrazit soubor

@@ -1,5 +1,6 @@
{
"{{ project_name }}.LCL6normZip.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
"{{ project_name }}.LCL5extract_info.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/high_confidence_call_manuscript:v1.1",
"{{ project_name }}.LCL7merge.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
"{{ project_name }}.fasta": "GRCh38.d1.vd1.fa",
"{{ project_name }}.LCL6familyzipIndex.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
@@ -21,9 +22,11 @@
"{{ project_name }}.LCL8mergeVCF.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
"{{ project_name }}.LCL5votes.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/high_confidence_call_manuscript:v1.1",
"{{ project_name }}.LCL7familyzipIndex.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
"{{ project_name }}.LCL8extract_info.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/high_confidence_call_manuscript:v1.1",
"{{ project_name }}.LCL5familyzipIndex.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
"{{ project_name }}.LCL5normZip.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
"{{ project_name }}.LCL6VCFrename.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
"{{ project_name }}.LCL7extract_info.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/high_confidence_call_manuscript:v1.1",
"{{ project_name }}.LCL5merge.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
"{{ project_name }}.LCL8votes.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/high_confidence_call_manuscript:v1.1",
"{{ project_name }}.reformVCF.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/high_confidence_call:v1.1",
@@ -32,6 +35,7 @@
"{{ project_name }}.LCL8familyzipIndex.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
"{{ project_name }}.LCL5bedAnnotation.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
"{{ project_name }}.LCL6mergeVCF.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
"{{ project_name }}.LCL6extract_info.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/high_confidence_call_manuscript:v1.1",
"{{ project_name }}.LCL7variantsNorm.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/bcftools:v1.9",
"{{ project_name }}.LCL7mergeVCF.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
"{{ project_name }}.LCL8merge.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",

+ 38
- 1
tasks/bed_annotation.wdl Zobrazit soubor

@@ -9,7 +9,43 @@ task bed_annotation {
command <<<

rtg vcfannotate --bed-info=${repeat_bed} -i ${merged_vcf} -o ${sample}.normed.repeatAnno.vcf.gz
rtg vcfannotate --bed-info=${repeat_bed} -i ${merged_vcf} -o ${sample}.normed.repeatAnno.vcf.gz
## DP
zcat ${sample}.normed.repeatAnno.vcf.gz | grep -v '##' | awk '
BEGIN { OFS = "\t" }
NF > 2 && FNR > 1 {
for ( i=9; i<=NF; i++ ) {
split($i,a,":") ;$i = a[3];
}
}
{ print }
' > ${sample}.depth.txt
## GQ
zcat ${sample}.normed.repeatAnno.vcf.gz | grep -v '##' | awk '
BEGIN { OFS = "\t" }
NF > 2 && FNR > 1 {
for ( i=9; i<=NF; i++ ) {
split($i,a,":") ;$i = a[4];
}
}
{ print }
' > ${sample}.genotypeQuality.txt

## MQ
zcat ${sample}.normed.repeatAnno.vcf.gz | grep -v '##' | awk '
BEGIN { OFS = "\t" }
NF > 2 && FNR > 1 {
for ( i=9; i<=NF; i++ ) {
split($i,a,":") ;$i = a[6];
}
}
{ print }
' > ${sample}.mappinyQuality.txt

## Allele frequency


>>>

@@ -21,5 +57,6 @@ task bed_annotation {
}
output {
File repeat_annotated_vcf = "${sample}.normed.repeatAnno.vcf.gz"
File repeat_annotated_vcf_idx = "${sample}.normed.repeatAnno.vcf.gz.tbi"
}
}

+ 4
- 6
tasks/extract_info.wdl Zobrazit soubor

@@ -1,14 +1,13 @@
task extract_info {
File vcf
String vcf_name = basename(vcf,".vcf")
File normed_vcf
String sampleName
String docker
String cluster_config
String disk_size
command <<<

python /opt/extract_vcf_information.py -i ${vcf} -o ${vcf_name}.txt
cat ${vcf_name}.txt | cut -f23,25,27,22,12,21,3,18,4,8,11,15 > ${vcf_name}.essential.txt
python /opt/vcf_mq_af.py -vcf ${normed_vcf} -prefix ${sampleName}

>>>

@@ -19,7 +18,6 @@ task extract_info {
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
}
output {
File vcf_info = "${vcf_name}.txt"
File vcf_needed_info = "${vcf_name}.essential.txt"
File vcf_info = "${sampleName}.variant_quality_location.vcf"
}
}

+ 33
- 4
workflow.wdl Zobrazit soubor

@@ -1,4 +1,5 @@
import "./tasks/variantsNorm.wdl" as variantsNorm
import "./tasks/extract_info.wdl" as extract_info
import "./tasks/mendelian.wdl" as mendelian
import "./tasks/zipIndex.wdl" as zipIndex
import "./tasks/VCFrename.wdl" as VCFrename
@@ -55,27 +56,55 @@ workflow {{ project_name }} {
cluster_config=cluster_config,
disk_size=disk_size
}
call extract_info.extract_info as LCL5extract_info {
input:
normed_vcf=LCL5variantsNorm.normed_vcf,
sampleName=quartet[4],
cluster_config=cluster_config,
disk_size=disk_size
}
call extract_info.extract_info as LCL6extract_info {
input:
normed_vcf=LCL6variantsNorm.normed_vcf,
sampleName=quartet[5],
cluster_config=cluster_config,
disk_size=disk_size
}
call extract_info.extract_info as LCL7extract_info {
input:
normed_vcf=LCL7variantsNorm.normed_vcf,
sampleName=quartet[6],
cluster_config=cluster_config,
disk_size=disk_size
}
call extract_info.extract_info as LCL8extract_info {
input:
normed_vcf=LCL8variantsNorm.normed_vcf,
sampleName=quartet[8],
cluster_config=cluster_config,
disk_size=disk_size
}
call zipIndex.zipIndex as LCL5normZip{
input:
vcf=LCL5variantsNorm.normed_mq_vcf,
vcf=LCL5extract_info.vcf_info,
cluster_config=cluster_config,
disk_size=disk_size
}
call zipIndex.zipIndex as LCL6normZip{
input:
vcf=LCL6variantsNorm.normed_mq_vcf,
vcf=LCL6extract_info.vcf_info,
cluster_config=cluster_config,
disk_size=disk_size
}
call zipIndex.zipIndex as LCL7normZip{
input:
vcf=LCL7variantsNorm.normed_mq_vcf,
vcf=LCL7extract_info.vcf_info,
cluster_config=cluster_config,
disk_size=disk_size
}
call zipIndex.zipIndex as LCL8normZip{
input:
vcf=LCL8variantsNorm.normed_mq_vcf,
vcf=LCL8extract_info.vcf_info,
cluster_config=cluster_config,
disk_size=disk_size
}

Načítá se…
Zrušit
Uložit