浏览代码

vcf info

master
LUYAO REN 5 年前
父节点
当前提交
bac3d92813
共有 7 个文件被更改,包括 210 次插入13 次删除
  1. +1
    -2
      codescripts/high_confidence_call_vote.py
  2. +78
    -0
      codescripts/variants_quality_location_intergration.py
  3. +52
    -0
      codescripts/vcf_mq_af.py
  4. +4
    -0
      inputs
  5. +38
    -1
      tasks/bed_annotation.wdl
  6. +4
    -6
      tasks/extract_info.wdl
  7. +33
    -4
      workflow.wdl

+ 1
- 2
codescripts/high_confidence_call_vote.py 查看文件

# import modules
from __future__ import division
import sys, argparse, os import sys, argparse, os
import fileinput import fileinput
import re import re
from operator import itemgetter from operator import itemgetter
from collections import Counter from collections import Counter
from itertools import islice from itertools import islice
from __future__ import division


# input arguments # input arguments
parser = argparse.ArgumentParser(description="this script is to count voting number") parser = argparse.ArgumentParser(description="this script is to count voting number")

+ 78
- 0
codescripts/variants_quality_location_intergration.py 查看文件

import sys, argparse, os
import fileinput
import re
import statistics

# input arguments
# Two required options: the VCF to read and the prefix used to name the
# output table. The help texts describe what this script does with them
# (the previous -vcf help was inherited from the vote-counting script).
parser = argparse.ArgumentParser(
    description="this script is to integrate vcf information, variants quality and location")

parser.add_argument('-vcf', '--multi_sample_vcf', type=str,
                    help='The multi-sample VCF file to summarise (location, AF, GQ, MQ, DP, ALT)',
                    required=True)
parser.add_argument('-prefix', '--prefix', type=str,
                    help='Prefix of output file name', required=True)

args = parser.parse_args()
multi_sample_vcf = args.multi_sample_vcf
prefix = args.prefix


def get_location(info):
    """Return the value of the ``ANN`` entry of a VCF INFO column.

    Parameters
    ----------
    info : str
        The INFO column of one VCF record: ``;``-separated entries, each
        either ``key=value`` or a bare flag.

    Returns
    -------
    str
        Everything after ``ANN=`` for the entry whose key is exactly
        ``ANN``. When several such entries exist the last one wins.
        ``'.'`` is returned when there is no ``ANN=...`` entry at all.
    """
    # Default to the "missing" marker so every path without a usable ANN
    # entry yields the same value (a substring hit such as "BANNER=1" or
    # a bare "ANN" flag used to give '' or raise IndexError).
    repeat = '.'
    for element in info.strip().split(';'):
        # partition keeps any '=' inside the value intact.
        key, sep, value = element.partition('=')
        if key == 'ANN' and sep:
            repeat = value
    return repeat


def extract_info_normal(strings):
    """Summarise the per-sample genotype columns of one VCF record.

    Parameters
    ----------
    strings : list of str
        The sample columns of a record. A column equal to ``'.'`` is
        skipped. Every other column is split on ``:`` and read by
        position: field 1 is ``ref,alt`` read counts, field 2 is the
        depth, field 3 the genotype quality and field 5 the mapping
        quality (presumably a ``GT:AD:DP:GQ:PL:MQ...`` layout -- confirm
        against the FORMAT column of the input).

    Returns
    -------
    tuple
        ``(AF_m, GQ_m, MQ_m, DP_a, ALT_a)``: mean allele frequency, mean
        genotype quality, mean mapping quality, summed depth and summed
        alt read count over the samples that were not skipped. A mean
        that has no contributing sample is reported as ``'NA'``.

    Raises
    ------
    IndexError, ValueError
        If a non-skipped column has fewer fields than expected or a
        field is not numeric.
    """
    AF = []
    GQ = []
    MQ = []
    DP = []
    ALT = []
    for element in strings:
        if element == '.':
            continue
        fields = element.split(':')
        allele_depths = fields[1].split(',')
        ref = int(allele_depths[0])
        alt = int(allele_depths[1])
        gq = int(fields[3])
        mq = float(fields[5])
        dp = int(fields[2])
        # With no reads on either allele the frequency is undefined;
        # leave the sample out of the AF mean rather than divide by zero.
        if ref + alt != 0:
            AF.append(float(alt) / (ref + alt))
        GQ.append(gq)
        MQ.append(mq)
        DP.append(dp)
        ALT.append(alt)
    # statistics.mean raises on an empty sequence, so guard each mean.
    AF_m = statistics.mean(AF) if AF else 'NA'
    GQ_m = statistics.mean(GQ) if GQ else 'NA'
    MQ_m = statistics.mean(MQ) if MQ else 'NA'
    DP_a = sum(DP)
    ALT_a = sum(ALT)
    return AF_m, GQ_m, MQ_m, DP_a, ALT_a


# Write a tab-separated table: every non-header record of the input VCF,
# followed by its ANN location and the AF/GQ/MQ/DP/ALT summary of its
# sample columns.
file_name = prefix + '_variant_quality_location.txt'
# NOTE: the column header is hard-coded (27 LCL5 sample names); it is not
# taken from the #CHROM line of the input VCF.
outputcolumn = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tQuartet_DNA_BGI_SEQ2000_BGI_1_20180518_LCL5\tQuartet_DNA_BGI_SEQ2000_BGI_2_20180530_LCL5\tQuartet_DNA_BGI_SEQ2000_BGI_3_20180530_LCL5\tQuartet_DNA_BGI_T7_WGE_1_20191105_LCL5\tQuartet_DNA_BGI_T7_WGE_2_20191105_LCL5\tQuartet_DNA_BGI_T7_WGE_3_20191105_LCL5\tQuartet_DNA_ILM_Nova_ARD_1_20181108_LCL5\tQuartet_DNA_ILM_Nova_ARD_2_20181108_LCL5\tQuartet_DNA_ILM_Nova_ARD_3_20181108_LCL5\tQuartet_DNA_ILM_Nova_ARD_4_20190111_LCL5\tQuartet_DNA_ILM_Nova_ARD_5_20190111_LCL5\tQuartet_DNA_ILM_Nova_ARD_6_20190111_LCL5\tQuartet_DNA_ILM_Nova_BRG_1_20180930_LCL5\tQuartet_DNA_ILM_Nova_BRG_2_20180930_LCL5\tQuartet_DNA_ILM_Nova_BRG_3_20180930_LCL5\tQuartet_DNA_ILM_Nova_WUX_1_20190917_LCL5\tQuartet_DNA_ILM_Nova_WUX_2_20190917_LCL5\tQuartet_DNA_ILM_Nova_WUX_3_20190917_LCL5\tQuartet_DNA_ILM_XTen_ARD_1_20170403_LCL5\tQuartet_DNA_ILM_XTen_ARD_2_20170403_LCL5\tQuartet_DNA_ILM_XTen_ARD_3_20170403_LCL5\tQuartet_DNA_ILM_XTen_NVG_1_20170329_LCL5\tQuartet_DNA_ILM_XTen_NVG_2_20170329_LCL5\tQuartet_DNA_ILM_XTen_NVG_3_20170329_LCL5\tQuartet_DNA_ILM_XTen_WUX_1_20170216_LCL5\tQuartet_DNA_ILM_XTen_WUX_2_20170216_LCL5\tQuartet_DNA_ILM_XTen_WUX_3_20170216_LCL5' +'\t'+ 'location' + '\t' + 'AF' + '\t' + 'GQ' + '\t' + 'MQ' + '\t' + 'DP' + '\t' + 'ALT' +'\n'
# The context manager closes (and flushes) the output even if a record
# fails to parse part-way through the input.
with open(file_name, 'w') as outfile:
    outfile.write(outputcolumn)
    for line in fileinput.input(multi_sample_vcf):
        # Meta-information and header lines of the VCF are dropped.
        if line.startswith('#'):
            continue
        strings = line.strip().split('\t')
        # Column 8 (index 7) is INFO; sample columns start at index 9.
        repeat = get_location(strings[7])
        AF, GQ, MQ, DP, ALT = extract_info_normal(strings[9:])
        summary = [repeat, str(AF), str(GQ), str(MQ), str(DP), str(ALT)]
        outLine = '\t'.join(strings + summary) + '\n'
        outfile.write(outLine)



+ 52
- 0
codescripts/vcf_mq_af.py 查看文件

from __future__ import division
import sys, argparse, os
import fileinput
import re
import statistics

# Command-line interface: the VCF to read and the prefix of the output file name.
parser = argparse.ArgumentParser(
    description="this script is to get mapping quality, allele frequency and alternative depth"
)
parser.add_argument(
    '-vcf', '--normed_vcf',
    type=str,
    required=True,
    help='The VCF file you want to used',
)
parser.add_argument(
    '-prefix', '--prefix',
    type=str,
    required=True,
    help='Prefix of output file name',
)

# Both options are required, so parse_args() exits with a usage message
# when either one is missing.
args = parser.parse_args()
normed_vcf, prefix = args.normed_vcf, args.prefix


# Copy the input VCF to <prefix>.variant_quality_location.vcf, appending
# three values to each record: MQ (from the INFO column), ALT (summed
# non-reference read counts) and AF (ALT divided by the summed AD counts).
file_name = prefix + '.variant_quality_location.vcf'

# The context manager closes (and flushes) the output even if a record
# fails to parse part-way through the input.
with open(file_name, 'w') as outfile:
    for line in fileinput.input(normed_vcf):
        # Header / meta-information lines are passed through unchanged.
        if line.startswith('#'):
            outfile.write(line)
            continue
        strings = line.strip().split('\t')
        strings[8] = strings[8] + ':MQ:ALT:AF'
        ## MQ
        # Reset for every record so a record without MQ= cannot inherit
        # the previous record's value; '.' marks the value as missing.
        MQ = '.'
        for element in strings[7].strip().split(';'):
            if element.startswith('MQ='):
                MQ = element.split('=')[1]
        ## ALT
        # Only the first sample column (index 9) is read; its second
        # ':'-separated field is taken as the comma-separated allele depths.
        ad_single = [int(i) for i in strings[9].split(':')[1].split(',')]
        # DP here is the sum of the allele depths, not the FORMAT DP field.
        DP = sum(ad_single)
        if DP != 0:
            # Everything after the first (reference) count is alt support.
            ALT = sum(ad_single[1:])
            AF = ALT/DP
        else:
            ALT = 0
            AF = 'NA'
        # The new values are appended to the end of the line, i.e. to the
        # last sample column.
        outLine = '\t'.join(strings) + ':' + MQ + ':' + str(ALT) + ':' + str(AF) + '\n'
        outfile.write(outLine)



+ 4
- 0
inputs 查看文件

{ {
"{{ project_name }}.LCL6normZip.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", "{{ project_name }}.LCL6normZip.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
"{{ project_name }}.LCL5extract_info.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/high_confidence_call_manuscript:v1.1",
"{{ project_name }}.LCL7merge.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", "{{ project_name }}.LCL7merge.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
"{{ project_name }}.fasta": "GRCh38.d1.vd1.fa", "{{ project_name }}.fasta": "GRCh38.d1.vd1.fa",
"{{ project_name }}.LCL6familyzipIndex.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", "{{ project_name }}.LCL6familyzipIndex.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
"{{ project_name }}.LCL8mergeVCF.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", "{{ project_name }}.LCL8mergeVCF.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
"{{ project_name }}.LCL5votes.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/high_confidence_call_manuscript:v1.1", "{{ project_name }}.LCL5votes.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/high_confidence_call_manuscript:v1.1",
"{{ project_name }}.LCL7familyzipIndex.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", "{{ project_name }}.LCL7familyzipIndex.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
"{{ project_name }}.LCL8extract_info.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/high_confidence_call_manuscript:v1.1",
"{{ project_name }}.LCL5familyzipIndex.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", "{{ project_name }}.LCL5familyzipIndex.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
"{{ project_name }}.LCL5normZip.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", "{{ project_name }}.LCL5normZip.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
"{{ project_name }}.LCL6VCFrename.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", "{{ project_name }}.LCL6VCFrename.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
"{{ project_name }}.LCL7extract_info.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/high_confidence_call_manuscript:v1.1",
"{{ project_name }}.LCL5merge.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", "{{ project_name }}.LCL5merge.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
"{{ project_name }}.LCL8votes.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/high_confidence_call_manuscript:v1.1", "{{ project_name }}.LCL8votes.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/high_confidence_call_manuscript:v1.1",
"{{ project_name }}.reformVCF.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/high_confidence_call:v1.1", "{{ project_name }}.reformVCF.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/high_confidence_call:v1.1",
"{{ project_name }}.LCL8familyzipIndex.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", "{{ project_name }}.LCL8familyzipIndex.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
"{{ project_name }}.LCL5bedAnnotation.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", "{{ project_name }}.LCL5bedAnnotation.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
"{{ project_name }}.LCL6mergeVCF.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", "{{ project_name }}.LCL6mergeVCF.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
"{{ project_name }}.LCL6extract_info.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/high_confidence_call_manuscript:v1.1",
"{{ project_name }}.LCL7variantsNorm.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/bcftools:v1.9", "{{ project_name }}.LCL7variantsNorm.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/bcftools:v1.9",
"{{ project_name }}.LCL7mergeVCF.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", "{{ project_name }}.LCL7mergeVCF.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
"{{ project_name }}.LCL8merge.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", "{{ project_name }}.LCL8merge.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",

+ 38
- 1
tasks/bed_annotation.wdl 查看文件

command <<< command <<<


rtg vcfannotate --bed-info=${repeat_bed} -i ${merged_vcf} -o ${sample}.normed.repeatAnno.vcf.gz
rtg vcfannotate --bed-info=${repeat_bed} -i ${merged_vcf} -o ${sample}.normed.repeatAnno.vcf.gz
## DP
zcat ${sample}.normed.repeatAnno.vcf.gz | grep -v '##' | awk '
BEGIN { OFS = "\t" }
NF > 2 && FNR > 1 {
for ( i=9; i<=NF; i++ ) {
split($i,a,":") ;$i = a[3];
}
}
{ print }
' > ${sample}.depth.txt
## GQ
zcat ${sample}.normed.repeatAnno.vcf.gz | grep -v '##' | awk '
BEGIN { OFS = "\t" }
NF > 2 && FNR > 1 {
for ( i=9; i<=NF; i++ ) {
split($i,a,":") ;$i = a[4];
}
}
{ print }
' > ${sample}.genotypeQuality.txt

## MQ
zcat ${sample}.normed.repeatAnno.vcf.gz | grep -v '##' | awk '
BEGIN { OFS = "\t" }
NF > 2 && FNR > 1 {
for ( i=9; i<=NF; i++ ) {
split($i,a,":") ;$i = a[6];
}
}
{ print }
' > ${sample}.mappinyQuality.txt

## Allele frequency



>>> >>>


} }
output { output {
File repeat_annotated_vcf = "${sample}.normed.repeatAnno.vcf.gz" File repeat_annotated_vcf = "${sample}.normed.repeatAnno.vcf.gz"
File repeat_annotated_vcf_idx = "${sample}.normed.repeatAnno.vcf.gz.tbi"
} }
} }

+ 4
- 6
tasks/extract_info.wdl 查看文件

task extract_info { task extract_info {
File vcf
String vcf_name = basename(vcf,".vcf")
File normed_vcf
String sampleName
String docker String docker
String cluster_config String cluster_config
String disk_size String disk_size
command <<< command <<<


python /opt/extract_vcf_information.py -i ${vcf} -o ${vcf_name}.txt
cat ${vcf_name}.txt | cut -f23,25,27,22,12,21,3,18,4,8,11,15 > ${vcf_name}.essential.txt
python /opt/vcf_mq_af.py -vcf ${normed_vcf} -prefix ${sampleName}


>>> >>>


dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/" dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
} }
output { output {
File vcf_info = "${vcf_name}.txt"
File vcf_needed_info = "${vcf_name}.essential.txt"
File vcf_info = "${sampleName}.variant_quality_location.vcf"
} }
} }

+ 33
- 4
workflow.wdl 查看文件

import "./tasks/variantsNorm.wdl" as variantsNorm import "./tasks/variantsNorm.wdl" as variantsNorm
import "./tasks/extract_info.wdl" as extract_info
import "./tasks/mendelian.wdl" as mendelian import "./tasks/mendelian.wdl" as mendelian
import "./tasks/zipIndex.wdl" as zipIndex import "./tasks/zipIndex.wdl" as zipIndex
import "./tasks/VCFrename.wdl" as VCFrename import "./tasks/VCFrename.wdl" as VCFrename
cluster_config=cluster_config, cluster_config=cluster_config,
disk_size=disk_size disk_size=disk_size
} }
call extract_info.extract_info as LCL5extract_info {
input:
normed_vcf=LCL5variantsNorm.normed_vcf,
sampleName=quartet[4],
cluster_config=cluster_config,
disk_size=disk_size
}
call extract_info.extract_info as LCL6extract_info {
input:
normed_vcf=LCL6variantsNorm.normed_vcf,
sampleName=quartet[5],
cluster_config=cluster_config,
disk_size=disk_size
}
call extract_info.extract_info as LCL7extract_info {
input:
normed_vcf=LCL7variantsNorm.normed_vcf,
sampleName=quartet[6],
cluster_config=cluster_config,
disk_size=disk_size
}
call extract_info.extract_info as LCL8extract_info {
input:
normed_vcf=LCL8variantsNorm.normed_vcf,
sampleName=quartet[8],
cluster_config=cluster_config,
disk_size=disk_size
}
call zipIndex.zipIndex as LCL5normZip{ call zipIndex.zipIndex as LCL5normZip{
input: input:
vcf=LCL5variantsNorm.normed_mq_vcf,
vcf=LCL5extract_info.vcf_info,
cluster_config=cluster_config, cluster_config=cluster_config,
disk_size=disk_size disk_size=disk_size
} }
call zipIndex.zipIndex as LCL6normZip{ call zipIndex.zipIndex as LCL6normZip{
input: input:
vcf=LCL6variantsNorm.normed_mq_vcf,
vcf=LCL6extract_info.vcf_info,
cluster_config=cluster_config, cluster_config=cluster_config,
disk_size=disk_size disk_size=disk_size
} }
call zipIndex.zipIndex as LCL7normZip{ call zipIndex.zipIndex as LCL7normZip{
input: input:
vcf=LCL7variantsNorm.normed_mq_vcf,
vcf=LCL7extract_info.vcf_info,
cluster_config=cluster_config, cluster_config=cluster_config,
disk_size=disk_size disk_size=disk_size
} }
call zipIndex.zipIndex as LCL8normZip{ call zipIndex.zipIndex as LCL8normZip{
input: input:
vcf=LCL8variantsNorm.normed_mq_vcf,
vcf=LCL8extract_info.vcf_info,
cluster_config=cluster_config, cluster_config=cluster_config,
disk_size=disk_size disk_size=disk_size
} }

正在加载...
取消
保存