Bladeren bron

vcf info

master
LUYAO REN 5 jaren geleden
bovenliggende
commit
bac3d92813
7 gewijzigde bestanden met toevoegingen van 210 en 13 verwijderingen
  1. +1
    -2
      codescripts/high_confidence_call_vote.py
  2. +78
    -0
      codescripts/variants_quality_location_intergration.py
  3. +52
    -0
      codescripts/vcf_mq_af.py
  4. +4
    -0
      inputs
  5. +38
    -1
      tasks/bed_annotation.wdl
  6. +4
    -6
      tasks/extract_info.wdl
  7. +33
    -4
      workflow.wdl

+ 1
- 2
codescripts/high_confidence_call_vote.py Bestand weergeven

# import modules
from __future__ import division
import sys, argparse, os import sys, argparse, os
import fileinput import fileinput
import re import re
from operator import itemgetter from operator import itemgetter
from collections import Counter from collections import Counter
from itertools import islice from itertools import islice
from __future__ import division


# input arguments # input arguments
parser = argparse.ArgumentParser(description="this script is to count voting number") parser = argparse.ArgumentParser(description="this script is to count voting number")

+ 78
- 0
codescripts/variants_quality_location_intergration.py Bestand weergeven

import sys, argparse, os
import fileinput
import re
import statistics

# input arguments
parser = argparse.ArgumentParser(description="this script is to intergeate vcf information, variants quality and location")

parser.add_argument('-vcf', '--multi_sample_vcf', type=str, help='The VCF file you want to count the voting number', required=True)
parser.add_argument('-prefix', '--prefix', type=str, help='Prefix of output file name', required=True)

args = parser.parse_args()
multi_sample_vcf = args.multi_sample_vcf
prefix = args.prefix


def get_location(info):
repeat = ''
if 'ANN' in info:
strings = info.strip().split(';')
for element in strings:
m = re.match('ANN',element)
if m is not None:
repeat = element.split('=')[1]
else:
repeat = '.'
return repeat


def extract_info_normal(strings):
AF = []
GQ = []
MQ = []
DP = []
ALT = []
for element in strings:
if element == '.':
pass
else:
ad = element.split(':')[1]
ref = ad.split(',')[0]
alt = ad.split(',')[1]
af = float(int(alt)/(int(ref) + int(alt)))
gq = int(element.split(':')[3])
mq = float(element.split(':')[5])
dp = int(element.split(':')[2])
AF.append(af)
GQ.append(gq)
MQ.append(mq)
DP.append(dp)
ALT.append(int(alt))
AF_m = statistics.mean(AF)
GQ_m = statistics.mean(GQ)
MQ_m = statistics.mean(MQ)
DP_a = sum(DP)
ALT_a = sum(ALT)
return AF_m,GQ_m,MQ_m,DP_a,ALT_a


file_name = prefix + '_variant_quality_location.txt'
outfile = open(file_name,'w')
outputcolumn = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tQuartet_DNA_BGI_SEQ2000_BGI_1_20180518_LCL5\tQuartet_DNA_BGI_SEQ2000_BGI_2_20180530_LCL5\tQuartet_DNA_BGI_SEQ2000_BGI_3_20180530_LCL5\tQuartet_DNA_BGI_T7_WGE_1_20191105_LCL5\tQuartet_DNA_BGI_T7_WGE_2_20191105_LCL5\tQuartet_DNA_BGI_T7_WGE_3_20191105_LCL5\tQuartet_DNA_ILM_Nova_ARD_1_20181108_LCL5\tQuartet_DNA_ILM_Nova_ARD_2_20181108_LCL5\tQuartet_DNA_ILM_Nova_ARD_3_20181108_LCL5\tQuartet_DNA_ILM_Nova_ARD_4_20190111_LCL5\tQuartet_DNA_ILM_Nova_ARD_5_20190111_LCL5\tQuartet_DNA_ILM_Nova_ARD_6_20190111_LCL5\tQuartet_DNA_ILM_Nova_BRG_1_20180930_LCL5\tQuartet_DNA_ILM_Nova_BRG_2_20180930_LCL5\tQuartet_DNA_ILM_Nova_BRG_3_20180930_LCL5\tQuartet_DNA_ILM_Nova_WUX_1_20190917_LCL5\tQuartet_DNA_ILM_Nova_WUX_2_20190917_LCL5\tQuartet_DNA_ILM_Nova_WUX_3_20190917_LCL5\tQuartet_DNA_ILM_XTen_ARD_1_20170403_LCL5\tQuartet_DNA_ILM_XTen_ARD_2_20170403_LCL5\tQuartet_DNA_ILM_XTen_ARD_3_20170403_LCL5\tQuartet_DNA_ILM_XTen_NVG_1_20170329_LCL5\tQuartet_DNA_ILM_XTen_NVG_2_20170329_LCL5\tQuartet_DNA_ILM_XTen_NVG_3_20170329_LCL5\tQuartet_DNA_ILM_XTen_WUX_1_20170216_LCL5\tQuartet_DNA_ILM_XTen_WUX_2_20170216_LCL5\tQuartet_DNA_ILM_XTen_WUX_3_20170216_LCL5' +'\t'+ 'location' + '\t' + 'AF' + '\t' + 'GQ' + '\t' + 'MQ' + '\t' + 'DP' + '\t' + 'ALT' +'\n'
outfile.write(outputcolumn)
for line in fileinput.input(multi_sample_vcf):
m = re.match('^\#',line)
if m is not None:
pass
else:
line = line.strip()
strings = line.split('\t')
repeat = get_location(strings[7])
AF,GQ,MQ,DP,ALT = extract_info_normal(strings[9:])
outLine = '\t'.join(strings) + '\t' + repeat +'\t' + str(AF) + '\t' + str(GQ) + '\t' + str(MQ) + '\t' + str(DP) + '\t' + str(ALT) + '\n'
outfile.write(outLine)



+ 52
- 0
codescripts/vcf_mq_af.py Bestand weergeven

from __future__ import division
import sys, argparse, os
import fileinput
import re
import statistics

# input arguments
parser = argparse.ArgumentParser(description="this script is to get mapping quality, allele frequency and alternative depth")

parser.add_argument('-vcf', '--normed_vcf', type=str, help='The VCF file you want to used', required=True)
parser.add_argument('-prefix', '--prefix', type=str, help='Prefix of output file name', required=True)

args = parser.parse_args()
normed_vcf = args.normed_vcf
prefix = args.prefix


file_name = prefix + '.variant_quality_location.vcf'
outfile = open(file_name,'w')

for line in fileinput.input(normed_vcf):
m = re.match('^\#',line)
if m is not None:
outfile.write(line)
else:
line = line.strip()
strings = line.split('\t')
strings[8] = strings[8] + ':MQ:ALT:AF'
infos = strings[7].strip().split(';')
## MQ
for element in infos:
m = re.match('MQ=',element)
if m is not None:
MQ = element.split('=')[1]
## ALT
ad = strings[9].split(':')[1]
ad_single = ad.split(',')
ad_single = [int(i) for i in ad_single]
DP = sum(ad_single)
if DP != 0:
ad_single.pop(0)
ALT = sum(ad_single)
AF = ALT/DP
else:
ALT = 0
AF = 'NA'
outLine = '\t'.join(strings) + ':' + MQ + ':' + str(ALT) + ':' + str(AF) + '\n'
outfile.write(outLine)



+ 4
- 0
inputs Bestand weergeven

{ {
"{{ project_name }}.LCL6normZip.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", "{{ project_name }}.LCL6normZip.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
"{{ project_name }}.LCL5extract_info.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/high_confidence_call_manuscript:v1.1",
"{{ project_name }}.LCL7merge.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", "{{ project_name }}.LCL7merge.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
"{{ project_name }}.fasta": "GRCh38.d1.vd1.fa", "{{ project_name }}.fasta": "GRCh38.d1.vd1.fa",
"{{ project_name }}.LCL6familyzipIndex.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", "{{ project_name }}.LCL6familyzipIndex.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
"{{ project_name }}.LCL8mergeVCF.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", "{{ project_name }}.LCL8mergeVCF.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
"{{ project_name }}.LCL5votes.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/high_confidence_call_manuscript:v1.1", "{{ project_name }}.LCL5votes.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/high_confidence_call_manuscript:v1.1",
"{{ project_name }}.LCL7familyzipIndex.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", "{{ project_name }}.LCL7familyzipIndex.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
"{{ project_name }}.LCL8extract_info.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/high_confidence_call_manuscript:v1.1",
"{{ project_name }}.LCL5familyzipIndex.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", "{{ project_name }}.LCL5familyzipIndex.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
"{{ project_name }}.LCL5normZip.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", "{{ project_name }}.LCL5normZip.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
"{{ project_name }}.LCL6VCFrename.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", "{{ project_name }}.LCL6VCFrename.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
"{{ project_name }}.LCL7extract_info.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/high_confidence_call_manuscript:v1.1",
"{{ project_name }}.LCL5merge.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", "{{ project_name }}.LCL5merge.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
"{{ project_name }}.LCL8votes.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/high_confidence_call_manuscript:v1.1", "{{ project_name }}.LCL8votes.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/high_confidence_call_manuscript:v1.1",
"{{ project_name }}.reformVCF.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/high_confidence_call:v1.1", "{{ project_name }}.reformVCF.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/high_confidence_call:v1.1",
"{{ project_name }}.LCL8familyzipIndex.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", "{{ project_name }}.LCL8familyzipIndex.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
"{{ project_name }}.LCL5bedAnnotation.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", "{{ project_name }}.LCL5bedAnnotation.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
"{{ project_name }}.LCL6mergeVCF.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", "{{ project_name }}.LCL6mergeVCF.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
"{{ project_name }}.LCL6extract_info.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/high_confidence_call_manuscript:v1.1",
"{{ project_name }}.LCL7variantsNorm.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/bcftools:v1.9", "{{ project_name }}.LCL7variantsNorm.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/bcftools:v1.9",
"{{ project_name }}.LCL7mergeVCF.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", "{{ project_name }}.LCL7mergeVCF.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
"{{ project_name }}.LCL8merge.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", "{{ project_name }}.LCL8merge.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",

+ 38
- 1
tasks/bed_annotation.wdl Bestand weergeven

command <<< command <<<


rtg vcfannotate --bed-info=${repeat_bed} -i ${merged_vcf} -o ${sample}.normed.repeatAnno.vcf.gz
rtg vcfannotate --bed-info=${repeat_bed} -i ${merged_vcf} -o ${sample}.normed.repeatAnno.vcf.gz
## DP
zcat ${sample}.normed.repeatAnno.vcf.gz | grep -v '##' | awk '
BEGIN { OFS = "\t" }
NF > 2 && FNR > 1 {
for ( i=9; i<=NF; i++ ) {
split($i,a,":") ;$i = a[3];
}
}
{ print }
' > ${sample}.depth.txt
## GQ
zcat ${sample}.normed.repeatAnno.vcf.gz | grep -v '##' | awk '
BEGIN { OFS = "\t" }
NF > 2 && FNR > 1 {
for ( i=9; i<=NF; i++ ) {
split($i,a,":") ;$i = a[4];
}
}
{ print }
' > ${sample}.genotypeQuality.txt

## MQ
zcat ${sample}.normed.repeatAnno.vcf.gz | grep -v '##' | awk '
BEGIN { OFS = "\t" }
NF > 2 && FNR > 1 {
for ( i=9; i<=NF; i++ ) {
split($i,a,":") ;$i = a[6];
}
}
{ print }
' > ${sample}.mappinyQuality.txt

## Allele frequency



>>> >>>


} }
output { output {
File repeat_annotated_vcf = "${sample}.normed.repeatAnno.vcf.gz" File repeat_annotated_vcf = "${sample}.normed.repeatAnno.vcf.gz"
File repeat_annotated_vcf_idx = "${sample}.normed.repeatAnno.vcf.gz.tbi"
} }
} }

+ 4
- 6
tasks/extract_info.wdl Bestand weergeven

task extract_info { task extract_info {
File vcf
String vcf_name = basename(vcf,".vcf")
File normed_vcf
String sampleName
String docker String docker
String cluster_config String cluster_config
String disk_size String disk_size
command <<< command <<<


python /opt/extract_vcf_information.py -i ${vcf} -o ${vcf_name}.txt
cat ${vcf_name}.txt | cut -f23,25,27,22,12,21,3,18,4,8,11,15 > ${vcf_name}.essential.txt
python /opt/vcf_mq_af.py -vcf ${normed_vcf} -prefix ${sampleName}


>>> >>>


dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/" dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
} }
output { output {
File vcf_info = "${vcf_name}.txt"
File vcf_needed_info = "${vcf_name}.essential.txt"
File vcf_info = "${sampleName}.variant_quality_location.vcf"
} }
} }

+ 33
- 4
workflow.wdl Bestand weergeven

import "./tasks/variantsNorm.wdl" as variantsNorm import "./tasks/variantsNorm.wdl" as variantsNorm
import "./tasks/extract_info.wdl" as extract_info
import "./tasks/mendelian.wdl" as mendelian import "./tasks/mendelian.wdl" as mendelian
import "./tasks/zipIndex.wdl" as zipIndex import "./tasks/zipIndex.wdl" as zipIndex
import "./tasks/VCFrename.wdl" as VCFrename import "./tasks/VCFrename.wdl" as VCFrename
cluster_config=cluster_config, cluster_config=cluster_config,
disk_size=disk_size disk_size=disk_size
} }
call extract_info.extract_info as LCL5extract_info {
input:
normed_vcf=LCL5variantsNorm.normed_vcf,
sampleName=quartet[4],
cluster_config=cluster_config,
disk_size=disk_size
}
call extract_info.extract_info as LCL6extract_info {
input:
normed_vcf=LCL6variantsNorm.normed_vcf,
sampleName=quartet[5],
cluster_config=cluster_config,
disk_size=disk_size
}
call extract_info.extract_info as LCL7extract_info {
input:
normed_vcf=LCL7variantsNorm.normed_vcf,
sampleName=quartet[6],
cluster_config=cluster_config,
disk_size=disk_size
}
call extract_info.extract_info as LCL8extract_info {
input:
normed_vcf=LCL8variantsNorm.normed_vcf,
sampleName=quartet[8],
cluster_config=cluster_config,
disk_size=disk_size
}
call zipIndex.zipIndex as LCL5normZip{ call zipIndex.zipIndex as LCL5normZip{
input: input:
vcf=LCL5variantsNorm.normed_mq_vcf,
vcf=LCL5extract_info.vcf_info,
cluster_config=cluster_config, cluster_config=cluster_config,
disk_size=disk_size disk_size=disk_size
} }
call zipIndex.zipIndex as LCL6normZip{ call zipIndex.zipIndex as LCL6normZip{
input: input:
vcf=LCL6variantsNorm.normed_mq_vcf,
vcf=LCL6extract_info.vcf_info,
cluster_config=cluster_config, cluster_config=cluster_config,
disk_size=disk_size disk_size=disk_size
} }
call zipIndex.zipIndex as LCL7normZip{ call zipIndex.zipIndex as LCL7normZip{
input: input:
vcf=LCL7variantsNorm.normed_mq_vcf,
vcf=LCL7extract_info.vcf_info,
cluster_config=cluster_config, cluster_config=cluster_config,
disk_size=disk_size disk_size=disk_size
} }
call zipIndex.zipIndex as LCL8normZip{ call zipIndex.zipIndex as LCL8normZip{
input: input:
vcf=LCL8variantsNorm.normed_mq_vcf,
vcf=LCL8extract_info.vcf_info,
cluster_config=cluster_config, cluster_config=cluster_config,
disk_size=disk_size disk_size=disk_size
} }

Laden…
Annuleren
Opslaan