# import modules
from __future__ import division
import sys, argparse, os
import fileinput
import re
from operator import itemgetter
from collections import Counter
from itertools import islice
# input arguments
parser = argparse.ArgumentParser(description="this script counts the voting number")
import sys, argparse, os
import fileinput
import re
import statistics
# input arguments
parser = argparse.ArgumentParser(description="this script integrates VCF information: variant quality and location")
parser.add_argument('-vcf', '--multi_sample_vcf', type=str, help='The multi-sample VCF file to extract information from', required=True)
parser.add_argument('-prefix', '--prefix', type=str, help='Prefix of output file name', required=True)
args = parser.parse_args()
multi_sample_vcf = args.multi_sample_vcf
prefix = args.prefix
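# get_location: return the repeat-region annotation from a record's INFO column,
# assumed here to be stored in an ANN= tag by the upstream bed annotation step;
# '.' is returned when the tag is absent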
def get_location(info):
    repeat = '.'
    if 'ANN' in info:
        for element in info.strip().split(';'):
            m = re.match('ANN', element)
            if m is not None:
                repeat = element.split('=')[1]
                break
    return repeat
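# extract_info_normal: summarise one variant's per-sample columns. Assumes the FORMAT
# layout produced upstream (GT:AD:DP:GQ:...:MQ), i.e. AD is sub-field 2, DP sub-field 3,
# GQ sub-field 4 and MQ sub-field 6; returns mean AF/GQ/MQ plus summed DP and ALT depth.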
def extract_info_normal(strings):
    AF = []
    GQ = []
    MQ = []
    DP = []
    ALT = []
    for element in strings:
        if element == '.':
            pass
        else:
            ad = element.split(':')[1]
            ref = ad.split(',')[0]
            alt = ad.split(',')[1]
            if int(ref) + int(alt) == 0:
                # no reads for this sample at this site; skip it to avoid dividing by zero
                continue
            af = float(int(alt)/(int(ref) + int(alt)))
            gq = int(element.split(':')[3])
            mq = float(element.split(':')[5])
            dp = int(element.split(':')[2])
            AF.append(af)
            GQ.append(gq)
            MQ.append(mq)
            DP.append(dp)
            ALT.append(int(alt))
    AF_m = statistics.mean(AF)
    GQ_m = statistics.mean(GQ)
    MQ_m = statistics.mean(MQ)
    DP_a = sum(DP)
    ALT_a = sum(ALT)
    return AF_m, GQ_m, MQ_m, DP_a, ALT_a
file_name = prefix + '_variant_quality_location.txt'
outfile = open(file_name,'w')
outputcolumn = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tQuartet_DNA_BGI_SEQ2000_BGI_1_20180518_LCL5\tQuartet_DNA_BGI_SEQ2000_BGI_2_20180530_LCL5\tQuartet_DNA_BGI_SEQ2000_BGI_3_20180530_LCL5\tQuartet_DNA_BGI_T7_WGE_1_20191105_LCL5\tQuartet_DNA_BGI_T7_WGE_2_20191105_LCL5\tQuartet_DNA_BGI_T7_WGE_3_20191105_LCL5\tQuartet_DNA_ILM_Nova_ARD_1_20181108_LCL5\tQuartet_DNA_ILM_Nova_ARD_2_20181108_LCL5\tQuartet_DNA_ILM_Nova_ARD_3_20181108_LCL5\tQuartet_DNA_ILM_Nova_ARD_4_20190111_LCL5\tQuartet_DNA_ILM_Nova_ARD_5_20190111_LCL5\tQuartet_DNA_ILM_Nova_ARD_6_20190111_LCL5\tQuartet_DNA_ILM_Nova_BRG_1_20180930_LCL5\tQuartet_DNA_ILM_Nova_BRG_2_20180930_LCL5\tQuartet_DNA_ILM_Nova_BRG_3_20180930_LCL5\tQuartet_DNA_ILM_Nova_WUX_1_20190917_LCL5\tQuartet_DNA_ILM_Nova_WUX_2_20190917_LCL5\tQuartet_DNA_ILM_Nova_WUX_3_20190917_LCL5\tQuartet_DNA_ILM_XTen_ARD_1_20170403_LCL5\tQuartet_DNA_ILM_XTen_ARD_2_20170403_LCL5\tQuartet_DNA_ILM_XTen_ARD_3_20170403_LCL5\tQuartet_DNA_ILM_XTen_NVG_1_20170329_LCL5\tQuartet_DNA_ILM_XTen_NVG_2_20170329_LCL5\tQuartet_DNA_ILM_XTen_NVG_3_20170329_LCL5\tQuartet_DNA_ILM_XTen_WUX_1_20170216_LCL5\tQuartet_DNA_ILM_XTen_WUX_2_20170216_LCL5\tQuartet_DNA_ILM_XTen_WUX_3_20170216_LCL5' + '\t' + 'location' + '\t' + 'AF' + '\t' + 'GQ' + '\t' + 'MQ' + '\t' + 'DP' + '\t' + 'ALT' + '\n'
outfile.write(outputcolumn)
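# walk the merged multi-sample VCF: header lines are skipped, and every record is written
# out with the repeat annotation and the per-variant summary statistics appended as extra columns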
for line in fileinput.input(multi_sample_vcf):
    m = re.match(r'^#', line)
    if m is not None:
        pass
    else:
        line = line.strip()
        strings = line.split('\t')
        repeat = get_location(strings[7])
        AF, GQ, MQ, DP, ALT = extract_info_normal(strings[9:])
        outLine = '\t'.join(strings) + '\t' + repeat + '\t' + str(AF) + '\t' + str(GQ) + '\t' + str(MQ) + '\t' + str(DP) + '\t' + str(ALT) + '\n'
        outfile.write(outLine)
outfile.close()
from __future__ import division
import sys, argparse, os
import fileinput
import re
import statistics
# input arguments
parser = argparse.ArgumentParser(description="this script extracts mapping quality, allele frequency and alternative allele depth")
parser.add_argument('-vcf', '--normed_vcf', type=str, help='The normalized VCF file to use', required=True)
parser.add_argument('-prefix', '--prefix', type=str, help='Prefix of output file name', required=True)
args = parser.parse_args()
normed_vcf = args.normed_vcf
prefix = args.prefix
file_name = prefix + '.variant_quality_location.vcf'
outfile = open(file_name,'w')
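# walk the normalized single-sample VCF: header lines are copied through unchanged; for each
# record, MQ is read from the INFO column, ALT depth and allele frequency are computed from
# the AD sub-field, and all three are appended to the FORMAT and sample columns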
for line in fileinput.input(normed_vcf):
    m = re.match(r'^#', line)
    if m is not None:
        outfile.write(line)
    else:
        line = line.strip()
        strings = line.split('\t')
        strings[8] = strings[8] + ':MQ:ALT:AF'
        infos = strings[7].strip().split(';')
        ## MQ
        MQ = '.'  # default when the record carries no MQ tag in INFO
        for element in infos:
            m = re.match('MQ=', element)
            if m is not None:
                MQ = element.split('=')[1]
        ## ALT
        ad = strings[9].split(':')[1]
        ad_single = ad.split(',')
        ad_single = [int(i) for i in ad_single]
        DP = sum(ad_single)
        if DP != 0:
            ad_single.pop(0)
            ALT = sum(ad_single)
            AF = ALT/DP
        else:
            ALT = 0
            AF = 'NA'
        outLine = '\t'.join(strings) + ':' + MQ + ':' + str(ALT) + ':' + str(AF) + '\n'
        outfile.write(outLine)
outfile.close()
{
  "{{ project_name }}.LCL6normZip.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
  "{{ project_name }}.LCL5extract_info.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/high_confidence_call_manuscript:v1.1",
  "{{ project_name }}.LCL7merge.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
  "{{ project_name }}.fasta": "GRCh38.d1.vd1.fa",
  "{{ project_name }}.LCL6familyzipIndex.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
  "{{ project_name }}.LCL8mergeVCF.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
  "{{ project_name }}.LCL5votes.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/high_confidence_call_manuscript:v1.1",
  "{{ project_name }}.LCL7familyzipIndex.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
  "{{ project_name }}.LCL8extract_info.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/high_confidence_call_manuscript:v1.1",
  "{{ project_name }}.LCL5familyzipIndex.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
  "{{ project_name }}.LCL5normZip.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
  "{{ project_name }}.LCL6VCFrename.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
  "{{ project_name }}.LCL7extract_info.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/high_confidence_call_manuscript:v1.1",
  "{{ project_name }}.LCL5merge.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
  "{{ project_name }}.LCL8votes.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/high_confidence_call_manuscript:v1.1",
  "{{ project_name }}.reformVCF.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/high_confidence_call:v1.1",
  "{{ project_name }}.LCL8familyzipIndex.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
  "{{ project_name }}.LCL5bedAnnotation.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
  "{{ project_name }}.LCL6mergeVCF.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
  "{{ project_name }}.LCL6extract_info.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/high_confidence_call_manuscript:v1.1",
  "{{ project_name }}.LCL7variantsNorm.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/bcftools:v1.9",
  "{{ project_name }}.LCL7mergeVCF.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
  "{{ project_name }}.LCL8merge.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
command <<<
    rtg vcfannotate --bed-info=${repeat_bed} -i ${merged_vcf} -o ${sample}.normed.repeatAnno.vcf.gz

    ## DP: keep the 3rd sub-field of every sample column
    zcat ${sample}.normed.repeatAnno.vcf.gz | grep -v '##' | awk '
    BEGIN { OFS = "\t" }
    NF > 2 && FNR > 1 {
        for ( i=9; i<=NF; i++ ) {
            split($i, a, ":"); $i = a[3];
        }
    }
    { print }
    ' > ${sample}.depth.txt

    ## GQ: keep the 4th sub-field of every sample column
    zcat ${sample}.normed.repeatAnno.vcf.gz | grep -v '##' | awk '
    BEGIN { OFS = "\t" }
    NF > 2 && FNR > 1 {
        for ( i=9; i<=NF; i++ ) {
            split($i, a, ":"); $i = a[4];
        }
    }
    { print }
    ' > ${sample}.genotypeQuality.txt

    ## MQ: keep the 6th sub-field of every sample column
    zcat ${sample}.normed.repeatAnno.vcf.gz | grep -v '##' | awk '
    BEGIN { OFS = "\t" }
    NF > 2 && FNR > 1 {
        for ( i=9; i<=NF; i++ ) {
            split($i, a, ":"); $i = a[6];
        }
    }
    { print }
    ' > ${sample}.mappingQuality.txt

    ## Allele frequency
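    # Sketch only, assuming AD is the 2nd FORMAT sub-field: per-sample AF = ALT/(REF+ALT),
    # mirroring vcf_mq_af.py above; the alleleFrequency.txt output name is illustrative.
    zcat ${sample}.normed.repeatAnno.vcf.gz | grep -v '##' | awk '
    BEGIN { OFS = "\t" }
    NF > 2 && FNR > 1 {
        for ( i=9; i<=NF; i++ ) {
            split($i, a, ":");
            n = split(a[2], d, ",");
            dp = 0; for (j = 1; j <= n; j++) dp += d[j];
            # NA when the sample has no reads at this site
            $i = (dp > 0) ? (dp - d[1]) / dp : "NA";
        }
    }
    { print }
    ' > ${sample}.alleleFrequency.txt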
>>>
}
output {
    File repeat_annotated_vcf = "${sample}.normed.repeatAnno.vcf.gz"
    File repeat_annotated_vcf_idx = "${sample}.normed.repeatAnno.vcf.gz.tbi"
}
}
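# extract_info: run vcf_mq_af.py on a normalized single-sample VCF to append per-variant
# MQ, ALT depth and allele frequency to the FORMAT and sample columns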
task extract_info {
    File normed_vcf
    String sampleName
    String docker
    String cluster_config
    String disk_size
    command <<<
        python /opt/vcf_mq_af.py -vcf ${normed_vcf} -prefix ${sampleName}
    >>>
    dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
}
output {
    File vcf_info = "${sampleName}.variant_quality_location.vcf"
}
}
import "./tasks/variantsNorm.wdl" as variantsNorm | import "./tasks/variantsNorm.wdl" as variantsNorm | ||||
import "./tasks/extract_info.wdl" as extract_info | |||||
import "./tasks/mendelian.wdl" as mendelian | import "./tasks/mendelian.wdl" as mendelian | ||||
import "./tasks/zipIndex.wdl" as zipIndex | import "./tasks/zipIndex.wdl" as zipIndex | ||||
import "./tasks/VCFrename.wdl" as VCFrename | import "./tasks/VCFrename.wdl" as VCFrename | ||||
cluster_config=cluster_config, | cluster_config=cluster_config, | ||||
disk_size=disk_size | disk_size=disk_size | ||||
} | } | ||||
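    # annotate each quartet member's normalized VCF with per-variant MQ, ALT depth and allele frequency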
    call extract_info.extract_info as LCL5extract_info {
        input:
            normed_vcf=LCL5variantsNorm.normed_vcf,
            sampleName=quartet[4],
            cluster_config=cluster_config,
            disk_size=disk_size
    }
    call extract_info.extract_info as LCL6extract_info {
        input:
            normed_vcf=LCL6variantsNorm.normed_vcf,
            sampleName=quartet[5],
            cluster_config=cluster_config,
            disk_size=disk_size
    }
    call extract_info.extract_info as LCL7extract_info {
        input:
            normed_vcf=LCL7variantsNorm.normed_vcf,
            sampleName=quartet[6],
            cluster_config=cluster_config,
            disk_size=disk_size
    }
    call extract_info.extract_info as LCL8extract_info {
        input:
            normed_vcf=LCL8variantsNorm.normed_vcf,
            sampleName=quartet[7],
            cluster_config=cluster_config,
            disk_size=disk_size
    }
    call zipIndex.zipIndex as LCL5normZip{
        input:
            vcf=LCL5extract_info.vcf_info,
            cluster_config=cluster_config,
            disk_size=disk_size
    }
    call zipIndex.zipIndex as LCL6normZip{
        input:
            vcf=LCL6extract_info.vcf_info,
            cluster_config=cluster_config,
            disk_size=disk_size
    }
    call zipIndex.zipIndex as LCL7normZip{
        input:
            vcf=LCL7extract_info.vcf_info,
            cluster_config=cluster_config,
            disk_size=disk_size
    }
    call zipIndex.zipIndex as LCL8normZip{
        input:
            vcf=LCL8extract_info.vcf_info,
            cluster_config=cluster_config,
            disk_size=disk_size
    }