@@ -1,4 +1,4 @@ | |||
# import modules | |||
from __future__ import division | |||
import sys, argparse, os | |||
import fileinput | |||
import re | |||
@@ -6,7 +6,6 @@ import pandas as pd | |||
from operator import itemgetter | |||
from collections import Counter | |||
from itertools import islice | |||
from __future__ import division | |||
# input arguments | |||
parser = argparse.ArgumentParser(description="this script is to count voting number") |
@@ -0,0 +1,78 @@ | |||
import sys, argparse, os | |||
import fileinput | |||
import re | |||
import statistics | |||
# input arguments | |||
parser = argparse.ArgumentParser(description="this script is to intergeate vcf information, variants quality and location") | |||
parser.add_argument('-vcf', '--multi_sample_vcf', type=str, help='The VCF file you want to count the voting number', required=True) | |||
parser.add_argument('-prefix', '--prefix', type=str, help='Prefix of output file name', required=True) | |||
args = parser.parse_args() | |||
multi_sample_vcf = args.multi_sample_vcf | |||
prefix = args.prefix | |||
def get_location(info): | |||
repeat = '' | |||
if 'ANN' in info: | |||
strings = info.strip().split(';') | |||
for element in strings: | |||
m = re.match('ANN',element) | |||
if m is not None: | |||
repeat = element.split('=')[1] | |||
else: | |||
repeat = '.' | |||
return repeat | |||
def extract_info_normal(strings): | |||
AF = [] | |||
GQ = [] | |||
MQ = [] | |||
DP = [] | |||
ALT = [] | |||
for element in strings: | |||
if element == '.': | |||
pass | |||
else: | |||
ad = element.split(':')[1] | |||
ref = ad.split(',')[0] | |||
alt = ad.split(',')[1] | |||
af = float(int(alt)/(int(ref) + int(alt))) | |||
gq = int(element.split(':')[3]) | |||
mq = float(element.split(':')[5]) | |||
dp = int(element.split(':')[2]) | |||
AF.append(af) | |||
GQ.append(gq) | |||
MQ.append(mq) | |||
DP.append(dp) | |||
ALT.append(int(alt)) | |||
AF_m = statistics.mean(AF) | |||
GQ_m = statistics.mean(GQ) | |||
MQ_m = statistics.mean(MQ) | |||
DP_a = sum(DP) | |||
ALT_a = sum(ALT) | |||
return AF_m,GQ_m,MQ_m,DP_a,ALT_a | |||
file_name = prefix + '_variant_quality_location.txt' | |||
outfile = open(file_name,'w') | |||
outputcolumn = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tQuartet_DNA_BGI_SEQ2000_BGI_1_20180518_LCL5\tQuartet_DNA_BGI_SEQ2000_BGI_2_20180530_LCL5\tQuartet_DNA_BGI_SEQ2000_BGI_3_20180530_LCL5\tQuartet_DNA_BGI_T7_WGE_1_20191105_LCL5\tQuartet_DNA_BGI_T7_WGE_2_20191105_LCL5\tQuartet_DNA_BGI_T7_WGE_3_20191105_LCL5\tQuartet_DNA_ILM_Nova_ARD_1_20181108_LCL5\tQuartet_DNA_ILM_Nova_ARD_2_20181108_LCL5\tQuartet_DNA_ILM_Nova_ARD_3_20181108_LCL5\tQuartet_DNA_ILM_Nova_ARD_4_20190111_LCL5\tQuartet_DNA_ILM_Nova_ARD_5_20190111_LCL5\tQuartet_DNA_ILM_Nova_ARD_6_20190111_LCL5\tQuartet_DNA_ILM_Nova_BRG_1_20180930_LCL5\tQuartet_DNA_ILM_Nova_BRG_2_20180930_LCL5\tQuartet_DNA_ILM_Nova_BRG_3_20180930_LCL5\tQuartet_DNA_ILM_Nova_WUX_1_20190917_LCL5\tQuartet_DNA_ILM_Nova_WUX_2_20190917_LCL5\tQuartet_DNA_ILM_Nova_WUX_3_20190917_LCL5\tQuartet_DNA_ILM_XTen_ARD_1_20170403_LCL5\tQuartet_DNA_ILM_XTen_ARD_2_20170403_LCL5\tQuartet_DNA_ILM_XTen_ARD_3_20170403_LCL5\tQuartet_DNA_ILM_XTen_NVG_1_20170329_LCL5\tQuartet_DNA_ILM_XTen_NVG_2_20170329_LCL5\tQuartet_DNA_ILM_XTen_NVG_3_20170329_LCL5\tQuartet_DNA_ILM_XTen_WUX_1_20170216_LCL5\tQuartet_DNA_ILM_XTen_WUX_2_20170216_LCL5\tQuartet_DNA_ILM_XTen_WUX_3_20170216_LCL5' +'\t'+ 'location' + '\t' + 'AF' + '\t' + 'GQ' + '\t' + 'MQ' + '\t' + 'DP' + '\t' + 'ALT' +'\n' | |||
outfile.write(outputcolumn) | |||
for line in fileinput.input(multi_sample_vcf): | |||
m = re.match('^\#',line) | |||
if m is not None: | |||
pass | |||
else: | |||
line = line.strip() | |||
strings = line.split('\t') | |||
repeat = get_location(strings[7]) | |||
AF,GQ,MQ,DP,ALT = extract_info_normal(strings[9:]) | |||
outLine = '\t'.join(strings) + '\t' + repeat +'\t' + str(AF) + '\t' + str(GQ) + '\t' + str(MQ) + '\t' + str(DP) + '\t' + str(ALT) + '\n' | |||
outfile.write(outLine) | |||
@@ -0,0 +1,52 @@ | |||
from __future__ import division | |||
import sys, argparse, os | |||
import fileinput | |||
import re | |||
import statistics | |||
# input arguments | |||
parser = argparse.ArgumentParser(description="this script is to get mapping quality, allele frequency and alternative depth") | |||
parser.add_argument('-vcf', '--normed_vcf', type=str, help='The VCF file you want to used', required=True) | |||
parser.add_argument('-prefix', '--prefix', type=str, help='Prefix of output file name', required=True) | |||
args = parser.parse_args() | |||
normed_vcf = args.normed_vcf | |||
prefix = args.prefix | |||
file_name = prefix + '.variant_quality_location.vcf' | |||
outfile = open(file_name,'w') | |||
for line in fileinput.input(normed_vcf): | |||
m = re.match('^\#',line) | |||
if m is not None: | |||
outfile.write(line) | |||
else: | |||
line = line.strip() | |||
strings = line.split('\t') | |||
strings[8] = strings[8] + ':MQ:ALT:AF' | |||
infos = strings[7].strip().split(';') | |||
## MQ | |||
for element in infos: | |||
m = re.match('MQ=',element) | |||
if m is not None: | |||
MQ = element.split('=')[1] | |||
## ALT | |||
ad = strings[9].split(':')[1] | |||
ad_single = ad.split(',') | |||
ad_single = [int(i) for i in ad_single] | |||
DP = sum(ad_single) | |||
if DP != 0: | |||
ad_single.pop(0) | |||
ALT = sum(ad_single) | |||
AF = ALT/DP | |||
else: | |||
ALT = 0 | |||
AF = 'NA' | |||
outLine = '\t'.join(strings) + ':' + MQ + ':' + str(ALT) + ':' + str(AF) + '\n' | |||
outfile.write(outLine) | |||
@@ -1,5 +1,6 @@ | |||
{ | |||
"{{ project_name }}.LCL6normZip.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", | |||
"{{ project_name }}.LCL5extract_info.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/high_confidence_call_manuscript:v1.1", | |||
"{{ project_name }}.LCL7merge.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", | |||
"{{ project_name }}.fasta": "GRCh38.d1.vd1.fa", | |||
"{{ project_name }}.LCL6familyzipIndex.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", | |||
@@ -21,9 +22,11 @@ | |||
"{{ project_name }}.LCL8mergeVCF.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", | |||
"{{ project_name }}.LCL5votes.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/high_confidence_call_manuscript:v1.1", | |||
"{{ project_name }}.LCL7familyzipIndex.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", | |||
"{{ project_name }}.LCL8extract_info.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/high_confidence_call_manuscript:v1.1", | |||
"{{ project_name }}.LCL5familyzipIndex.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", | |||
"{{ project_name }}.LCL5normZip.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", | |||
"{{ project_name }}.LCL6VCFrename.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", | |||
"{{ project_name }}.LCL7extract_info.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/high_confidence_call_manuscript:v1.1", | |||
"{{ project_name }}.LCL5merge.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", | |||
"{{ project_name }}.LCL8votes.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/high_confidence_call_manuscript:v1.1", | |||
"{{ project_name }}.reformVCF.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/high_confidence_call:v1.1", | |||
@@ -32,6 +35,7 @@ | |||
"{{ project_name }}.LCL8familyzipIndex.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", | |||
"{{ project_name }}.LCL5bedAnnotation.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", | |||
"{{ project_name }}.LCL6mergeVCF.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", | |||
"{{ project_name }}.LCL6extract_info.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/high_confidence_call_manuscript:v1.1", | |||
"{{ project_name }}.LCL7variantsNorm.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/bcftools:v1.9", | |||
"{{ project_name }}.LCL7mergeVCF.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", | |||
"{{ project_name }}.LCL8merge.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest", |
@@ -9,7 +9,43 @@ task bed_annotation { | |||
command <<< | |||
rtg vcfannotate --bed-info=${repeat_bed} -i ${merged_vcf} -o ${sample}.normed.repeatAnno.vcf.gz | |||
rtg vcfannotate --bed-info=${repeat_bed} -i ${merged_vcf} -o ${sample}.normed.repeatAnno.vcf.gz | |||
## DP | |||
zcat ${sample}.normed.repeatAnno.vcf.gz | grep -v '##' | awk ' | |||
BEGIN { OFS = "\t" } | |||
NF > 2 && FNR > 1 { | |||
for ( i=9; i<=NF; i++ ) { | |||
split($i,a,":") ;$i = a[3]; | |||
} | |||
} | |||
{ print } | |||
' > ${sample}.depth.txt | |||
## GQ | |||
zcat ${sample}.normed.repeatAnno.vcf.gz | grep -v '##' | awk ' | |||
BEGIN { OFS = "\t" } | |||
NF > 2 && FNR > 1 { | |||
for ( i=9; i<=NF; i++ ) { | |||
split($i,a,":") ;$i = a[4]; | |||
} | |||
} | |||
{ print } | |||
' > ${sample}.genotypeQuality.txt | |||
## MQ | |||
zcat ${sample}.normed.repeatAnno.vcf.gz | grep -v '##' | awk ' | |||
BEGIN { OFS = "\t" } | |||
NF > 2 && FNR > 1 { | |||
for ( i=9; i<=NF; i++ ) { | |||
split($i,a,":") ;$i = a[6]; | |||
} | |||
} | |||
{ print } | |||
' > ${sample}.mappinyQuality.txt | |||
## Allele frequency | |||
>>> | |||
@@ -21,5 +57,6 @@ task bed_annotation { | |||
} | |||
output { | |||
File repeat_annotated_vcf = "${sample}.normed.repeatAnno.vcf.gz" | |||
File repeat_annotated_vcf_idx = "${sample}.normed.repeatAnno.vcf.gz.tbi" | |||
} | |||
} |
@@ -1,14 +1,13 @@ | |||
task extract_info { | |||
File vcf | |||
String vcf_name = basename(vcf,".vcf") | |||
File normed_vcf | |||
String sampleName | |||
String docker | |||
String cluster_config | |||
String disk_size | |||
command <<< | |||
python /opt/extract_vcf_information.py -i ${vcf} -o ${vcf_name}.txt | |||
cat ${vcf_name}.txt | cut -f23,25,27,22,12,21,3,18,4,8,11,15 > ${vcf_name}.essential.txt | |||
python /opt/vcf_mq_af.py -vcf ${normed_vcf} -prefix ${sampleName} | |||
>>> | |||
@@ -19,7 +18,6 @@ task extract_info { | |||
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/" | |||
} | |||
output { | |||
File vcf_info = "${vcf_name}.txt" | |||
File vcf_needed_info = "${vcf_name}.essential.txt" | |||
File vcf_info = "${sampleName}.variant_quality_location.vcf" | |||
} | |||
} |
@@ -1,4 +1,5 @@ | |||
import "./tasks/variantsNorm.wdl" as variantsNorm | |||
import "./tasks/extract_info.wdl" as extract_info | |||
import "./tasks/mendelian.wdl" as mendelian | |||
import "./tasks/zipIndex.wdl" as zipIndex | |||
import "./tasks/VCFrename.wdl" as VCFrename | |||
@@ -55,27 +56,55 @@ workflow {{ project_name }} { | |||
cluster_config=cluster_config, | |||
disk_size=disk_size | |||
} | |||
call extract_info.extract_info as LCL5extract_info { | |||
input: | |||
normed_vcf=LCL5variantsNorm.normed_vcf, | |||
sampleName=quartet[4], | |||
cluster_config=cluster_config, | |||
disk_size=disk_size | |||
} | |||
call extract_info.extract_info as LCL6extract_info { | |||
input: | |||
normed_vcf=LCL6variantsNorm.normed_vcf, | |||
sampleName=quartet[5], | |||
cluster_config=cluster_config, | |||
disk_size=disk_size | |||
} | |||
call extract_info.extract_info as LCL7extract_info { | |||
input: | |||
normed_vcf=LCL7variantsNorm.normed_vcf, | |||
sampleName=quartet[6], | |||
cluster_config=cluster_config, | |||
disk_size=disk_size | |||
} | |||
call extract_info.extract_info as LCL8extract_info { | |||
input: | |||
normed_vcf=LCL8variantsNorm.normed_vcf, | |||
sampleName=quartet[8], | |||
cluster_config=cluster_config, | |||
disk_size=disk_size | |||
} | |||
call zipIndex.zipIndex as LCL5normZip{ | |||
input: | |||
vcf=LCL5variantsNorm.normed_mq_vcf, | |||
vcf=LCL5extract_info.vcf_info, | |||
cluster_config=cluster_config, | |||
disk_size=disk_size | |||
} | |||
call zipIndex.zipIndex as LCL6normZip{ | |||
input: | |||
vcf=LCL6variantsNorm.normed_mq_vcf, | |||
vcf=LCL6extract_info.vcf_info, | |||
cluster_config=cluster_config, | |||
disk_size=disk_size | |||
} | |||
call zipIndex.zipIndex as LCL7normZip{ | |||
input: | |||
vcf=LCL7variantsNorm.normed_mq_vcf, | |||
vcf=LCL7extract_info.vcf_info, | |||
cluster_config=cluster_config, | |||
disk_size=disk_size | |||
} | |||
call zipIndex.zipIndex as LCL8normZip{ | |||
input: | |||
vcf=LCL8variantsNorm.normed_mq_vcf, | |||
vcf=LCL8extract_info.vcf_info, | |||
cluster_config=cluster_config, | |||
disk_size=disk_size | |||
} |