@@ -22,10 +22,29 @@ output_file = open(output_name,'w') | |||
# input files | |||
sister_dat = pd.read_table(sister_file) | |||
# Tallies for the sister comparison, all starting at zero:
# overall, INDEL-only and SNV-only.
sister_same = sister_diff = 0
indel_sister_same = indel_sister_diff = 0
snv_sister_same = snv_sister_diff = 0
for row in sister_dat.itertuples(): | |||
# snv indel
# Classify the site from allele lengths. ALT may hold several comma-separated
# alleles; the longest one decides. alt_max is always an int, so the length
# comparison never sees the list of alleles.
alt_max = max(len(allele) for allele in row.ALT.split(','))
alt = alt_max  # kept bound for any later code that still reads `alt`
ref = row.REF
# Only a single-base REF with single-base ALT allele(s) is an SNV. Every other
# length combination (REF longer, ALT longer, or equal multi-base lengths) was
# already labelled INDEL by each of the former branches.
if len(ref) == 1 and alt_max == 1:
    cate = 'SNV'
else:
    cate = 'INDEL'
# sister | |||
if row[5] == row[6]: | |||
if row[5] == './.': | |||
@@ -49,8 +68,23 @@ for row in sister_dat.itertuples(): | |||
sister_diff += 1 | |||
else: | |||
pass | |||
# Add this site to the SNV or INDEL sister tally; sites whose sister_count is
# neither 'yes_same' nor 'yes_diff' are not counted.
if sister_count == 'yes_same':
    if cate == 'SNV':
        snv_sister_same += 1
    elif cate == 'INDEL':
        indel_sister_same += 1
elif sister_count == 'yes_diff':
    if cate == 'SNV':
        snv_sister_diff += 1
    elif cate == 'INDEL':
        indel_sister_diff += 1
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero."""
    if denominator == 0:
        # Nothing was tallied for this category: report NaN instead of
        # aborting with ZeroDivisionError and losing the other ratios.
        return float('nan')
    return numerator / denominator


# Share of tallied sites counted as "same": overall, INDEL-only, SNV-only.
sister = _safe_ratio(sister_same, sister_same + sister_diff)
indel_sister = _safe_ratio(indel_sister_same, indel_sister_same + indel_sister_diff)
snv_sister = _safe_ratio(snv_sister_same, snv_sister_same + snv_sister_diff)
outcolumn = 'Project\tReproducibility_D5_D6\n' | |||
outResult = project_name + '\t' + str(sister) + '\n' | |||
output_file.write(outcolumn) |
@@ -1,36 +0,0 @@ | |||
from __future__ import division | |||
import pandas as pd | |||
import sys, argparse, os | |||
import fileinput | |||
# Command-line options; all four are mandatory strings.
parser = argparse.ArgumentParser(description="this script is to get filtered and benchmark vcf info")
for option, description in (
    ('filtered', 'filtered position'),
    ('benchmark', 'benchmark position'),
    ('vcf', 'one specific vcf'),
    ('filename', 'output file name'),
):
    parser.add_argument('-' + option, '--' + option, type=str, help=description, required=True)
args = parser.parse_args()
filtered, benchmark, vcf, filename = args.filtered, args.benchmark, args.vcf, args.filename

# Both outputs share the user-supplied prefix.
filtered_filename = '{}.filtered.txt'.format(filename)
benchmark_filename = '{}.benchmark.txt'.format(filename)

# The position tables are read without a header row; the VCF table is read
# with pandas' default header handling and must expose '#CHROM' and 'POS'.
filtered_dat = pd.read_table(filtered, header=None)
benchmark_dat = pd.read_table(benchmark, header=None)
vcf_dat = pd.read_table(vcf)


def _vcf_rows_at(positions):
    """Inner-join a position table (columns 0 and 1) with the VCF on #CHROM/POS."""
    return pd.merge(positions, vcf_dat, how='inner', left_on=[0, 1], right_on=['#CHROM', 'POS'])


# Build both joins before writing anything, then write them tab-separated
# without the index column.
filtered_merged_df = _vcf_rows_at(filtered_dat)
benchmark_merged_df = _vcf_rows_at(benchmark_dat)
for merged, out_path in (
    (filtered_merged_df, filtered_filename),
    (benchmark_merged_df, benchmark_filename),
):
    merged.to_csv(out_path, sep='\t', index=False)
@@ -39,12 +39,35 @@ merged_genotype_df = pd.merge(merged_df, genotype_dat, how='outer', left_on=['# | |||
merged_genotype_df_sub = merged_genotype_df.iloc[:,[0,1,23,24,29,30,31,32,7,17]] | |||
merged_genotype_df_sub.columns = ['CHROM', 'POS', 'REF', 'ALT','LCL5','LCL6','LCL7','LCL8', 'TRIO5', 'TRIO6'] | |||
# Tallies for the sister comparison and the family (Mendelian) check, all
# starting at zero: overall, INDEL-only and SNV-only.
sister_same = sister_diff = 0
family_all = family_mendelian = 0
indel_sister_same = indel_sister_diff = 0
indel_family_all = indel_family_mendelian = 0
snv_sister_same = snv_sister_diff = 0
snv_family_all = snv_family_mendelian = 0
for row in merged_genotype_df_sub.itertuples(): | |||
# indel or snv
# Classify the site from allele lengths. ALT may hold several comma-separated
# alleles; the longest one decides. alt_max is always an int, so the length
# comparison never sees the list of alleles.
alt_max = max(len(allele) for allele in row.ALT.split(','))
alt = alt_max  # kept bound for any later code that still reads `alt`
ref = row.REF
# Only a single-base REF with single-base ALT allele(s) is an SNV. Every other
# length combination (REF longer, ALT longer, or equal multi-base lengths) was
# already labelled INDEL by each of the former branches.
if len(ref) == 1 and alt_max == 1:
    cate = 'SNV'
else:
    cate = 'INDEL'
# sister | |||
if row.LCL5 == row.LCL6: | |||
if row.LCL5 == './.': | |||
@@ -61,13 +84,7 @@ for row in merged_genotype_df_sub.itertuples(): | |||
if (row.LCL5 == './.' or row.LCL5 == '0/0') and (row.LCL6 == './.' or row.LCL6 == '0/0'): | |||
sister_count = "no" | |||
else: | |||
sister_count = "yes_diff" | |||
if sister_count == 'yes_same': | |||
sister_same += 1 | |||
elif sister_count == 'yes_diff': | |||
sister_diff += 1 | |||
else: | |||
pass | |||
sister_count = "yes_diff" | |||
# family trio5 | |||
if row.LCL5 == row. LCL7 == row.LCL8 == './.': | |||
mendelian = mendelian + ':noInfo' | |||
@@ -91,25 +108,53 @@ for row in merged_genotype_df_sub.itertuples(): | |||
mendelian_count = "no" | |||
else: | |||
mendelian_count = "yes" | |||
outline = row.CHROM + '\t' + str(row.POS) + '\t' + row.REF + '\t' + row.ALT + '\t' + row.LCL5 + '\t' + row.LCL6 + '\t' + row.LCL7 + '\t' + row.LCL8 + '\t' + str(row.TRIO5) + '\t' + str(row.TRIO6) + '\t' + str(mendelian) + '\t' + str(mendelian_count) + '\t' + str(sister_count) + '\n' | |||
outline = row.CHROM + '\t' + str(row.POS) + '\t' + row.REF + '\t' + row.ALT + '\t' + cate + '\t' + row.LCL5 + '\t' + row.LCL6 + '\t' + row.LCL7 + '\t' + row.LCL8 + '\t' + str(row.TRIO5) + '\t' + str(row.TRIO6) + '\t' + str(mendelian) + '\t' + str(mendelian_count) + '\t' + str(sister_count) + '\n' | |||
family_file.write(outline) | |||
if mendelian_count == 'yes': | |||
family_all += 1 | |||
else: | |||
pass | |||
if mendelian == '1:1:1': | |||
family_mendelian += 1 | |||
elif mendelian == 'Ref:1:1': | |||
family_mendelian += 1 | |||
else: | |||
pass | |||
# Per-category tallies for this site: sister agreement, whether the site
# counts toward the family total, and whether its pattern is one of the two
# values counted as Mendelian.
counts_for_family = mendelian_count == 'yes'
is_mendelian = mendelian in ('1:1:1', 'Ref:1:1')
if cate == 'SNV':
    if sister_count == 'yes_same':
        snv_sister_same += 1
    elif sister_count == 'yes_diff':
        snv_sister_diff += 1
    if counts_for_family:
        snv_family_all += 1
    if is_mendelian:
        snv_family_mendelian += 1
elif cate == 'INDEL':
    if sister_count == 'yes_same':
        indel_sister_same += 1
    elif sister_count == 'yes_diff':
        indel_sister_diff += 1
    if counts_for_family:
        indel_family_all += 1
    if is_mendelian:
        indel_family_mendelian += 1
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero."""
    if denominator == 0:
        # Nothing was tallied for this category: report NaN instead of
        # aborting with ZeroDivisionError and losing the other ratios.
        return float('nan')
    return numerator / denominator


# NOTE(review): the overall tallies are derived from the SNV and INDEL ones.
# Each row is labelled exactly 'SNV' or 'INDEL' and added to one of the two
# per-category tallies, so the sums equal the overall counts and the overall
# ratios stay correct even when the loop only maintains the snv_/indel_
# counters. Confirm that no row bypasses the per-category tally.
sister_same = snv_sister_same + indel_sister_same
sister_diff = snv_sister_diff + indel_sister_diff
family_all = snv_family_all + indel_family_all
family_mendelian = snv_family_mendelian + indel_family_mendelian

# Sister "same" share and Mendelian share: overall, SNV-only, INDEL-only.
sister = _safe_ratio(sister_same, sister_same + sister_diff)
quartet = _safe_ratio(family_mendelian, family_all)
snv_sister = _safe_ratio(snv_sister_same, snv_sister_same + snv_sister_diff)
indel_sister = _safe_ratio(indel_sister_same, indel_sister_same + indel_sister_diff)
snv_quartet = _safe_ratio(snv_family_mendelian, snv_family_all)
indel_quartet = _safe_ratio(indel_family_mendelian, indel_family_all)

# Summary table: header row, then the overall, INDEL and SNV rows in that order.
outcolumn = 'Family\tReproducibility_D5_D6\tMendelian_Concordance_Quartet\n'
outResult = family + '\t' + str(sister) + '\t' + str(quartet) + '\n'
indel_outResult = family + '_INDEL' + '\t' + str(indel_sister) + '\t' + str(indel_quartet) + '\n'
snv_outResult = family + '_SNV' + '\t' + str(snv_sister) + '\t' + str(snv_quartet) + '\n'
summary_file.write(outcolumn)
summary_file.write(outResult)
summary_file.write(indel_outResult)
summary_file.write(snv_outResult)
@@ -0,0 +1,23 @@ | |||
{ | |||
"benchmarking_dir": "oss://pgx-result/renluyao/manuscript/benchmark_calls_v3.0/", | |||
"SENTIEON_INSTALL_DIR": "/opt/sentieon-genomics", | |||
"fasta": "GRCh38.d1.vd1.fa", | |||
"BENCHMARKdocker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-hap:latest", | |||
"dbsnp_dir": "oss://pgx-reference-data/GRCh38.d1.vd1/", | |||
"disk_size": "500", | |||
"FASTQCdocker": "registry.cn-shanghai.aliyuncs.com/pgx-docker-registry/fastqc:v0.11.5", | |||
"SMALLcluster_config": "OnDemand bcs.ps.g.xlarge img-ubuntu-vpc", | |||
"screen_ref_dir": "oss://pgx-reference-data/fastq_screen_reference/", | |||
"dbmills_dir": "oss://pgx-reference-data/GRCh38.d1.vd1/", | |||
"BIGcluster_config": "OnDemand bcs.a2.7xlarge img-ubuntu-vpc", | |||
"fastq_screen_conf": "oss://pgx-reference-data/fastq_screen_reference/fastq_screen.conf", | |||
"MULTIQCdocker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/multiqc:v1.8", | |||
"FASTQSCREENdocker": "registry.cn-shanghai.aliyuncs.com/pgx-docker-registry/fastqscreen:0.12.0", | |||
"SENTIEONdocker": "registry.cn-shanghai.aliyuncs.com/pgx-docker-registry/sentieon-genomics:v2018.08.01", | |||
"QUALIMAPdocker": "registry.cn-shanghai.aliyuncs.com/pgx-docker-registry/qualimap:2.0.0", | |||
"db_mills": "Mills_and_1000G_gold_standard.indels.hg38.vcf", | |||
"dbsnp": "dbsnp_146.hg38.vcf", | |||
"MENDELIANdocker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/vbt:v1.1", | |||
"DIYdocker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/high_confidence_call_manuscript:v1.4", | |||
"ref_dir": "oss://pgx-reference-data/GRCh38.d1.vd1/" | |||
} |
@@ -1,25 +1,25 @@ | |||
{ | |||
"{{ project_name }}.benchmarking_dir": "oss://pgx-result/renluyao/manuscript/benchmark_calls_v3.0/", | |||
"{{ project_name }}.SENTIEON_INSTALL_DIR": "/opt/sentieon-genomics", | |||
"{{ project_name }}.fasta": "GRCh38.d1.vd1.fa", | |||
"{{ project_name }}.BENCHMARKdocker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-hap:latest", | |||
"{{ project_name }}.dbsnp_dir": "oss://pgx-reference-data/GRCh38.d1.vd1/", | |||
"{{ project_name }}.disk_size": "500", | |||
"{{ project_name }}.benchmarking_dir": "{{ benchmarking_dir }}", | |||
"{{ project_name }}.SENTIEON_INSTALL_DIR": "{{ SENTIEON_INSTALL_DIR }}", | |||
"{{ project_name }}.fasta": "{{ fasta }}", | |||
"{{ project_name }}.BENCHMARKdocker": "{{ BENCHMARKdocker }}", | |||
"{{ project_name }}.dbsnp_dir": "{{ dbsnp_dir }}", | |||
"{{ project_name }}.disk_size": "{{ disk_size }}", | |||
"{{ project_name }}.inputSamplesFile": "{{ inputSamplesFile }}", | |||
"{{ project_name }}.FASTQCdocker": "registry.cn-shanghai.aliyuncs.com/pgx-docker-registry/fastqc:v0.11.5", | |||
"{{ project_name }}.FASTQCdocker": "{{ FASTQCdocker }}", | |||
"{{ project_name }}.MULTIQCdocker": "{{ MULTIQCdocker }}", | |||
"{{ project_name }}.project": "{{ project }}", | |||
"{{ project_name }}.SMALLcluster_config": "OnDemand bcs.ps.g.xlarge img-ubuntu-vpc", | |||
"{{ project_name }}.screen_ref_dir": "oss://pgx-reference-data/fastq_screen_reference/", | |||
"{{ project_name }}.dbmills_dir": "oss://pgx-reference-data/GRCh38.d1.vd1/", | |||
"{{ project_name }}.BIGcluster_config": "OnDemand bcs.a2.7xlarge img-ubuntu-vpc", | |||
"{{ project_name }}.fastq_screen_conf": "oss://pgx-reference-data/fastq_screen_reference/fastq_screen.conf", | |||
"{{ project_name }}.multiqc.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/multiqc:v1.8", | |||
"{{ project_name }}.FASTQSCREENdocker": "registry.cn-shanghai.aliyuncs.com/pgx-docker-registry/fastqscreen:0.12.0", | |||
"{{ project_name }}.SENTIEONdocker": "registry.cn-shanghai.aliyuncs.com/pgx-docker-registry/sentieon-genomics:v2018.08.01", | |||
"{{ project_name }}.QUALIMAPdocker": "registry.cn-shanghai.aliyuncs.com/pgx-docker-registry/qualimap:2.0.0", | |||
"{{ project_name }}.db_mills": "Mills_and_1000G_gold_standard.indels.hg38.vcf", | |||
"{{ project_name }}.dbsnp": "dbsnp_146.hg38.vcf", | |||
"{{ project_name }}.MENDELIANdocker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/vbt:v1.1", | |||
"{{ project_name }}.DIYdocker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/high_confidence_call_manuscript:v1.4", | |||
"{{ project_name }}.ref_dir": "oss://pgx-reference-data/GRCh38.d1.vd1/" | |||
"{{ project_name }}.SMALLcluster_config": "{{ SMALLcluster_config }}", | |||
"{{ project_name }}.screen_ref_dir": "{{ screen_ref_dir }}", | |||
"{{ project_name }}.dbmills_dir": "{{ dbmills_dir }}", | |||
"{{ project_name }}.BIGcluster_config": "{{ BIGcluster_config }}", | |||
"{{ project_name }}.fastq_screen_conf": "{{ fastq_screen_conf }}", | |||
"{{ project_name }}.FASTQSCREENdocker": "{{ FASTQSCREENdocker }}", | |||
"{{ project_name }}.SENTIEONdocker": "{{ SENTIEONdocker }}", | |||
"{{ project_name }}.QUALIMAPdocker": "{{ QUALIMAPdocker }}", | |||
"{{ project_name }}.db_mills": "{{ db_mills }}", | |||
"{{ project_name }}.dbsnp": "{{ dbsnp }}", | |||
"{{ project_name }}.MENDELIANdocker": "{{ MENDELIANdocker }}", | |||
"{{ project_name }}.DIYdocker": "{{ DIYdocker }}", | |||
"{{ project_name }}.ref_dir": "{{ ref_dir }}" | |||
} |
@@ -18,7 +18,19 @@ task benchmark { | |||
export HGREF=/cromwell_root/tmp/reference_data/GRCh38.d1.vd1.fa | |||
# Header lines are those starting with "#". The pattern is anchored because a
# bare "#" would also match data lines that merely contain the character and
# move them from the body into the header.
grep '^#' ${vcf} > header
# Drop records containing a 0/0 or ./. genotype, then cut FORMAT and every
# sample column (field 9 onwards) down to the first ":"-separated sub-field.
grep -v '^#' ${vcf} | grep -v '0/0' | grep -v '\./\.' | awk '
BEGIN { OFS = "\t" }
{
for ( i=9; i<=NF; i++ ) {
split($i,a,":") ;$i = a[1];
}
}
{ print }
' > body
cat header body > filtered.vcf
# Only the filtered VCF is compressed. The former extra step compressed the
# unfiltered input to this same path and was overwritten right here.
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg bgzip filtered.vcf -c > ${sample}.rtg.vcf.gz
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg index -f vcf ${sample}.rtg.vcf.gz | |||
if [[ ${sample} =~ "LCL5" ]];then |
@@ -10,7 +10,7 @@ task qualimap { | |||
set -o pipefail | |||
set -e | |||
nt=$(nproc) | |||
/opt/qualimap/qualimap bamqc -bam ${bam} -outformat PDF:HTML -nt $nt -outdir ${bamname} --java-mem-size=32G | |||
/opt/qualimap/qualimap bamqc -bam ${bam} -outformat PDF:HTML -nt $nt -outdir ${bamname} --java-mem-size=60G | |||
tar -zcvf ${bamname}_qualimap.zip ${bamname} | |||
>>> | |||
@@ -8,7 +8,7 @@ task quartet_mendelian { | |||
command <<< | |||
for i in ${sep=" " project_mendelian_summary} | |||
do | |||
cat $i | sed -n '2,2p' >> mendelian.summary | |||
cat $i | sed -n '2,3p' >> mendelian.summary | |||
done | |||
sed '1i\Family\tReproducibility_D5_D6\tMendelian_Concordance_Quartet' mendelian.summary > ${project}.mendelian.txt | |||
@@ -33,6 +33,7 @@ workflow {{ project_name }} { | |||
String BENCHMARKdocker | |||
String MENDELIANdocker | |||
String DIYdocker | |||
String MULTIQCdocker | |||
String fasta | |||
File ref_dir | |||
@@ -225,6 +226,7 @@ workflow {{ project_name }} { | |||
txt2=fastqscreen.txt2, | |||
zip=qualimap.zip, | |||
summary=benchmark.summary, | |||
docker=MULTIQCdocker, | |||
cluster_config=SMALLcluster_config, | |||
disk_size=disk_size | |||
} |