# input files | # input files | ||||
sister_dat = pd.read_table(sister_file) | sister_dat = pd.read_table(sister_file) | ||||
sister_same = 0 | |||||
sister_diff = 0 | |||||
indel_sister_same = 0 | |||||
indel_sister_diff = 0 | |||||
snv_sister_same = 0 | |||||
snv_sister_diff = 0 | |||||
for row in sister_dat.itertuples(): | for row in sister_dat.itertuples(): | ||||
# snv indel | |||||
if ',' in row.ALT: | |||||
alt = row.ALT.split(',') | |||||
alt_len = [len(i) for i in alt] | |||||
alt_max = max(alt_len) | |||||
else: | |||||
alt_max = len(row.ALT) | |||||
alt = alt_max | |||||
ref = row.REF | |||||
if len(ref) == 1 and alt == 1: | |||||
cate = 'SNV' | |||||
elif len(ref) > alt: | |||||
cate = 'INDEL' | |||||
elif alt > len(ref): | |||||
cate = 'INDEL' | |||||
elif len(ref) == alt: | |||||
cate = 'INDEL' | |||||
# sister | # sister | ||||
if row[5] == row[6]: | if row[5] == row[6]: | ||||
if row[5] == './.': | if row[5] == './.': | ||||
sister_diff += 1 | sister_diff += 1 | ||||
else: | else: | ||||
pass | pass | ||||
if cate == 'SNV': | |||||
if sister_count == 'yes_same': | |||||
snv_sister_same += 1 | |||||
elif sister_count == 'yes_diff': | |||||
snv_sister_diff += 1 | |||||
else: | |||||
pass | |||||
elif cate == 'INDEL': | |||||
if sister_count == 'yes_same': | |||||
indel_sister_same += 1 | |||||
elif sister_count == 'yes_diff': | |||||
indel_sister_diff += 1 | |||||
else: | |||||
pass | |||||
sister = sister_same/(sister_same + sister_diff) | |||||
indel_sister = indel_sister_same/(indel_sister_same + indel_sister_diff) | |||||
snv_sister = snv_sister_same/(snv_sister_same + snv_sister_diff) | |||||
outcolumn = 'Project\tReproducibility_D5_D6\n' | outcolumn = 'Project\tReproducibility_D5_D6\n' | ||||
outResult = project_name + '\t' + str(sister) + '\n' | outResult = project_name + '\t' + str(sister) + '\n' | ||||
output_file.write(outcolumn) | output_file.write(outcolumn) |
from __future__ import division | |||||
import pandas as pd | |||||
import sys, argparse, os | |||||
import fileinput | |||||
# input arguments | |||||
parser = argparse.ArgumentParser(description="this script is to get filtered and benchmark vcf info") | |||||
parser.add_argument('-filtered', '--filtered', type=str, help='filtered position', required=True) | |||||
parser.add_argument('-benchmark', '--benchmark', type=str, help='benchmark position', required=True) | |||||
parser.add_argument('-vcf', '--vcf', type=str, help='one specific vcf', required=True) | |||||
parser.add_argument('-filename', '--filename', type=str, help='output file name', required=True) | |||||
args = parser.parse_args() | |||||
filtered = args.filtered | |||||
benchmark = args.benchmark | |||||
vcf = args.vcf | |||||
filename = args.filename | |||||
# output file | |||||
filtered_filename = filename + '.filtered.txt' | |||||
benchmark_filename = filename + '.benchmark.txt' | |||||
# input files | |||||
filtered_dat = pd.read_table(filtered,header=None) | |||||
benchmark_dat = pd.read_table(benchmark,header=None) | |||||
vcf_dat = pd.read_table(vcf) | |||||
filtered_merged_df = pd.merge(filtered_dat, vcf_dat, how='inner',left_on=[0,1], right_on = ['#CHROM','POS']) | |||||
benchmark_merged_df = pd.merge(benchmark_dat,vcf_dat, how='inner',left_on=[0,1], right_on = ['#CHROM','POS']) | |||||
filtered_merged_df.to_csv(filtered_filename,sep='\t',index=False) | |||||
benchmark_merged_df.to_csv(benchmark_filename,sep='\t',index=False) |
merged_genotype_df_sub = merged_genotype_df.iloc[:,[0,1,23,24,29,30,31,32,7,17]] | merged_genotype_df_sub = merged_genotype_df.iloc[:,[0,1,23,24,29,30,31,32,7,17]] | ||||
merged_genotype_df_sub.columns = ['CHROM', 'POS', 'REF', 'ALT','LCL5','LCL6','LCL7','LCL8', 'TRIO5', 'TRIO6'] | merged_genotype_df_sub.columns = ['CHROM', 'POS', 'REF', 'ALT','LCL5','LCL6','LCL7','LCL8', 'TRIO5', 'TRIO6'] | ||||
sister_same = 0 | |||||
sister_diff = 0 | |||||
family_all = 0 | |||||
family_mendelian = 0 | |||||
indel_sister_same = 0 | |||||
indel_sister_diff = 0 | |||||
indel_family_all = 0 | |||||
indel_family_mendelian = 0 | |||||
snv_sister_same = 0 | |||||
snv_sister_diff = 0 | |||||
snv_family_all = 0 | |||||
snv_family_mendelian = 0 | |||||
for row in merged_genotype_df_sub.itertuples(): | for row in merged_genotype_df_sub.itertuples(): | ||||
# indel or snv | |||||
if ',' in row.ALT: | |||||
alt = row.ALT.split(',') | |||||
alt_len = [len(i) for i in alt] | |||||
alt_max = max(alt_len) | |||||
else: | |||||
alt_max = len(row.ALT) | |||||
alt = alt_max | |||||
ref = row.REF | |||||
if len(ref) == 1 and alt == 1: | |||||
cate = 'SNV' | |||||
elif len(ref) > alt: | |||||
cate = 'INDEL' | |||||
elif alt > len(ref): | |||||
cate = 'INDEL' | |||||
elif len(ref) == alt: | |||||
cate = 'INDEL' | |||||
# sister | # sister | ||||
if row.LCL5 == row.LCL6: | if row.LCL5 == row.LCL6: | ||||
if row.LCL5 == './.': | if row.LCL5 == './.': | ||||
if (row.LCL5 == './.' or row.LCL5 == '0/0') and (row.LCL6 == './.' or row.LCL6 == '0/0'): | if (row.LCL5 == './.' or row.LCL5 == '0/0') and (row.LCL6 == './.' or row.LCL6 == '0/0'): | ||||
sister_count = "no" | sister_count = "no" | ||||
else: | else: | ||||
sister_count = "yes_diff" | |||||
if sister_count == 'yes_same': | |||||
sister_same += 1 | |||||
elif sister_count == 'yes_diff': | |||||
sister_diff += 1 | |||||
else: | |||||
pass | |||||
sister_count = "yes_diff" | |||||
# family trio5 | # family trio5 | ||||
if row.LCL5 == row. LCL7 == row.LCL8 == './.': | if row.LCL5 == row. LCL7 == row.LCL8 == './.': | ||||
mendelian = mendelian + ':noInfo' | mendelian = mendelian + ':noInfo' | ||||
mendelian_count = "no" | mendelian_count = "no" | ||||
else: | else: | ||||
mendelian_count = "yes" | mendelian_count = "yes" | ||||
outline = row.CHROM + '\t' + str(row.POS) + '\t' + row.REF + '\t' + row.ALT + '\t' + row.LCL5 + '\t' + row.LCL6 + '\t' + row.LCL7 + '\t' + row.LCL8 + '\t' + str(row.TRIO5) + '\t' + str(row.TRIO6) + '\t' + str(mendelian) + '\t' + str(mendelian_count) + '\t' + str(sister_count) + '\n' | |||||
outline = row.CHROM + '\t' + str(row.POS) + '\t' + row.REF + '\t' + row.ALT + '\t' + cate + '\t' + row.LCL5 + '\t' + row.LCL6 + '\t' + row.LCL7 + '\t' + row.LCL8 + '\t' + str(row.TRIO5) + '\t' + str(row.TRIO6) + '\t' + str(mendelian) + '\t' + str(mendelian_count) + '\t' + str(sister_count) + '\n' | |||||
family_file.write(outline) | family_file.write(outline) | ||||
if mendelian_count == 'yes': | |||||
family_all += 1 | |||||
else: | |||||
pass | |||||
if mendelian == '1:1:1': | |||||
family_mendelian += 1 | |||||
elif mendelian == 'Ref:1:1': | |||||
family_mendelian += 1 | |||||
else: | |||||
pass | |||||
if cate == 'SNV': | |||||
if sister_count == 'yes_same': | |||||
snv_sister_same += 1 | |||||
elif sister_count == 'yes_diff': | |||||
snv_sister_diff += 1 | |||||
else: | |||||
pass | |||||
if mendelian_count == 'yes': | |||||
snv_family_all += 1 | |||||
else: | |||||
pass | |||||
if mendelian == '1:1:1': | |||||
snv_family_mendelian += 1 | |||||
elif mendelian == 'Ref:1:1': | |||||
snv_family_mendelian += 1 | |||||
else: | |||||
pass | |||||
elif cate == 'INDEL': | |||||
if sister_count == 'yes_same': | |||||
indel_sister_same += 1 | |||||
elif sister_count == 'yes_diff': | |||||
indel_sister_diff += 1 | |||||
else: | |||||
pass | |||||
if mendelian_count == 'yes': | |||||
indel_family_all += 1 | |||||
else: | |||||
pass | |||||
if mendelian == '1:1:1': | |||||
indel_family_mendelian += 1 | |||||
elif mendelian == 'Ref:1:1': | |||||
indel_family_mendelian += 1 | |||||
else: | |||||
pass | |||||
sister = sister_same/(sister_same + sister_diff) | |||||
quartet = family_mendelian/family_all | |||||
snv_sister = snv_sister_same/(snv_sister_same + snv_sister_diff) | |||||
indel_sister = indel_sister_same/(indel_sister_same + indel_sister_diff) | |||||
snv_quartet = snv_family_mendelian/snv_family_all | |||||
indel_quartet = indel_family_mendelian/indel_family_all | |||||
outcolumn = 'Family\tReproducibility_D5_D6\tMendelian_Concordance_Quartet\n' | outcolumn = 'Family\tReproducibility_D5_D6\tMendelian_Concordance_Quartet\n' | ||||
outResult = family + '\t' + str(sister) + '\t' + str(quartet) + '\n' | |||||
indel_outResult = family + '_INDEL' + '\t' + str(indel_sister) + '\t' + str(indel_quartet) + '\n' | |||||
snv_outResult = family + '_SNV' + '\t' + str(snv_sister) + '\t' + str(snv_quartet) + '\n' | |||||
summary_file.write(outcolumn) | summary_file.write(outcolumn) | ||||
summary_file.write(outResult) | |||||
summary_file.write(indel_outResult) | |||||
summary_file.write(snv_outResult) | |||||
{ | |||||
"benchmarking_dir": "oss://pgx-result/renluyao/manuscript/benchmark_calls_v3.0/", | |||||
"SENTIEON_INSTALL_DIR": "/opt/sentieon-genomics", | |||||
"fasta": "GRCh38.d1.vd1.fa", | |||||
"BENCHMARKdocker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-hap:latest", | |||||
"dbsnp_dir": "oss://pgx-reference-data/GRCh38.d1.vd1/", | |||||
"disk_size": "500", | |||||
"FASTQCdocker": "registry.cn-shanghai.aliyuncs.com/pgx-docker-registry/fastqc:v0.11.5", | |||||
"SMALLcluster_config": "OnDemand bcs.ps.g.xlarge img-ubuntu-vpc", | |||||
"screen_ref_dir": "oss://pgx-reference-data/fastq_screen_reference/", | |||||
"dbmills_dir": "oss://pgx-reference-data/GRCh38.d1.vd1/", | |||||
"BIGcluster_config": "OnDemand bcs.a2.7xlarge img-ubuntu-vpc", | |||||
"fastq_screen_conf": "oss://pgx-reference-data/fastq_screen_reference/fastq_screen.conf", | |||||
"MULTIQCdocker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/multiqc:v1.8", | |||||
"FASTQSCREENdocker": "registry.cn-shanghai.aliyuncs.com/pgx-docker-registry/fastqscreen:0.12.0", | |||||
"SENTIEONdocker": "registry.cn-shanghai.aliyuncs.com/pgx-docker-registry/sentieon-genomics:v2018.08.01", | |||||
"QUALIMAPdocker": "registry.cn-shanghai.aliyuncs.com/pgx-docker-registry/qualimap:2.0.0", | |||||
"db_mills": "Mills_and_1000G_gold_standard.indels.hg38.vcf", | |||||
"dbsnp": "dbsnp_146.hg38.vcf", | |||||
"MENDELIANdocker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/vbt:v1.1", | |||||
"DIYdocker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/high_confidence_call_manuscript:v1.4", | |||||
"ref_dir": "oss://pgx-reference-data/GRCh38.d1.vd1/" | |||||
} |
{ | { | ||||
"{{ project_name }}.benchmarking_dir": "oss://pgx-result/renluyao/manuscript/benchmark_calls_v3.0/", | |||||
"{{ project_name }}.SENTIEON_INSTALL_DIR": "/opt/sentieon-genomics", | |||||
"{{ project_name }}.fasta": "GRCh38.d1.vd1.fa", | |||||
"{{ project_name }}.BENCHMARKdocker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-hap:latest", | |||||
"{{ project_name }}.dbsnp_dir": "oss://pgx-reference-data/GRCh38.d1.vd1/", | |||||
"{{ project_name }}.disk_size": "500", | |||||
"{{ project_name }}.benchmarking_dir": "{{ benchmarking_dir }}", | |||||
"{{ project_name }}.SENTIEON_INSTALL_DIR": "{{ SENTIEON_INSTALL_DIR }}", | |||||
"{{ project_name }}.fasta": "{{ fasta }}", | |||||
"{{ project_name }}.BENCHMARKdocker": "{{ BENCHMARKdocker }}", | |||||
"{{ project_name }}.dbsnp_dir": "{{ dbsnp_dir }}", | |||||
"{{ project_name }}.disk_size": "{{ disk_size }}", | |||||
"{{ project_name }}.inputSamplesFile": "{{ inputSamplesFile }}", | "{{ project_name }}.inputSamplesFile": "{{ inputSamplesFile }}", | ||||
"{{ project_name }}.FASTQCdocker": "registry.cn-shanghai.aliyuncs.com/pgx-docker-registry/fastqc:v0.11.5", | |||||
"{{ project_name }}.FASTQCdocker": "{{ FASTQCdocker }}", | |||||
"{{ project_name }}.MULTIQCdocker": "{{ MULTIQCdocker }}", | |||||
"{{ project_name }}.project": "{{ project }}", | "{{ project_name }}.project": "{{ project }}", | ||||
"{{ project_name }}.SMALLcluster_config": "OnDemand bcs.ps.g.xlarge img-ubuntu-vpc", | |||||
"{{ project_name }}.screen_ref_dir": "oss://pgx-reference-data/fastq_screen_reference/", | |||||
"{{ project_name }}.dbmills_dir": "oss://pgx-reference-data/GRCh38.d1.vd1/", | |||||
"{{ project_name }}.BIGcluster_config": "OnDemand bcs.a2.7xlarge img-ubuntu-vpc", | |||||
"{{ project_name }}.fastq_screen_conf": "oss://pgx-reference-data/fastq_screen_reference/fastq_screen.conf", | |||||
"{{ project_name }}.multiqc.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/multiqc:v1.8", | |||||
"{{ project_name }}.FASTQSCREENdocker": "registry.cn-shanghai.aliyuncs.com/pgx-docker-registry/fastqscreen:0.12.0", | |||||
"{{ project_name }}.SENTIEONdocker": "registry.cn-shanghai.aliyuncs.com/pgx-docker-registry/sentieon-genomics:v2018.08.01", | |||||
"{{ project_name }}.QUALIMAPdocker": "registry.cn-shanghai.aliyuncs.com/pgx-docker-registry/qualimap:2.0.0", | |||||
"{{ project_name }}.db_mills": "Mills_and_1000G_gold_standard.indels.hg38.vcf", | |||||
"{{ project_name }}.dbsnp": "dbsnp_146.hg38.vcf", | |||||
"{{ project_name }}.MENDELIANdocker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/vbt:v1.1", | |||||
"{{ project_name }}.DIYdocker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/high_confidence_call_manuscript:v1.4", | |||||
"{{ project_name }}.ref_dir": "oss://pgx-reference-data/GRCh38.d1.vd1/" | |||||
"{{ project_name }}.SMALLcluster_config": "{{ SMALLcluster_config }}", | |||||
"{{ project_name }}.screen_ref_dir": "{{ screen_ref_dir }}", | |||||
"{{ project_name }}.dbmills_dir": "{{ dbmills_dir }}", | |||||
"{{ project_name }}.BIGcluster_config": "{{ BIGcluster_config }}", | |||||
"{{ project_name }}.fastq_screen_conf": "{{ fastq_screen_conf }}", | |||||
"{{ project_name }}.FASTQSCREENdocker": "{{ FASTQSCREENdocker }}", | |||||
"{{ project_name }}.SENTIEONdocker": "{{ SENTIEONdocker }}", | |||||
"{{ project_name }}.QUALIMAPdocker": "{{ QUALIMAPdocker }}", | |||||
"{{ project_name }}.db_mills": "{{ db_mills }}", | |||||
"{{ project_name }}.dbsnp": "{{ dbsnp }}", | |||||
"{{ project_name }}.MENDELIANdocker": "{{ MENDELIANdocker }}", | |||||
"{{ project_name }}.DIYdocker": "{{ DIYdocker }}", | |||||
"{{ project_name }}.ref_dir": "{{ ref_dir }}" | |||||
} | } |
export HGREF=/cromwell_root/tmp/reference_data/GRCh38.d1.vd1.fa | export HGREF=/cromwell_root/tmp/reference_data/GRCh38.d1.vd1.fa | ||||
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg bgzip ${vcf} -c > ${sample}.rtg.vcf.gz | |||||
cat ${vcf} | grep '#' > header | |||||
cat ${vcf} | grep -v '#' | grep -v '0/0' | grep -v '\./\.'| awk ' | |||||
BEGIN { OFS = "\t" } | |||||
{ | |||||
for ( i=9; i<=NF; i++ ) { | |||||
split($i,a,":") ;$i = a[1]; | |||||
} | |||||
} | |||||
{ print } | |||||
' > body | |||||
cat header body > filtered.vcf | |||||
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg bgzip filtered.vcf -c > ${sample}.rtg.vcf.gz | |||||
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg index -f vcf ${sample}.rtg.vcf.gz | /opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg index -f vcf ${sample}.rtg.vcf.gz | ||||
if [[ ${sample} =~ "LCL5" ]];then | if [[ ${sample} =~ "LCL5" ]];then |
set -o pipefail | set -o pipefail | ||||
set -e | set -e | ||||
nt=$(nproc) | nt=$(nproc) | ||||
/opt/qualimap/qualimap bamqc -bam ${bam} -outformat PDF:HTML -nt $nt -outdir ${bamname} --java-mem-size=32G | |||||
/opt/qualimap/qualimap bamqc -bam ${bam} -outformat PDF:HTML -nt $nt -outdir ${bamname} --java-mem-size=60G | |||||
tar -zcvf ${bamname}_qualimap.zip ${bamname} | tar -zcvf ${bamname}_qualimap.zip ${bamname} | ||||
>>> | >>> | ||||
command <<< | command <<< | ||||
for i in ${sep=" " project_mendelian_summary} | for i in ${sep=" " project_mendelian_summary} | ||||
do | do | ||||
cat $i | sed -n '2,2p' >> mendelian.summary | |||||
cat $i | sed -n '2,3p' >> mendelian.summary | |||||
done | done | ||||
sed '1i\Family\tReproducibility_D5_D6\tMendelian_Concordance_Quartet' mendelian.summary > ${project}.mendelian.txt | sed '1i\Family\tReproducibility_D5_D6\tMendelian_Concordance_Quartet' mendelian.summary > ${project}.mendelian.txt | ||||
String BENCHMARKdocker | String BENCHMARKdocker | ||||
String MENDELIANdocker | String MENDELIANdocker | ||||
String DIYdocker | String DIYdocker | ||||
String MULTIQCdocker | |||||
String fasta | String fasta | ||||
File ref_dir | File ref_dir | ||||
txt2=fastqscreen.txt2, | txt2=fastqscreen.txt2, | ||||
zip=qualimap.zip, | zip=qualimap.zip, | ||||
summary=benchmark.summary, | summary=benchmark.summary, | ||||
docker=MULTIQCdocker, | |||||
cluster_config=SMALLcluster_config, | cluster_config=SMALLcluster_config, | ||||
disk_size=disk_size | disk_size=disk_size | ||||
} | } |