Bläddra i källkod

default

master
LUYAO REN 4 år sedan
förälder
incheckning
a4d4e774f3
9 ändrade filer med 169 tillägg och 89 borttagningar
  1. +37
    -3
      codescripts/D5_D6.py
  2. +0
    -36
      codescripts/get_filtered_benchmark_vcfinfo.py
  3. +71
    -26
      codescripts/merge_two_family_with_genotype.py
  4. +23
    -0
      defaults
  5. +21
    -21
      inputs
  6. +13
    -1
      tasks/benchmark.wdl
  7. +1
    -1
      tasks/qualimap.wdl
  8. +1
    -1
      tasks/quartet_mendelian.wdl
  9. +2
    -0
      workflow.wdl

+ 37
- 3
codescripts/D5_D6.py Visa fil

@@ -22,10 +22,29 @@ output_file = open(output_name,'w')
# input files
sister_dat = pd.read_table(sister_file)

sister_same = 0
sister_diff = 0
indel_sister_same = 0
indel_sister_diff = 0
snv_sister_same = 0
snv_sister_diff = 0

for row in sister_dat.itertuples():
# snv indel
if ',' in row.ALT:
alt = row.ALT.split(',')
alt_len = [len(i) for i in alt]
alt_max = max(alt_len)
else:
alt_max = len(row.ALT)
alt = alt_max
ref = row.REF
if len(ref) == 1 and alt == 1:
cate = 'SNV'
elif len(ref) > alt:
cate = 'INDEL'
elif alt > len(ref):
cate = 'INDEL'
elif len(ref) == alt:
cate = 'INDEL'
# sister
if row[5] == row[6]:
if row[5] == './.':
@@ -49,8 +68,23 @@ for row in sister_dat.itertuples():
sister_diff += 1
else:
pass
if cate == 'SNV':
if sister_count == 'yes_same':
snv_sister_same += 1
elif sister_count == 'yes_diff':
snv_sister_diff += 1
else:
pass
elif cate == 'INDEL':
if sister_count == 'yes_same':
indel_sister_same += 1
elif sister_count == 'yes_diff':
indel_sister_diff += 1
else:
pass

sister = sister_same/(sister_same + sister_diff)
indel_sister = indel_sister_same/(indel_sister_same + indel_sister_diff)
snv_sister = snv_sister_same/(snv_sister_same + snv_sister_diff)
outcolumn = 'Project\tReproducibility_D5_D6\n'
outResult = project_name + '\t' + str(sister) + '\n'
output_file.write(outcolumn)

+ 0
- 36
codescripts/get_filtered_benchmark_vcfinfo.py Visa fil

@@ -1,36 +0,0 @@
from __future__ import division
import pandas as pd
import sys, argparse, os
import fileinput


# input arguments
parser = argparse.ArgumentParser(description="this script is to get filtered and benchmark vcf info")

parser.add_argument('-filtered', '--filtered', type=str, help='filtered position', required=True)
parser.add_argument('-benchmark', '--benchmark', type=str, help='benchmark position', required=True)
parser.add_argument('-vcf', '--vcf', type=str, help='one specific vcf', required=True)
parser.add_argument('-filename', '--filename', type=str, help='output file name', required=True)


args = parser.parse_args()
filtered = args.filtered
benchmark = args.benchmark
vcf = args.vcf
filename = args.filename


# output file
filtered_filename = filename + '.filtered.txt'
benchmark_filename = filename + '.benchmark.txt'

# input files
filtered_dat = pd.read_table(filtered,header=None)
benchmark_dat = pd.read_table(benchmark,header=None)
vcf_dat = pd.read_table(vcf)

filtered_merged_df = pd.merge(filtered_dat, vcf_dat, how='inner',left_on=[0,1], right_on = ['#CHROM','POS'])
benchmark_merged_df = pd.merge(benchmark_dat,vcf_dat, how='inner',left_on=[0,1], right_on = ['#CHROM','POS'])

filtered_merged_df.to_csv(filtered_filename,sep='\t',index=False)
benchmark_merged_df.to_csv(benchmark_filename,sep='\t',index=False)

+ 71
- 26
codescripts/merge_two_family_with_genotype.py Visa fil

@@ -39,12 +39,35 @@ merged_genotype_df = pd.merge(merged_df, genotype_dat, how='outer', left_on=['#
merged_genotype_df_sub = merged_genotype_df.iloc[:,[0,1,23,24,29,30,31,32,7,17]]
merged_genotype_df_sub.columns = ['CHROM', 'POS', 'REF', 'ALT','LCL5','LCL6','LCL7','LCL8', 'TRIO5', 'TRIO6']

sister_same = 0
sister_diff = 0
family_all = 0
family_mendelian = 0
indel_sister_same = 0
indel_sister_diff = 0
indel_family_all = 0
indel_family_mendelian = 0

snv_sister_same = 0
snv_sister_diff = 0
snv_family_all = 0
snv_family_mendelian = 0


for row in merged_genotype_df_sub.itertuples():
# indel or snv
if ',' in row.ALT:
alt = row.ALT.split(',')
alt_len = [len(i) for i in alt]
alt_max = max(alt_len)
else:
alt_max = len(row.ALT)
alt = alt_max
ref = row.REF
if len(ref) == 1 and alt == 1:
cate = 'SNV'
elif len(ref) > alt:
cate = 'INDEL'
elif alt > len(ref):
cate = 'INDEL'
elif len(ref) == alt:
cate = 'INDEL'
# sister
if row.LCL5 == row.LCL6:
if row.LCL5 == './.':
@@ -61,13 +84,7 @@ for row in merged_genotype_df_sub.itertuples():
if (row.LCL5 == './.' or row.LCL5 == '0/0') and (row.LCL6 == './.' or row.LCL6 == '0/0'):
sister_count = "no"
else:
sister_count = "yes_diff"
if sister_count == 'yes_same':
sister_same += 1
elif sister_count == 'yes_diff':
sister_diff += 1
else:
pass
sister_count = "yes_diff"
# family trio5
if row.LCL5 == row. LCL7 == row.LCL8 == './.':
mendelian = mendelian + ':noInfo'
@@ -91,25 +108,53 @@ for row in merged_genotype_df_sub.itertuples():
mendelian_count = "no"
else:
mendelian_count = "yes"
outline = row.CHROM + '\t' + str(row.POS) + '\t' + row.REF + '\t' + row.ALT + '\t' + row.LCL5 + '\t' + row.LCL6 + '\t' + row.LCL7 + '\t' + row.LCL8 + '\t' + str(row.TRIO5) + '\t' + str(row.TRIO6) + '\t' + str(mendelian) + '\t' + str(mendelian_count) + '\t' + str(sister_count) + '\n'
outline = row.CHROM + '\t' + str(row.POS) + '\t' + row.REF + '\t' + row.ALT + '\t' + cate + '\t' + row.LCL5 + '\t' + row.LCL6 + '\t' + row.LCL7 + '\t' + row.LCL8 + '\t' + str(row.TRIO5) + '\t' + str(row.TRIO6) + '\t' + str(mendelian) + '\t' + str(mendelian_count) + '\t' + str(sister_count) + '\n'
family_file.write(outline)
if mendelian_count == 'yes':
family_all += 1
else:
pass
if mendelian == '1:1:1':
family_mendelian += 1
elif mendelian == 'Ref:1:1':
family_mendelian += 1
else:
pass
if cate == 'SNV':
if sister_count == 'yes_same':
snv_sister_same += 1
elif sister_count == 'yes_diff':
snv_sister_diff += 1
else:
pass
if mendelian_count == 'yes':
snv_family_all += 1
else:
pass
if mendelian == '1:1:1':
snv_family_mendelian += 1
elif mendelian == 'Ref:1:1':
snv_family_mendelian += 1
else:
pass
elif cate == 'INDEL':
if sister_count == 'yes_same':
indel_sister_same += 1
elif sister_count == 'yes_diff':
indel_sister_diff += 1
else:
pass
if mendelian_count == 'yes':
indel_family_all += 1
else:
pass
if mendelian == '1:1:1':
indel_family_mendelian += 1
elif mendelian == 'Ref:1:1':
indel_family_mendelian += 1
else:
pass

sister = sister_same/(sister_same + sister_diff)
quartet = family_mendelian/family_all
snv_sister = snv_sister_same/(snv_sister_same + snv_sister_diff)
indel_sister = indel_sister_same/(indel_sister_same + indel_sister_diff)
snv_quartet = snv_family_mendelian/snv_family_all
indel_quartet = indel_family_mendelian/indel_family_all
outcolumn = 'Family\tReproducibility_D5_D6\tMendelian_Concordance_Quartet\n'
outResult = family + '\t' + str(sister) + '\t' + str(quartet) + '\n'
indel_outResult = family + '_INDEL' + '\t' + str(indel_sister) + '\t' + str(indel_quartet) + '\n'
snv_outResult = family + '_SNV' + '\t' + str(snv_sister) + '\t' + str(snv_quartet) + '\n'
summary_file.write(outcolumn)
summary_file.write(outResult)
summary_file.write(indel_outResult)
summary_file.write(snv_outResult)




+ 23
- 0
defaults Visa fil

@@ -0,0 +1,23 @@
{
"benchmarking_dir": "oss://pgx-result/renluyao/manuscript/benchmark_calls_v3.0/",
"SENTIEON_INSTALL_DIR": "/opt/sentieon-genomics",
"fasta": "GRCh38.d1.vd1.fa",
"BENCHMARKdocker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-hap:latest",
"dbsnp_dir": "oss://pgx-reference-data/GRCh38.d1.vd1/",
"disk_size": "500",
"FASTQCdocker": "registry.cn-shanghai.aliyuncs.com/pgx-docker-registry/fastqc:v0.11.5",
"SMALLcluster_config": "OnDemand bcs.ps.g.xlarge img-ubuntu-vpc",
"screen_ref_dir": "oss://pgx-reference-data/fastq_screen_reference/",
"dbmills_dir": "oss://pgx-reference-data/GRCh38.d1.vd1/",
"BIGcluster_config": "OnDemand bcs.a2.7xlarge img-ubuntu-vpc",
"fastq_screen_conf": "oss://pgx-reference-data/fastq_screen_reference/fastq_screen.conf",
"MULTIQCdocker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/multiqc:v1.8",
"FASTQSCREENdocker": "registry.cn-shanghai.aliyuncs.com/pgx-docker-registry/fastqscreen:0.12.0",
"SENTIEONdocker": "registry.cn-shanghai.aliyuncs.com/pgx-docker-registry/sentieon-genomics:v2018.08.01",
"QUALIMAPdocker": "registry.cn-shanghai.aliyuncs.com/pgx-docker-registry/qualimap:2.0.0",
"db_mills": "Mills_and_1000G_gold_standard.indels.hg38.vcf",
"dbsnp": "dbsnp_146.hg38.vcf",
"MENDELIANdocker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/vbt:v1.1",
"DIYdocker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/high_confidence_call_manuscript:v1.4",
"ref_dir": "oss://pgx-reference-data/GRCh38.d1.vd1/"
}

+ 21
- 21
inputs Visa fil

@@ -1,25 +1,25 @@
{
"{{ project_name }}.benchmarking_dir": "oss://pgx-result/renluyao/manuscript/benchmark_calls_v3.0/",
"{{ project_name }}.SENTIEON_INSTALL_DIR": "/opt/sentieon-genomics",
"{{ project_name }}.fasta": "GRCh38.d1.vd1.fa",
"{{ project_name }}.BENCHMARKdocker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-hap:latest",
"{{ project_name }}.dbsnp_dir": "oss://pgx-reference-data/GRCh38.d1.vd1/",
"{{ project_name }}.disk_size": "500",
"{{ project_name }}.benchmarking_dir": "{{ benchmarking_dir }}",
"{{ project_name }}.SENTIEON_INSTALL_DIR": "{{ SENTIEON_INSTALL_DIR }}",
"{{ project_name }}.fasta": "{{ fasta }}",
"{{ project_name }}.BENCHMARKdocker": "{{ BENCHMARKdocker }}",
"{{ project_name }}.dbsnp_dir": "{{ dbsnp_dir }}",
"{{ project_name }}.disk_size": "{{ disk_size }}",
"{{ project_name }}.inputSamplesFile": "{{ inputSamplesFile }}",
"{{ project_name }}.FASTQCdocker": "registry.cn-shanghai.aliyuncs.com/pgx-docker-registry/fastqc:v0.11.5",
"{{ project_name }}.FASTQCdocker": "{{ FASTQCdocker }}",
"{{ project_name }}.MULTIQCdocker": "{{ MULTIQCdocker }}",
"{{ project_name }}.project": "{{ project }}",
"{{ project_name }}.SMALLcluster_config": "OnDemand bcs.ps.g.xlarge img-ubuntu-vpc",
"{{ project_name }}.screen_ref_dir": "oss://pgx-reference-data/fastq_screen_reference/",
"{{ project_name }}.dbmills_dir": "oss://pgx-reference-data/GRCh38.d1.vd1/",
"{{ project_name }}.BIGcluster_config": "OnDemand bcs.a2.7xlarge img-ubuntu-vpc",
"{{ project_name }}.fastq_screen_conf": "oss://pgx-reference-data/fastq_screen_reference/fastq_screen.conf",
"{{ project_name }}.multiqc.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/multiqc:v1.8",
"{{ project_name }}.FASTQSCREENdocker": "registry.cn-shanghai.aliyuncs.com/pgx-docker-registry/fastqscreen:0.12.0",
"{{ project_name }}.SENTIEONdocker": "registry.cn-shanghai.aliyuncs.com/pgx-docker-registry/sentieon-genomics:v2018.08.01",
"{{ project_name }}.QUALIMAPdocker": "registry.cn-shanghai.aliyuncs.com/pgx-docker-registry/qualimap:2.0.0",
"{{ project_name }}.db_mills": "Mills_and_1000G_gold_standard.indels.hg38.vcf",
"{{ project_name }}.dbsnp": "dbsnp_146.hg38.vcf",
"{{ project_name }}.MENDELIANdocker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/vbt:v1.1",
"{{ project_name }}.DIYdocker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/high_confidence_call_manuscript:v1.4",
"{{ project_name }}.ref_dir": "oss://pgx-reference-data/GRCh38.d1.vd1/"
"{{ project_name }}.SMALLcluster_config": "{{ SMALLcluster_config }}",
"{{ project_name }}.screen_ref_dir": "{{ screen_ref_dir }}",
"{{ project_name }}.dbmills_dir": "{{ dbmills_dir }}",
"{{ project_name }}.BIGcluster_config": "{{ BIGcluster_config }}",
"{{ project_name }}.fastq_screen_conf": "{{ fastq_screen_conf }}",
"{{ project_name }}.FASTQSCREENdocker": "{{ FASTQSCREENdocker }}",
"{{ project_name }}.SENTIEONdocker": "{{ SENTIEONdocker }}",
"{{ project_name }}.QUALIMAPdocker": "{{ QUALIMAPdocker }}",
"{{ project_name }}.db_mills": "{{ db_mills }}",
"{{ project_name }}.dbsnp": "{{ dbsnp }}",
"{{ project_name }}.MENDELIANdocker": "{{ MENDELIANdocker }}",
"{{ project_name }}.DIYdocker": "{{ DIYdocker }}",
"{{ project_name }}.ref_dir": "{{ ref_dir }}"
}

+ 13
- 1
tasks/benchmark.wdl Visa fil

@@ -18,7 +18,19 @@ task benchmark {

export HGREF=/cromwell_root/tmp/reference_data/GRCh38.d1.vd1.fa

/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg bgzip ${vcf} -c > ${sample}.rtg.vcf.gz
cat ${vcf} | grep '#' > header
cat ${vcf} | grep -v '#' | grep -v '0/0' | grep -v '\./\.'| awk '
BEGIN { OFS = "\t" }
{
for ( i=9; i<=NF; i++ ) {
split($i,a,":") ;$i = a[1];
}
}
{ print }
' > body
cat header body > filtered.vcf

/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg bgzip filtered.vcf -c > ${sample}.rtg.vcf.gz
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg index -f vcf ${sample}.rtg.vcf.gz

if [[ ${sample} =~ "LCL5" ]];then

+ 1
- 1
tasks/qualimap.wdl Visa fil

@@ -10,7 +10,7 @@ task qualimap {
set -o pipefail
set -e
nt=$(nproc)
/opt/qualimap/qualimap bamqc -bam ${bam} -outformat PDF:HTML -nt $nt -outdir ${bamname} --java-mem-size=32G
/opt/qualimap/qualimap bamqc -bam ${bam} -outformat PDF:HTML -nt $nt -outdir ${bamname} --java-mem-size=60G
tar -zcvf ${bamname}_qualimap.zip ${bamname}
>>>


+ 1
- 1
tasks/quartet_mendelian.wdl Visa fil

@@ -8,7 +8,7 @@ task quartet_mendelian {
command <<<
for i in ${sep=" " project_mendelian_summary}
do
cat $i | sed -n '2,2p' >> mendelian.summary
cat $i | sed -n '2,3p' >> mendelian.summary
done
sed '1i\Family\tReproducibility_D5_D6\tMendelian_Concordance_Quartet' mendelian.summary > ${project}.mendelian.txt


+ 2
- 0
workflow.wdl Visa fil

@@ -33,6 +33,7 @@ workflow {{ project_name }} {
String BENCHMARKdocker
String MENDELIANdocker
String DIYdocker
String MULTIQCdocker

String fasta
File ref_dir
@@ -225,6 +226,7 @@ workflow {{ project_name }} {
txt2=fastqscreen.txt2,
zip=qualimap.zip,
summary=benchmark.summary,
docker=MULTIQCdocker,
cluster_config=SMALLcluster_config,
disk_size=disk_size
}

Laddar…
Avbryt
Spara