浏览代码

benchmark

master
LUYAO REN 4 年前
父节点
当前提交
a57ffb135d
共有 8 个文件被更改,包括 76 次插入和 81 次删除
  1. +45
    -45
      codescripts/extract_tables.py
  2. +2
    -1
      defaults
  3. +1
    -0
      inputs
  4. +22
    -22
      tasks/benchmark.wdl
  5. +2
    -2
      tasks/extract_tables.wdl
  6. +0
    -6
      tasks/multiqc.wdl
  7. +1
    -1
      tasks/quartet_mendelian.wdl
  8. +3
    -4
      workflow.wdl

+ 45
- 45
codescripts/extract_tables.py 查看文件

@@ -13,7 +13,7 @@ parser.add_argument('-is', '--is_metrics', type=str, help='*_deduped_is_metrics.

parser.add_argument('-fastqc', '--fastqc', type=str, help='multiqc_fastqc.txt', required=True)
parser.add_argument('-fastqscreen', '--fastqscreen', type=str, help='multiqc_fastq_screen.txt', required=True)
parser.add_argument('-hap', '--happy', type=str, help='multiqc_happy_data.json', required=True)
#parser.add_argument('-hap', '--happy', type=str, help='multiqc_happy_data.json', required=True)

parser.add_argument('-project', '--project_name', type=str, help='project_name', required=True)

@@ -27,7 +27,7 @@ is_metrics_file = args.is_metrics

fastqc_file = args.fastqc
fastqscreen_file = args.fastqscreen
hap_file = args.happy
#hap_file = args.happy

project_name = args.project_name

@@ -107,49 +107,49 @@ post_alignment_dat.to_csv('post_alignment.txt',sep="\t",index=0)

#########################################
# variants calling
with open(hap_file) as hap_json:
happy = json.load(hap_json)
dat =pd.DataFrame.from_records(happy)
dat = dat.loc[:, dat.columns.str.endswith('ALL')]
dat_transposed = dat.T
dat_transposed = dat_transposed.loc[:,['sample_id','QUERY.TOTAL','METRIC.Precision','METRIC.Recall']]
indel = dat_transposed[['INDEL' in s for s in dat_transposed.index]]
snv = dat_transposed[['SNP' in s for s in dat_transposed.index]]
indel.reset_index(drop=True, inplace=True)
snv.reset_index(drop=True, inplace=True)
benchmark = pd.concat([snv, indel], axis=1)
benchmark = benchmark[["sample_id", 'QUERY.TOTAL', 'METRIC.Precision', 'METRIC.Recall']]
benchmark.columns = ['Sample','sample_id','SNV number','INDEL number','SNV precision','INDEL precision','SNV recall','INDEL recall']
benchmark = benchmark[['Sample','SNV number','INDEL number','SNV precision','INDEL precision','SNV recall','INDEL recall']]
benchmark['SNV precision'] = benchmark['SNV precision'].astype(float)
benchmark['INDEL precision'] = benchmark['INDEL precision'].astype(float)
benchmark['SNV recall'] = benchmark['SNV recall'].astype(float)
benchmark['INDEL recall'] = benchmark['INDEL recall'].astype(float)
benchmark['SNV precision'] = benchmark['SNV precision'] * 100
benchmark['INDEL precision'] = benchmark['INDEL precision'] * 100
benchmark['SNV recall'] = benchmark['SNV recall'] * 100
benchmark['INDEL recall'] = benchmark['INDEL recall']* 100
benchmark = benchmark.round(2)
benchmark.to_csv('variants.calling.qc.txt',sep="\t",index=0)
all_rep = [x.split('_')[6] for x in benchmark['Sample']]
if all_rep.count(all_rep[0]) == 4:
rep = list(set(all_rep))
columns = ['Family','Average Precision','Average Recall','Precison SD','Recall SD']
df_ = pd.DataFrame(columns=columns)
for i in rep:
string = "_" + i + "_"
sub_dat = benchmark[benchmark['Sample'].str.contains('_1_')]
mean = list(sub_dat.mean(axis = 0, skipna = True))
sd = list(sub_dat.std(axis = 0, skipna = True))
family_name = project_name + "." + i + ".SNV"
df_ = df_.append({'Family': family_name, 'Average Precision': mean[0], 'Average Recall': mean[2], 'Precison SD': sd[0], 'Recall SD': sd[2] }, ignore_index=True)
family_name = project_name + "." + i + ".INDEL"
df_ = df_.append({'Family': family_name, 'Average Precision': mean[1], 'Average Recall': mean[3], 'Precison SD': sd[1], 'Recall SD': sd[3] }, ignore_index=True)
df_ = df_.round(2)
df_.to_csv('precision.recall.txt',sep="\t",index=0)
else:
pass
#with open(hap_file) as hap_json:
# happy = json.load(hap_json)
#dat =pd.DataFrame.from_records(happy)
#dat = dat.loc[:, dat.columns.str.endswith('ALL')]
#dat_transposed = dat.T
#dat_transposed = dat_transposed.loc[:,['sample_id','QUERY.TOTAL','METRIC.Precision','METRIC.Recall']]
#indel = dat_transposed[['INDEL' in s for s in dat_transposed.index]]
#snv = dat_transposed[['SNP' in s for s in dat_transposed.index]]
#indel.reset_index(drop=True, inplace=True)
#snv.reset_index(drop=True, inplace=True)
#benchmark = pd.concat([snv, indel], axis=1)
#benchmark = benchmark[["sample_id", 'QUERY.TOTAL', 'METRIC.Precision', 'METRIC.Recall']]
#benchmark.columns = ['Sample','sample_id','SNV number','INDEL number','SNV precision','INDEL precision','SNV recall','INDEL recall']
#benchmark = benchmark[['Sample','SNV number','INDEL number','SNV precision','INDEL precision','SNV recall','INDEL recall']]
#benchmark['SNV precision'] = benchmark['SNV precision'].astype(float)
#benchmark['INDEL precision'] = benchmark['INDEL precision'].astype(float)
#benchmark['SNV recall'] = benchmark['SNV recall'].astype(float)
#benchmark['INDEL recall'] = benchmark['INDEL recall'].astype(float)
#benchmark['SNV precision'] = benchmark['SNV precision'] * 100
#benchmark['INDEL precision'] = benchmark['INDEL precision'] * 100
#benchmark['SNV recall'] = benchmark['SNV recall'] * 100
#benchmark['INDEL recall'] = benchmark['INDEL recall']* 100
#benchmark = benchmark.round(2)
#benchmark.to_csv('variants.calling.qc.txt',sep="\t",index=0)
#all_rep = [x.split('_')[6] for x in benchmark['Sample']]
#if all_rep.count(all_rep[0]) == 4:
# rep = list(set(all_rep))
# columns = ['Family','Average Precision','Average Recall','Precison SD','Recall SD']
# df_ = pd.DataFrame(columns=columns)
# for i in rep:
# string = "_" + i + "_"
# sub_dat = benchmark[benchmark['Sample'].str.contains('_1_')]
# mean = list(sub_dat.mean(axis = 0, skipna = True))
# sd = list(sub_dat.std(axis = 0, skipna = True))
# family_name = project_name + "." + i + ".SNV"
# df_ = df_.append({'Family': family_name, 'Average Precision': mean[0], 'Average Recall': mean[2], 'Precison SD': sd[0], 'Recall SD': sd[2] }, ignore_index=True)
# family_name = project_name + "." + i + ".INDEL"
# df_ = df_.append({'Family': family_name, 'Average Precision': mean[1], 'Average Recall': mean[3], 'Precison SD': sd[1], 'Recall SD': sd[3] }, ignore_index=True)
# df_ = df_.round(2)
# df_.to_csv('precision.recall.txt',sep="\t",index=0)
#else:
# pass




+ 2
- 1
defaults 查看文件

@@ -20,5 +20,6 @@
"dbsnp": "dbsnp_146.hg38.vcf",
"MENDELIANdocker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/vbt:v1.1",
"DIYdocker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/high_confidence_call_manuscript:v1.4",
"ref_dir": "oss://pgx-reference-data/GRCh38.d1.vd1/"
"ref_dir": "oss://pgx-reference-data/GRCh38.d1.vd1/",
"sdf": "oss://pgx-reference-data/GRCh38.d1.vd1/GRCh38.d1.vd1.sdf/"
}

+ 1
- 0
inputs 查看文件

@@ -15,6 +15,7 @@
"{{ project_name }}.dbmills_dir": "{{ dbmills_dir }}",
"{{ project_name }}.BIGcluster_config": "{{ BIGcluster_config }}",
"{{ project_name }}.fastq_screen_conf": "{{ fastq_screen_conf }}",
"{{ project_name }}.sdf": "{{ sdf }}",
"{{ project_name }}.FASTQSCREENdocker": "{{ FASTQSCREENdocker }}",
"{{ project_name }}.SENTIEONdocker": "{{ SENTIEONdocker }}",
"{{ project_name }}.QUALIMAPdocker": "{{ QUALIMAPdocker }}",

+ 22
- 22
tasks/benchmark.wdl 查看文件

@@ -1,22 +1,19 @@
task benchmark {
File vcf
File benchmarking_dir
File ref_dir
File sdf
String project
String sample = basename(vcf,".splited.vcf")
String fasta
String docker
String cluster_config
String disk_size


command <<<
set -o pipefail
set -e
nt=$(nproc)
mkdir -p /cromwell_root/tmp
cp -r ${ref_dir} /cromwell_root/tmp/

export HGREF=/cromwell_root/tmp/reference_data/GRCh38.d1.vd1.fa
cp -r ${benchmarking_dir} /cromwell_root/tmp/

cat ${vcf} | grep '#' > header
cat ${vcf} | grep -v '#' | grep -v '0/0' | grep -v '\./\.'| awk '
@@ -33,17 +30,28 @@ task benchmark {
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg bgzip filtered.vcf -c > ${sample}.rtg.vcf.gz
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg index -f vcf ${sample}.rtg.vcf.gz

/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg vcffilter -i ${project}.${sample}.rtg.vcf.gz -o ${sample}.rtg.SNV.vcf.gz --snps-only
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg vcffilter -i ${project}.${sample}.rtg.vcf.gz -o ${sample}.rtg.INDEL.vcf.gz --non-snps-only

if [[ ${sample} =~ "LCL5" ]];then
/opt/hap.py/bin/hap.py ${benchmarking_dir}/LCL5.high.confidence.calls.vcf.gz ${sample}.rtg.vcf.gz -f ${benchmarking_dir}/Quartet.high.confidence.region.v202103.bed --threads $nt -o ${sample} -r ${ref_dir}/${fasta}
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg vcfeval -b /cromwell_root/tmp/reference_datasets_v202103/LCL5.high.confidence.calls.SNV.vcf.gz -c ${sample}.rtg.SNV.vcf.gz -o ${sample}_SNV -t ${sdf} --evaluation-regions=/cromwell_root/tmp/reference_datasets_v202103/Quartet.high.confidence.region.v202103.bed
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg vcfeval -b /cromwell_root/tmp/reference_datasets_v202103/LCL5.high.confidence.calls.INDEL.vcf.gz -c ${sample}.rtg.INDEL.vcf.gz -o ${sample}_INDEL -t ${sdf} --evaluation-regions=/cromwell_root/tmp/reference_datasets_v202103/Quartet.high.confidence.region.v202103.bed
elif [[ ${sample} =~ "LCL6" ]]; then
/opt/hap.py/bin/hap.py ${benchmarking_dir}/LCL6.high.confidence.calls.vcf.gz ${sample}.rtg.vcf.gz -f ${benchmarking_dir}/Quartet.high.confidence.region.v202103.bed --threads $nt -o ${sample} -r ${ref_dir}/${fasta}
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg vcfeval -b /cromwell_root/tmp/reference_datasets_v202103/LCL6.high.confidence.calls.SNV.vcf.gz -c ${sample}.rtg.SNV.vcf.gz -o ${sample}_SNV -t ${sdf} --evaluation-regions=/cromwell_root/tmp/reference_datasets_v202103/Quartet.high.confidence.region.v202103.bed
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg vcfeval -b /cromwell_root/tmp/reference_datasets_v202103/LCL6.high.confidence.calls.INDEL.vcf.gz -c ${sample}.rtg.INDEL.vcf.gz -o ${sample}_INDEL -t ${sdf} --evaluation-regions=/cromwell_root/tmp/reference_datasets_v202103/Quartet.high.confidence.region.v202103.bed
elif [[ ${sample} =~ "LCL7" ]]; then
/opt/hap.py/bin/hap.py ${benchmarking_dir}/LCL7.high.confidence.calls.vcf.gz ${sample}.rtg.vcf.gz -f ${benchmarking_dir}/Quartet.high.confidence.region.v202103.bed --threads $nt -o ${sample} -r ${ref_dir}/${fasta}
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg vcfeval -b /cromwell_root/tmp/reference_datasets_v202103/LCL7.high.confidence.calls.SNV.vcf.gz -c ${sample}.rtg.SNV.vcf.gz -o ${sample}_SNV -t ${sdf} --evaluation-regions=/cromwell_root/tmp/reference_datasets_v202103/Quartet.high.confidence.region.v202103.bed
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg vcfeval -b /cromwell_root/tmp/reference_datasets_v202103/LCL7.high.confidence.calls.INDEL.vcf.gz -c ${sample}.rtg.INDEL.vcf.gz -o ${sample}_INDEL -t ${sdf} --evaluation-regions=/cromwell_root/tmp/reference_datasets_v202103/Quartet.high.confidence.region.v202103.bed
elif [[ ${sample} =~ "LCL8" ]]; then
/opt/hap.py/bin/hap.py ${benchmarking_dir}/LCL8.high.confidence.calls.vcf.gz ${sample}.rtg.vcf.gz -f ${benchmarking_dir}/Quartet.high.confidence.region.v202103.bed --threads $nt -o ${sample} -r ${ref_dir}/${fasta}
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg vcfeval -b /cromwell_root/tmp/reference_datasets_v202103/LCL8.high.confidence.calls.SNV.vcf.gz -c ${sample}.rtg.SNV.vcf.gz -o ${sample}_SNV -t ${sdf} --evaluation-regions=/cromwell_root/tmp/reference_datasets_v202103/Quartet.high.confidence.region.v202103.bed
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg vcfeval -b /cromwell_root/tmp/reference_datasets_v202103/LCL8.high.confidence.calls.INDEL.vcf.gz -c ${sample}.rtg.INDEL.vcf.gz -o ${sample}_INDEL -t ${sdf} --evaluation-regions=/cromwell_root/tmp/reference_datasets_v202103/Quartet.high.confidence.region.v202103.bed
else
echo "only for quartet samples"
fi

cat ${sample}_SNV/summary.txt > ${project}.${sample}_SNV_precision_recall.txt
cat ${sample}_INDEL/summary.txt > ${project}.${sample}_INDEL_precision_recall.txt

>>>

runtime {
@@ -54,17 +62,9 @@ task benchmark {
}

output {
File rtg_vcf = "${sample}.rtg.vcf.gz"
File rtg_vcf_index = "${sample}.rtg.vcf.gz.tbi"
File gzip_vcf = "${sample}.vcf.gz"
File gzip_vcf_index = "${sample}.vcf.gz.tbi"
File roc_all_csv = "${sample}.roc.all.csv.gz"
File roc_indel = "${sample}.roc.Locations.INDEL.csv.gz"
File roc_indel_pass = "${sample}.roc.Locations.INDEL.PASS.csv.gz"
File roc_snp = "${sample}.roc.Locations.SNP.csv.gz"
File roc_snp_pass = "${sample}.roc.Locations.SNP.PASS.csv.gz"
File summary = "${sample}.summary.csv"
File extended = "${sample}.extended.csv"
File metrics = "${sample}.metrics.json.gz"
File rtg_vcf = "${project}.${sample}.rtg.vcf.gz"
File rtg_vcf_index = "${project}.${sample}.rtg.vcf.gz.tbi"
File SNV_result = "${project}.${sample}_SNV_precision_recall.txt"
File Indel_result = "${project}.${sample}_INDEL_precision_recall.txt"
}
}

+ 2
- 2
tasks/extract_tables.wdl 查看文件

@@ -7,7 +7,7 @@ task extract_tables {

File fastqc
File fastqscreen
File hap

String project
String docker
@@ -16,7 +16,7 @@ task extract_tables {

command <<<

python /opt/extract_tables.py -quality ${quality_yield_summary} -depth ${wgs_metrics_summary} -aln ${aln_metrics_summary} -is ${is_metrics_summary} -fastqc ${fastqc} -fastqscreen ${fastqscreen} -hap ${hap} -project ${project}
python /opt/extract_tables.py -quality ${quality_yield_summary} -depth ${wgs_metrics_summary} -aln ${aln_metrics_summary} -is ${is_metrics_summary} -fastqc ${fastqc} -fastqscreen ${fastqscreen} -project ${project}

>>>


+ 0
- 6
tasks/multiqc.wdl 查看文件

@@ -6,8 +6,6 @@ task multiqc {
Array[File] txt1
Array[File] txt2

Array[File] summary

String docker
String cluster_config
String disk_size
@@ -17,9 +15,7 @@ task multiqc {
set -e
mkdir -p /cromwell_root/tmp/fastqc
mkdir -p /cromwell_root/tmp/fastqscreen
mkdir -p /cromwell_root/tmp/benchmark

cp ${sep=" " summary} /cromwell_root/tmp/benchmark
cp ${sep=" " read1_zip} ${sep=" " read2_zip} /cromwell_root/tmp/fastqc
cp ${sep=" " txt1} ${sep=" " txt2} /cromwell_root/tmp/fastqscreen

@@ -27,7 +23,6 @@ task multiqc {

cat multiqc_data/multiqc_general_stats.txt > multiqc_general_stats.txt
cat multiqc_data/multiqc_fastq_screen.txt > multiqc_fastq_screen.txt
cat multiqc_data/multiqc_happy_data.json > multiqc_happy_data.json
>>>

@@ -43,6 +38,5 @@ task multiqc {
Array[File] multiqc_txt = glob("multiqc_data/*")
File fastqc = "multiqc_general_stats.txt"
File fastqscreen = "multiqc_fastq_screen.txt"
File hap = "multiqc_happy_data.json"
}
}

+ 1
- 1
tasks/quartet_mendelian.wdl 查看文件

@@ -10,7 +10,7 @@ task quartet_mendelian {
do
cat $i | sed -n '2,3p' >> mendelian.summary
done
sed '1i\Family\tReproducibility_D5_D6\tMendelian_Concordance_Quartet' mendelian.summary > ${project}.mendelian.txt
sed '1i\Family\tReproducibility_D5_D6\tMendelian_Concordance_Quartet' mendelian.summary > mendelian.txt

>>>


+ 3
- 4
workflow.wdl 查看文件

@@ -42,6 +42,7 @@ workflow {{ project_name }} {
String db_mills
File dbsnp_dir
String dbsnp
File sdf

File screen_ref_dir
File fastq_screen_conf
@@ -203,8 +204,8 @@ workflow {{ project_name }} {
input:
vcf=single_gvcf[idx],
benchmarking_dir=benchmarking_dir,
ref_dir=ref_dir,
fasta=fasta,
sdf=sdf,
project=project,
docker=BENCHMARKdocker,
cluster_config=BIGcluster_config,
disk_size=disk_size,
@@ -218,7 +219,6 @@ workflow {{ project_name }} {
read2_zip=fastqc.read2_zip,
txt1=fastqscreen.txt1,
txt2=fastqscreen.txt2,
summary=benchmark.summary,
docker=MULTIQCdocker,
cluster_config=SMALLcluster_config,
disk_size=disk_size
@@ -248,7 +248,6 @@ workflow {{ project_name }} {
is_metrics_summary=merge_sentieon_metrics.is_metrics_summary,
fastqc=multiqc.fastqc,
fastqscreen=multiqc.fastqscreen,
hap=multiqc.hap,
project=project,
docker=DIYdocker,
cluster_config=SMALLcluster_config,

正在加载...
取消
保存