@@ -13,7 +13,7 @@ parser.add_argument('-is', '--is_metrics', type=str, help='*_deduped_is_metrics.
parser.add_argument('-fastqc', '--fastqc', type=str, help='multiqc_fastqc.txt', required=True)
parser.add_argument('-fastqscreen', '--fastqscreen', type=str, help='multiqc_fastq_screen.txt', required=True)
parser.add_argument('-hap', '--happy', type=str, help='multiqc_happy_data.json', required=True)
#parser.add_argument('-hap', '--happy', type=str, help='multiqc_happy_data.json', required=True)
parser.add_argument('-project', '--project_name', type=str, help='project_name', required=True)
@@ -27,7 +27,7 @@ is_metrics_file = args.is_metrics
fastqc_file = args.fastqc
fastqscreen_file = args.fastqscreen
hap_file = args.happy
#hap_file = args.happy
project_name = args.project_name
@@ -107,49 +107,49 @@ post_alignment_dat.to_csv('post_alignment.txt',sep="\t",index=0)
#########################################
# variants calling
with open(hap_file) as hap_json:
happy = json.load(hap_json)
dat =pd.DataFrame.from_records(happy)
dat = dat.loc[:, dat.columns.str.endswith('ALL')]
dat_transposed = dat.T
dat_transposed = dat_transposed.loc[:,['sample_id','QUERY.TOTAL','METRIC.Precision','METRIC.Recall']]
indel = dat_transposed[['INDEL' in s for s in dat_transposed.index]]
snv = dat_transposed[['SNP' in s for s in dat_transposed.index]]
indel.reset_index(drop=True, inplace=True)
snv.reset_index(drop=True, inplace=True)
benchmark = pd.concat([snv, indel], axis=1)
benchmark = benchmark[["sample_id", 'QUERY.TOTAL', 'METRIC.Precision', 'METRIC.Recall']]
benchmark.columns = ['Sample','sample_id','SNV number','INDEL number','SNV precision','INDEL precision','SNV recall','INDEL recall']
benchmark = benchmark[['Sample','SNV number','INDEL number','SNV precision','INDEL precision','SNV recall','INDEL recall']]
benchmark['SNV precision'] = benchmark['SNV precision'].astype(float)
benchmark['INDEL precision'] = benchmark['INDEL precision'].astype(float)
benchmark['SNV recall'] = benchmark['SNV recall'].astype(float)
benchmark['INDEL recall'] = benchmark['INDEL recall'].astype(float)
benchmark['SNV precision'] = benchmark['SNV precision'] * 100
benchmark['INDEL precision'] = benchmark['INDEL precision'] * 100
benchmark['SNV recall'] = benchmark['SNV recall'] * 100
benchmark['INDEL recall'] = benchmark['INDEL recall']* 100
benchmark = benchmark.round(2)
benchmark.to_csv('variants.calling.qc.txt',sep="\t",index=0)
all_rep = [x.split('_')[6] for x in benchmark['Sample']]
if all_rep.count(all_rep[0]) == 4:
rep = list(set(all_rep))
columns = ['Family','Average Precision','Average Recall','Precison SD','Recall SD']
df_ = pd.DataFrame(columns=columns)
for i in rep:
string = "_" + i + "_"
sub_dat = benchmark[benchmark['Sample'].str.contains('_1_')]
mean = list(sub_dat.mean(axis = 0, skipna = True))
sd = list(sub_dat.std(axis = 0, skipna = True))
family_name = project_name + "." + i + ".SNV"
df_ = df_.append({'Family': family_name, 'Average Precision': mean[0], 'Average Recall': mean[2], 'Precison SD': sd[0], 'Recall SD': sd[2] }, ignore_index=True)
family_name = project_name + "." + i + ".INDEL"
df_ = df_.append({'Family': family_name, 'Average Precision': mean[1], 'Average Recall': mean[3], 'Precison SD': sd[1], 'Recall SD': sd[3] }, ignore_index=True)
df_ = df_.round(2)
df_.to_csv('precision.recall.txt',sep="\t",index=0)
else:
pass
#with open(hap_file) as hap_json:
#    happy = json.load(hap_json)
#dat =pd.DataFrame.from_records(happy)
#dat = dat.loc[:, dat.columns.str.endswith('ALL')]
#dat_transposed = dat.T
#dat_transposed = dat_transposed.loc[:,['sample_id','QUERY.TOTAL','METRIC.Precision','METRIC.Recall']]
#indel = dat_transposed[['INDEL' in s for s in dat_transposed.index]]
#snv = dat_transposed[['SNP' in s for s in dat_transposed.index]]
#indel.reset_index(drop=True, inplace=True)
#snv.reset_index(drop=True, inplace=True)
#benchmark = pd.concat([snv, indel], axis=1)
#benchmark = benchmark[["sample_id", 'QUERY.TOTAL', 'METRIC.Precision', 'METRIC.Recall']]
#benchmark.columns = ['Sample','sample_id','SNV number','INDEL number','SNV precision','INDEL precision','SNV recall','INDEL recall']
#benchmark = benchmark[['Sample','SNV number','INDEL number','SNV precision','INDEL precision','SNV recall','INDEL recall']]
#benchmark['SNV precision'] = benchmark['SNV precision'].astype(float)
#benchmark['INDEL precision'] = benchmark['INDEL precision'].astype(float)
#benchmark['SNV recall'] = benchmark['SNV recall'].astype(float)
#benchmark['INDEL recall'] = benchmark['INDEL recall'].astype(float)
#benchmark['SNV precision'] = benchmark['SNV precision'] * 100
#benchmark['INDEL precision'] = benchmark['INDEL precision'] * 100
#benchmark['SNV recall'] = benchmark['SNV recall'] * 100
#benchmark['INDEL recall'] = benchmark['INDEL recall']* 100
#benchmark = benchmark.round(2)
#benchmark.to_csv('variants.calling.qc.txt',sep="\t",index=0)
#all_rep = [x.split('_')[6] for x in benchmark['Sample']]
#if all_rep.count(all_rep[0]) == 4:
#    rep = list(set(all_rep))
#    columns = ['Family','Average Precision','Average Recall','Precison SD','Recall SD']
#    df_ = pd.DataFrame(columns=columns)
#    for i in rep:
#        string = "_" + i + "_"
#        sub_dat = benchmark[benchmark['Sample'].str.contains('_1_')]
#        mean = list(sub_dat.mean(axis = 0, skipna = True))
#        sd = list(sub_dat.std(axis = 0, skipna = True))
#        family_name = project_name + "." + i + ".SNV"
#        df_ = df_.append({'Family': family_name, 'Average Precision': mean[0], 'Average Recall': mean[2], 'Precison SD': sd[0], 'Recall SD': sd[2] }, ignore_index=True)
#        family_name = project_name + "." + i + ".INDEL"
#        df_ = df_.append({'Family': family_name, 'Average Precision': mean[1], 'Average Recall': mean[3], 'Precison SD': sd[1], 'Recall SD': sd[3] }, ignore_index=True)
#    df_ = df_.round(2)
#    df_.to_csv('precision.recall.txt',sep="\t",index=0)
#else:
#    pass
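With the hap.py-based table generation commented out above, per-sample precision and recall now come from the rtg vcfeval summaries written by the reworked benchmark task further down. As an illustration only (the summary.txt column names Precision/Sensitivity and the ${project}.${sample}_SNV/_INDEL_precision_recall.txt file names are assumptions taken from that task, not committed code), the same variants.calling.qc.txt table could be rebuilt from those files roughly like this:

```python
# Sketch only: assumes rtg vcfeval summary.txt columns (Precision, Sensitivity)
# and the <project>.<sample>_{SNV,INDEL}_precision_recall.txt names used by the
# benchmark task; not part of the pipeline as committed.
import glob
import pandas as pd

rows = []
for path in glob.glob('*_precision_recall.txt'):
    vartype = 'SNV' if '_SNV_' in path else 'INDEL'
    with open(path) as fh:
        tokens = [line.split() for line in fh if line.strip()]
    header = tokens[0]                                     # Threshold ... Precision Sensitivity F-measure
    none_row = next(t for t in tokens if t[0] == 'None')   # unfiltered-calls row
    rec = dict(zip(header, none_row))
    rows.append({'Sample': path.replace('_%s_precision_recall.txt' % vartype, ''),
                 'Type': vartype,
                 'Precision': round(float(rec['Precision']) * 100, 2),
                 'Recall': round(float(rec['Sensitivity']) * 100, 2)})

pd.DataFrame(rows).to_csv('variants.calling.qc.txt', sep='\t', index=False)
```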
@@ -20,5 +20,6 @@
"dbsnp": "dbsnp_146.hg38.vcf",
"MENDELIANdocker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/vbt:v1.1",
"DIYdocker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/high_confidence_call_manuscript:v1.4",
"ref_dir": "oss://pgx-reference-data/GRCh38.d1.vd1/"
"ref_dir": "oss://pgx-reference-data/GRCh38.d1.vd1/",
"sdf": "oss://pgx-reference-data/GRCh38.d1.vd1/GRCh38.d1.vd1.sdf/"
}
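The new "sdf" entry points at an RTG sequence data folder (SDF) for the same GRCh38.d1.vd1 reference, which rtg vcfeval consumes via -t in the benchmark task. A minimal sketch of how such an SDF could be produced, assuming rtg-tools is on PATH and using illustrative local paths:

```python
# Sketch only: builds the SDF referenced by the new "sdf" entry from the
# GRCh38.d1.vd1 FASTA; assumes rtg-tools is installed and paths are illustrative.
import subprocess

subprocess.run(
    ['rtg', 'format', '-o', 'GRCh38.d1.vd1.sdf', 'GRCh38.d1.vd1.fa'],
    check=True,
)
```

The resulting .sdf directory would then be uploaded to the oss:// location referenced above.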
@@ -15,6 +15,7 @@
"{{ project_name }}.dbmills_dir": "{{ dbmills_dir }}",
"{{ project_name }}.BIGcluster_config": "{{ BIGcluster_config }}",
"{{ project_name }}.fastq_screen_conf": "{{ fastq_screen_conf }}",
"{{ project_name }}.sdf": "{{ sdf }}",
"{{ project_name }}.FASTQSCREENdocker": "{{ FASTQSCREENdocker }}",
"{{ project_name }}.SENTIEONdocker": "{{ SENTIEONdocker }}",
"{{ project_name }}.QUALIMAPdocker": "{{ QUALIMAPdocker }}",
@@ -1,22 +1,19 @@
task benchmark {
File vcf
File benchmarking_dir
File ref_dir
File sdf
String project
String sample = basename(vcf,".splited.vcf")
String fasta
String docker
String cluster_config
String disk_size
command <<<
set -o pipefail
set -e
nt=$(nproc)
mkdir -p /cromwell_root/tmp
cp -r ${ref_dir} /cromwell_root/tmp/
export HGREF=/cromwell_root/tmp/reference_data/GRCh38.d1.vd1.fa
cp -r ${benchmarking_dir} /cromwell_root/tmp/
cat ${vcf} | grep '#' > header
cat ${vcf} | grep -v '#' | grep -v '0/0' | grep -v '\./\.'| awk '
@@ -33,17 +30,28 @@ task benchmark {
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg bgzip filtered.vcf -c > ${sample}.rtg.vcf.gz
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg index -f vcf ${sample}.rtg.vcf.gz
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg vcffilter -i ${project}.${sample}.rtg.vcf.gz -o ${sample}.rtg.SNV.vcf.gz --snps-only
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg vcffilter -i ${project}.${sample}.rtg.vcf.gz -o ${sample}.rtg.INDEL.vcf.gz --non-snps-only
if [[ ${sample} =~ "LCL5" ]];then
/opt/hap.py/bin/hap.py ${benchmarking_dir}/LCL5.high.confidence.calls.vcf.gz ${sample}.rtg.vcf.gz -f ${benchmarking_dir}/Quartet.high.confidence.region.v202103.bed --threads $nt -o ${sample} -r ${ref_dir}/${fasta}
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg vcfeval -b /cromwell_root/tmp/reference_datasets_v202103/LCL5.high.confidence.calls.SNV.vcf.gz -c ${sample}.rtg.SNV.vcf.gz -o ${sample}_SNV -t ${sdf} --evaluation-regions=/cromwell_root/tmp/reference_datasets_v202103/Quartet.high.confidence.region.v202103.bed
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg vcfeval -b /cromwell_root/tmp/reference_datasets_v202103/LCL5.high.confidence.calls.INDEL.vcf.gz -c ${sample}.rtg.INDEL.vcf.gz -o ${sample}_INDEL -t ${sdf} --evaluation-regions=/cromwell_root/tmp/reference_datasets_v202103/Quartet.high.confidence.region.v202103.bed
elif [[ ${sample} =~ "LCL6" ]]; then
/opt/hap.py/bin/hap.py ${benchmarking_dir}/LCL6.high.confidence.calls.vcf.gz ${sample}.rtg.vcf.gz -f ${benchmarking_dir}/Quartet.high.confidence.region.v202103.bed --threads $nt -o ${sample} -r ${ref_dir}/${fasta}
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg vcfeval -b /cromwell_root/tmp/reference_datasets_v202103/LCL6.high.confidence.calls.SNV.vcf.gz -c ${sample}.rtg.SNV.vcf.gz -o ${sample}_SNV -t ${sdf} --evaluation-regions=/cromwell_root/tmp/reference_datasets_v202103/Quartet.high.confidence.region.v202103.bed
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg vcfeval -b /cromwell_root/tmp/reference_datasets_v202103/LCL6.high.confidence.calls.INDEL.vcf.gz -c ${sample}.rtg.INDEL.vcf.gz -o ${sample}_INDEL -t ${sdf} --evaluation-regions=/cromwell_root/tmp/reference_datasets_v202103/Quartet.high.confidence.region.v202103.bed
elif [[ ${sample} =~ "LCL7" ]]; then
/opt/hap.py/bin/hap.py ${benchmarking_dir}/LCL7.high.confidence.calls.vcf.gz ${sample}.rtg.vcf.gz -f ${benchmarking_dir}/Quartet.high.confidence.region.v202103.bed --threads $nt -o ${sample} -r ${ref_dir}/${fasta}
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg vcfeval -b /cromwell_root/tmp/reference_datasets_v202103/LCL7.high.confidence.calls.SNV.vcf.gz -c ${sample}.rtg.SNV.vcf.gz -o ${sample}_SNV -t ${sdf} --evaluation-regions=/cromwell_root/tmp/reference_datasets_v202103/Quartet.high.confidence.region.v202103.bed
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg vcfeval -b /cromwell_root/tmp/reference_datasets_v202103/LCL7.high.confidence.calls.INDEL.vcf.gz -c ${sample}.rtg.INDEL.vcf.gz -o ${sample}_INDEL -t ${sdf} --evaluation-regions=/cromwell_root/tmp/reference_datasets_v202103/Quartet.high.confidence.region.v202103.bed
elif [[ ${sample} =~ "LCL8" ]]; then
/opt/hap.py/bin/hap.py ${benchmarking_dir}/LCL8.high.confidence.calls.vcf.gz ${sample}.rtg.vcf.gz -f ${benchmarking_dir}/Quartet.high.confidence.region.v202103.bed --threads $nt -o ${sample} -r ${ref_dir}/${fasta}
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg vcfeval -b /cromwell_root/tmp/reference_datasets_v202103/LCL8.high.confidence.calls.SNV.vcf.gz -c ${sample}.rtg.SNV.vcf.gz -o ${sample}_SNV -t ${sdf} --evaluation-regions=/cromwell_root/tmp/reference_datasets_v202103/Quartet.high.confidence.region.v202103.bed
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg vcfeval -b /cromwell_root/tmp/reference_datasets_v202103/LCL8.high.confidence.calls.INDEL.vcf.gz -c ${sample}.rtg.INDEL.vcf.gz -o ${sample}_INDEL -t ${sdf} --evaluation-regions=/cromwell_root/tmp/reference_datasets_v202103/Quartet.high.confidence.region.v202103.bed
else
echo "only for quartet samples"
fi
cat ${sample}_SNV/summary.txt > ${project}.${sample}_SNV_precision_recall.txt
cat ${sample}_INDEL/summary.txt > ${project}.${sample}_INDEL_precision_recall.txt
>>>
runtime {
@@ -54,17 +62,9 @@ task benchmark {
}
output {
File rtg_vcf = "${sample}.rtg.vcf.gz"
File rtg_vcf_index = "${sample}.rtg.vcf.gz.tbi"
File gzip_vcf = "${sample}.vcf.gz"
File gzip_vcf_index = "${sample}.vcf.gz.tbi"
File roc_all_csv = "${sample}.roc.all.csv.gz"
File roc_indel = "${sample}.roc.Locations.INDEL.csv.gz"
File roc_indel_pass = "${sample}.roc.Locations.INDEL.PASS.csv.gz"
File roc_snp = "${sample}.roc.Locations.SNP.csv.gz"
File roc_snp_pass = "${sample}.roc.Locations.SNP.PASS.csv.gz"
File summary = "${sample}.summary.csv"
File extended = "${sample}.extended.csv"
File metrics = "${sample}.metrics.json.gz"
File rtg_vcf = "${project}.${sample}.rtg.vcf.gz"
File rtg_vcf_index = "${project}.${sample}.rtg.vcf.gz.tbi"
File SNV_result = "${project}.${sample}_SNV_precision_recall.txt"
File Indel_result = "${project}.${sample}_INDEL_precision_recall.txt"
}
}
@@ -7,7 +7,7 @@ task extract_tables {
File fastqc
File fastqscreen
File hap
String project
String docker
@@ -16,7 +16,7 @@ task extract_tables {
command <<<
python /opt/extract_tables.py -quality ${quality_yield_summary} -depth ${wgs_metrics_summary} -aln ${aln_metrics_summary} -is ${is_metrics_summary} -fastqc ${fastqc} -fastqscreen ${fastqscreen} -hap ${hap} -project ${project}
python /opt/extract_tables.py -quality ${quality_yield_summary} -depth ${wgs_metrics_summary} -aln ${aln_metrics_summary} -is ${is_metrics_summary} -fastqc ${fastqc} -fastqscreen ${fastqscreen} -project ${project}
>>>
@@ -6,8 +6,6 @@ task multiqc {
Array[File] txt1
Array[File] txt2
Array[File] summary
String docker
String cluster_config
String disk_size
@@ -17,9 +15,7 @@ task multiqc {
set -e
mkdir -p /cromwell_root/tmp/fastqc
mkdir -p /cromwell_root/tmp/fastqscreen
mkdir -p /cromwell_root/tmp/benchmark
cp ${sep=" " summary} /cromwell_root/tmp/benchmark
cp ${sep=" " read1_zip} ${sep=" " read2_zip} /cromwell_root/tmp/fastqc
cp ${sep=" " txt1} ${sep=" " txt2} /cromwell_root/tmp/fastqscreen
@@ -27,7 +23,6 @@ task multiqc {
cat multiqc_data/multiqc_general_stats.txt > multiqc_general_stats.txt
cat multiqc_data/multiqc_fastq_screen.txt > multiqc_fastq_screen.txt
cat multiqc_data/multiqc_happy_data.json > multiqc_happy_data.json
>>>
@@ -43,6 +38,5 @@ task multiqc {
Array[File] multiqc_txt = glob("multiqc_data/*")
File fastqc = "multiqc_general_stats.txt"
File fastqscreen = "multiqc_fastq_screen.txt"
File hap = "multiqc_happy_data.json"
}
}
@@ -10,7 +10,7 @@ task quartet_mendelian {
do
cat $i | sed -n '2,3p' >> mendelian.summary
done
sed '1i\Family\tReproducibility_D5_D6\tMendelian_Concordance_Quartet' mendelian.summary > ${project}.mendelian.txt
sed '1i\Family\tReproducibility_D5_D6\tMendelian_Concordance_Quartet' mendelian.summary > mendelian.txt
>>>
@@ -42,6 +42,7 @@ workflow {{ project_name }} {
String db_mills
File dbsnp_dir
String dbsnp
File sdf
File screen_ref_dir
File fastq_screen_conf
@@ -203,8 +204,8 @@ workflow {{ project_name }} {
input:
vcf=single_gvcf[idx],
benchmarking_dir=benchmarking_dir,
ref_dir=ref_dir,
fasta=fasta,
sdf=sdf,
project=project,
docker=BENCHMARKdocker,
cluster_config=BIGcluster_config,
disk_size=disk_size,
@@ -218,7 +219,6 @@ workflow {{ project_name }} {
read2_zip=fastqc.read2_zip,
txt1=fastqscreen.txt1,
txt2=fastqscreen.txt2,
summary=benchmark.summary,
docker=MULTIQCdocker,
cluster_config=SMALLcluster_config,
disk_size=disk_size
@@ -248,7 +248,6 @@ workflow {{ project_name }} {
is_metrics_summary=merge_sentieon_metrics.is_metrics_summary,
fastqc=multiqc.fastqc,
fastqscreen=multiqc.fastqscreen,
hap=multiqc.hap,
project=project,
docker=DIYdocker,
cluster_config=SMALLcluster_config,