merge sentieon

master
LUYAO REN, 4 years ago
commit cad4baa33d
7 changed files with 228 additions and 67 deletions
  1. codescripts/extract_tables.py (+113 −0)
  2. tasks/extract_tables.wdl (+8 −4)
  3. tasks/mergeSentieon.wdl (+0 −43)
  4. tasks/merge_family.wdl (+1 −1)
  5. tasks/merge_sentieon_metrics.wdl (+48 −0)
  6. tasks/sentieon.wdl (+21 −14)
  7. workflow.wdl (+37 −5)

codescripts/extract_tables.py (+113 −0, new file)

import json
import sys, argparse, os
from functools import reduce

import pandas as pd

parser = argparse.ArgumentParser(description="Collect metrics from the MultiQC and Sentieon outputs and write quality-metric tables for the raw fastq, the bam, and the variant calls (precision and recall)")


parser.add_argument('-quality', '--quality_yield', type=str, help='*.quality_yield.txt', required=True)
parser.add_argument('-depth', '--wgs_metrics', type=str, help='*deduped_WgsMetricsAlgo.txt', required=True)
parser.add_argument('-aln', '--aln_metrics', type=str, help='*_deduped_aln_metrics.txt', required=True)
parser.add_argument('-is', '--is_metrics', type=str, help='*_deduped_is_metrics.txt', required=True)

parser.add_argument('-fastqc', '--fastqc', type=str, help='multiqc_fastqc.txt', required=True)
parser.add_argument('-fastqscreen', '--fastqscreen', type=str, help='multiqc_fastq_screen.txt', required=True)
parser.add_argument('-hap', '--happy', type=str, help='multiqc_happy_data.json', required=True)

args = parser.parse_args()

# Rename inputs:
quality_yield_file = args.quality_yield
wgs_metrics_file = args.wgs_metrics
aln_metrics_file = args.aln_metrics
is_metrics_file = args.is_metrics

fastqc_file = args.fastqc
fastqscreen_file = args.fastqscreen
hap_file = args.happy

#############################################
# fastqc
fastqc = pd.read_table(fastqc_file)


# fastqscreen: keep the percentage columns; strip the "_screen" suffix from sample names
dat = pd.read_table(fastqscreen_file)
fastqscreen = dat.loc[:, dat.columns.str.endswith('percentage')].copy()
fastqscreen.insert(loc=0, column='Sample', value=dat['Sample'].str.replace('_screen', ''))

# pre-alignment table: merge the FastQC and fastq_screen metrics on sample name
pre_alignment_dat = pd.merge(fastqc, fastqscreen, how="outer", on='Sample')
pre_alignment_dat['FastQC_mqc-generalstats-fastqc-total_sequences'] = pre_alignment_dat['FastQC_mqc-generalstats-fastqc-total_sequences'] / 1000000
pre_alignment_dat = pre_alignment_dat.drop(columns=[
    'FastQC_mqc-generalstats-fastqc-percent_fails',
    'FastQC_mqc-generalstats-fastqc-avg_sequence_length',
    'ERCC percentage', 'Phix percentage', 'Mouse percentage'])
pre_alignment_dat = pre_alignment_dat.round(2)
pre_alignment_dat.columns = ['Sample','%Dup','%GC','Total Sequences (million)','%Human','%EColi','%Adapter','%Vector','%rRNA','%Virus','%Yeast','%Mitoch','%No hits']
pre_alignment_dat.to_csv('pre_alignment.txt', sep="\t", index=False)



############################
# post-alignment
dat = pd.read_table(aln_metrics_file)
dat['PCT_ALIGNED_READS'] = dat["PF_READS_ALIGNED"] / dat["TOTAL_READS"]
aln_metrics = dat[["Sample", "PCT_ALIGNED_READS", "PF_MISMATCH_RATE"]].copy()
# convert fractions to percentages; keep the Sample column out of the arithmetic
aln_metrics[["PCT_ALIGNED_READS", "PF_MISMATCH_RATE"]] *= 100

dat = pd.read_table(is_metrics_file)
is_metrics = dat[['Sample', 'MEDIAN_INSERT_SIZE']]

dat = pd.read_table(quality_yield_file)
dat['%Q20'] = dat['Q20_BASES'] / dat['TOTAL_BASES']
dat['%Q30'] = dat['Q30_BASES'] / dat['TOTAL_BASES']
quality_yield = dat[['Sample', '%Q20', '%Q30']].copy()
quality_yield[['%Q20', '%Q30']] *= 100

dat = pd.read_table(wgs_metrics_file)
wgs_metrics = dat[['Sample', 'MEDIAN_COVERAGE', 'PCT_1X', 'PCT_5X', 'PCT_10X', 'PCT_30X']].copy()
for col in ['PCT_1X', 'PCT_5X', 'PCT_10X', 'PCT_30X']:
    wgs_metrics[col] = wgs_metrics[col] * 100

# post-alignment table: outer-join all per-sample metric tables on 'Sample'
data_frames = [aln_metrics, is_metrics, quality_yield, wgs_metrics]
post_alignment_dat = reduce(lambda left, right: pd.merge(left, right, on=['Sample'], how='outer'), data_frames)

# benchmark: precision and recall from the hap.py MultiQC JSON
with open(hap_file) as hap_json:
    happy = json.load(hap_json)
dat = pd.DataFrame.from_records(happy)
dat = dat.loc[:, dat.columns.str.endswith('ALL')]
dat_transposed = dat.T
benchmark = dat_transposed.loc[:, ['sample_id', 'METRIC.Precision', 'METRIC.Recall']]
benchmark.columns = ['Sample', 'Precision', 'Recall']

# output
post_alignment_dat.to_csv('post_alignment.txt', sep="\t", index=False)
fastqscreen.to_csv('fastqscreen.final.result.txt', sep="\t", index=False)
benchmark.to_csv('benchmark.final.result.txt', sep="\t", index=False)
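
For reference, a minimal self-contained sketch of the reduce-based outer join used above, with made-up sample names and illustrative columns (not the real Sentieon headers): successive pd.merge calls keep every sample that appears in any table and fill missing metrics with NaN.

from functools import reduce
import pandas as pd

# toy per-metric tables; names and values here are hypothetical
aln = pd.DataFrame({'Sample': ['A', 'B'], 'PCT_ALIGNED_READS': [99.1, 98.7]})
ins = pd.DataFrame({'Sample': ['A', 'B'], 'MEDIAN_INSERT_SIZE': [350, 342]})
qy  = pd.DataFrame({'Sample': ['A', 'C'], '%Q30': [91.2, 90.5]})  # 'C' only appears here

# same pattern as post_alignment_dat above: fold the list with outer merges on 'Sample'
merged = reduce(lambda left, right: pd.merge(left, right, on=['Sample'], how='outer'), [aln, ins, qy])
print(merged)  # rows for A, B and C; B and C get NaN where a metric is missing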







tasks/extract_tables.wdl (+8 −4)

 task extract_tables {
+    File quality_yield_summary
+    File wgs_metrics_summary
+    File aln_metrics_summary
+    File is_metrics_summary
     File fastqc
     File fastqscreen
     File hap
-    File aln
-    File quality_yield
-    File wgs_metrics

     String docker
     String cluster_config
     String disk_size

     command <<<
-        python /opt/extract_multiqc.py -fastqc_qualimap ${fastqc} -fastqscreen ${fastqscreen} -hap ${hap}
+        python /opt/extract_multiqc.py -quality ${quality_yield_summary} -depth ${wgs_metrics_summary} -aln ${aln_metrics_summary} -is ${is_metrics_summary} -fastqc ${fastqc} -fastqscreen ${fastqscreen} -hap ${hap}
     >>>

     runtime {

tasks/mergeSentieon.wdl (+0 −43, file deleted)

task mergeSentieon {
    Array[File] aln_metrics_header
    Array[File] aln_metrics_data

    Array[File] is_metrics_header
    Array[File] is_metrics_data

    Array[File] quality_yield_header
    Array[File] quality_yield_data

    Array[File] wgs_metrics_header
    Array[File] wgs_metrics_data

    String docker
    String cluster_config
    String disk_size

    command <<<
        set -o pipefail
        set -e
        echo '''Sample''' > sample_column
        cat ${sep=" " aln_metrics_header} | sed -n '1,1p' | cat - ${sep=" " aln_metrics_data} > aln_metrics
        ls ${sep=" " aln_metrics_data} | cut -d '.' -f1 | cat sample_column - | paste - aln_metrics > aln_metrics.txt
        cat ${sep=" " is_metrics_header} | sed -n '1,1p' | cat - ${sep=" " is_metrics_data} > is_metrics.txt
        cat ${sep=" " quality_yield_header} | sed -n '1,1p' | cat - ${sep=" " quality_yield_data} > quality_yield_data.txt
        cat ${sep=" " wgs_metrics_header} | sed -n '1,1p' | cat - ${sep=" " wgs_metrics_data} > wgs_metrics_data.txt
    >>>

    runtime {
        docker: docker
        cluster: cluster_config
        systemDisk: "cloud_ssd 40"
        dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
    }

    output {
        File aln_metrics_merge = "aln_metrics.txt"
        File is_metrics_merge = "is_metrics.txt"
        File quality_yield_merge = "quality_yield_data.txt"
        File wgs_metrics_merge = "wgs_metrics_data.txt"
    }
}

tasks/merge_family.wdl (+1 −1)

task merge_family {
    Array[File] splited_vcf
    String project
    String docker
tasks/merge_sentieon_metrics.wdl (+48 −0, new file)

task merge_sentieon_metrics {
    Array[File] quality_yield_header
    Array[File] wgs_metrics_algo_header
    Array[File] aln_metrics_header
    Array[File] is_metrics_header

    Array[File] quality_yield_data
    Array[File] wgs_metrics_algo_data
    Array[File] aln_metrics_data
    Array[File] is_metrics_data

    String project
    String docker
    String cluster_config
    String disk_size

    command <<<
        echo 'Sample' > sample_column

        cat ${sep=" " quality_yield_header} | sed -n '1,1p' | cat - ${sep=" " quality_yield_data} > quality_yield_all
        ls ${sep=" " quality_yield_data} | cut -d '.' -f1 | cat sample_column - | paste - quality_yield_all > ${project}.quality_yield.txt

        cat ${sep=" " wgs_metrics_algo_header} | sed -n '1,1p' | cat - ${sep=" " wgs_metrics_algo_data} > wgs_metrics_all
        ls ${sep=" " wgs_metrics_algo_data} | cut -d '.' -f1 | cat sample_column - | paste - wgs_metrics_all > ${project}.wgs_metrics_data.txt

        cat ${sep=" " aln_metrics_header} | sed -n '1,1p' | cat - ${sep=" " aln_metrics_data} > aln_metrics_all
        ls ${sep=" " aln_metrics_data} | cut -d '.' -f1 | cat sample_column - | paste - aln_metrics_all > ${project}.aln_metrics.txt

        cat ${sep=" " is_metrics_header} | sed -n '1,1p' | cat - ${sep=" " is_metrics_data} > is_metrics_all
        ls ${sep=" " is_metrics_data} | cut -d '.' -f1 | cat sample_column - | paste - is_metrics_all > ${project}.is_metrics.txt
    >>>

    runtime {
        docker: docker
        cluster: cluster_config
        systemDisk: "cloud_ssd 40"
        dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
    }

    output {
        File quality_yield_summary = "${project}.quality_yield.txt"
        File wgs_metrics_summary = "${project}.wgs_metrics_data.txt"
        File aln_metrics_summary = "${project}.aln_metrics.txt"
        File is_metrics_summary = "${project}.is_metrics.txt"
    }
}
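
The command block rebuilds one cross-sample table per metric: the header row is taken once from the per-sample .header files, the single-row data files are concatenated underneath, and a Sample column (the data file name up to the first dot, via cut -d '.' -f1) is pasted on the left. A rough Python equivalent of one such merge, with hypothetical file lists, would be:

from pathlib import Path

def merge_metric(header_files, data_files, out_path):
    # all samples share the same header line, so take it from the first header file
    header = Path(header_files[0]).read_text().splitlines()[0]
    rows = ['Sample\t' + header]
    for path in data_files:
        sample = Path(path).name.split('.')[0]  # mirrors `cut -d '.' -f1` on the file name
        data = Path(path).read_text().splitlines()[0]  # each data file holds a single row
        rows.append(sample + '\t' + data)
    Path(out_path).write_text('\n'.join(rows) + '\n')

# hypothetical inputs: one shared header file, one single-row data file per sample
# merge_metric(['A.quality_yield.header'], ['A.quality_yield', 'B.quality_yield'],
#              'project.quality_yield.txt')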

tasks/sentieon.wdl (+21 −14)

 task sentieon {
+    File quality_yield
+    File wgs_metrics_algo
     File aln_metrics
     File is_metrics
-    File wgsmetrics
-    File quality_yield
+    String sample
     String docker
     String cluster_config
     String disk_size
     command <<<
         set -o pipefail
         set -e

+        cat ${quality_yield} | sed -n '2,2p' > quality_yield.header
+        cat ${quality_yield} | sed -n '3,3p' > ${sample}.quality_yield
+
+        cat ${wgs_metrics_algo} | sed -n '2,2p' > wgs_metrics_algo.header
+        cat ${wgs_metrics_algo} | sed -n '3,3p' > ${sample}.wgs_metrics_algo
+
         cat ${aln_metrics} | sed -n '2,2p' > aln_metrics.header
-        cat ${aln_metrics} | sed -n '5,5p' > ${sample_name}.aln_metrics
-        cat ${dedup_metrics} | sed -n '2,2p' > dedup_metrics.header
-        cat ${dedup_metrics} | sed -n '3,3p' > ${sample_name}.dedup_metrics
+        cat ${aln_metrics} | sed -n '5,5p' > ${sample}.aln_metrics

         cat ${is_metrics} | sed -n '2,2p' > is_metrics.header
-        cat ${is_metrics} | sed -n '3,3p' > ${sample_name}.is_metrics
-        cat ${deduped_coverage} | sed -n '1,1p' > deduped_coverage.header
-        cat ${deduped_coverage} | sed -n '2,2p' > ${sample_name}.deduped_coverage
+        cat ${is_metrics} | sed -n '3,3p' > ${sample}.is_metrics
     >>>

     runtime {
     }

     output {
+        File quality_yield_header = "quality_yield.header"
+        File quality_yield_data = "${sample}.quality_yield"
+        File wgs_metrics_algo_header = "wgs_metrics_algo.header"
+        File wgs_metrics_algo_data = "${sample}.wgs_metrics_algo"
         File aln_metrics_header = "aln_metrics.header"
-        File aln_metrics_data = "${sample_name}.aln_metrics"
-        File dedup_metrics_header = "dedup_metrics.header"
-        File dedup_metrics_data = "${sample_name}.dedup_metrics"
+        File aln_metrics_data = "${sample}.aln_metrics"
         File is_metrics_header = "is_metrics.header"
-        File is_metrics_data = "${sample_name}.is_metrics"
-        File deduped_coverage_header = "deduped_coverage.header"
-        File deduped_coverage_data = "${sample_name}.deduped_coverage"
+        File is_metrics_data = "${sample}.is_metrics"
     }
 }
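
The sed calls slice fixed 1-based line numbers out of the Sentieon metric reports: the column header sits on line 2 and the summary values on line 3, except for the alignment metrics, where the task takes line 5 instead (presumably the combined PAIR category row of the Picard-style AlignmentStat output; the commit itself does not say). A tiny Python sketch of the same extraction, with a hypothetical path:

def extract_metric_rows(path, header_line=2, data_line=3):
    # sed -n 'N,Np' is 1-based, so index with N-1
    lines = open(path).read().splitlines()
    return lines[header_line - 1], lines[data_line - 1]

# hypothetical file name; for the alignment metrics the data row is on line 5
# header, data = extract_metric_rows('sample_deduped_aln_metrics.txt', data_line=5)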

workflow.wdl (+37 −5)

 import "./tasks/mapping.wdl" as mapping
 import "./tasks/Dedup.wdl" as Dedup
 import "./tasks/deduped_Metrics.wdl" as deduped_Metrics
+import "./tasks/sentieon.wdl" as sentieon
 import "./tasks/Realigner.wdl" as Realigner
 import "./tasks/BQSR.wdl" as BQSR
 import "./tasks/Haplotyper_gVCF.wdl" as Haplotyper_gVCF
 import "./tasks/split_gvcf_files.wdl" as split_gvcf_files
 import "./tasks/benchmark.wdl" as benchmark
 import "./tasks/multiqc.wdl" as multiqc
+import "./tasks/merge_sentieon_metrics.wdl" as merge_sentieon_metrics
 import "./tasks/mendelian.wdl" as mendelian
 import "./tasks/merge_mendelian.wdl" as merge_mendelian
 import "./tasks/quartet_mendelian.wdl" as quartet_mendelian

             disk_size=disk_size,
             cluster_config=BIGcluster_config
     }

+    call sentieon.sentieon as sentieon {
+        input:
+            quality_yield=deduped_Metrics.deduped_QualityYield,
+            wgs_metrics_algo=deduped_Metrics.deduped_wgsmetrics,
+            aln_metrics=deduped_Metrics.dedeuped_aln_metrics,
+            is_metrics=deduped_Metrics.deduped_is_metrics,
+            sample=quartet[2],
+            docker=SENTIEONdocker,
+            cluster_config=SMALLcluster_config,
+            disk_size=disk_size
+    }
+
     call Realigner.Realigner as Realigner {
         input:
             SENTIEON_INSTALL_DIR=SENTIEON_INSTALL_DIR,
             disk_size=disk_size
     }

+    call merge_sentieon_metrics.merge_sentieon_metrics as merge_sentieon_metrics {
+        input:
+            quality_yield_header=sentieon.quality_yield_header,
+            wgs_metrics_algo_header=sentieon.wgs_metrics_algo_header,
+            aln_metrics_header=sentieon.aln_metrics_header,
+            is_metrics_header=sentieon.is_metrics_header,
+            quality_yield_data=sentieon.quality_yield_data,
+            wgs_metrics_algo_data=sentieon.wgs_metrics_algo_data,
+            aln_metrics_data=sentieon.aln_metrics_data,
+            is_metrics_data=sentieon.is_metrics_data,
+            project=project,
+            docker=MULTIQCdocker,
+            cluster_config=SMALLcluster_config,
+            disk_size=disk_size
+    }
+
     call extract_tables.extract_tables as extract_tables {
         input:
+            quality_yield_summary=merge_sentieon_metrics.quality_yield_summary,
+            wgs_metrics_summary=merge_sentieon_metrics.wgs_metrics_summary,
+            aln_metrics_summary=merge_sentieon_metrics.aln_metrics_summary,
+            is_metrics_summary=merge_sentieon_metrics.is_metrics_summary,
             fastqc=multiqc.fastqc,
             fastqscreen=multiqc.fastqscreen,
             hap=multiqc.hap,
-            aln=deduped_Metrics.dedeuped_aln_metrics,
-            quality_yield=deduped_Metrics.deduped_QualityYield,
-            wgs_metrics=deduped_Metrics.deduped_wgsmetrics,
-            docker=DIYdocker,
+            docker=MULTIQCdocker,
             cluster_config=SMALLcluster_config,
             disk_size=disk_size
     }

     Boolean sister_tag = read_boolean(split_gvcf_files.sister_tag)
     Boolean quartet_tag = read_boolean(split_gvcf_files.quartet_tag)
