LUYAO REN 6 years ago
Commit d993f7aaf5
10 changed files: 424 additions and 0 deletions
  1. +69 -0 README.md
  2. +20 -0 inputs
  3. BIN tasks/.DS_Store
  4. +44 -0 tasks/Dedup.wdl
  5. +61 -0 tasks/Metrics.wdl
  6. +27 -0 tasks/SamToBam.wdl
  7. +49 -0 tasks/deduped_Metrics.wdl
  8. +23 -0 tasks/indexBam.wdl
  9. +28 -0 tasks/novoalign.wdl
  10. +103 -0 workflow.wdl

+69 -0  README.md

@@ -0,0 +1,69 @@
# Novoalign

[Novoalign](<http://www.novocraft.com/>) is commercial read-mapping software: there is no accompanying published paper, and a license is required to run it.

For Read1, Novoalign uses a seeded alignment process to find alignment locations, each with a Read1 alignment score. For each good location found, Novoalign does a [Needleman-Wunsch alignment](<https://zh.wikipedia.org/zh-hans/%E5%B0%BC%E5%BE%B7%E6%9B%BC-%E7%BF%81%E6%96%BD%E7%AE%97%E6%B3%95>) of the second read against a region starting from the Read1 alignment and extending 6 standard deviations beyond the mean fragment length. The best alignment for Read2 defines the pair score for Read1/Read2. All the alignments are added to a collection for Read1.

This process is repeated using a Read2 seeded alignment followed by a Needleman-Wunsch alignment of Read1, creating a collection of Read2/Read1 pairs. There are very likely duplicates between the two collections.

Novoalign then decides whether there is a "proper pair". To do this, a structural variation penalty is used as follows.

Novoalign has a proper pair if the score of the best pair (Read1/Read2 or Read2/Read1 combined score including fragment length penalty) is less than the structural variation penalty (default 70) plus best single-end Read1 score plus best single-end Read2 score.
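
The rule can be written out as a small sketch (the scores below are made-up example values, not Novoalign output):

```bash
# Illustration of the proper-pair decision; all values are hypothetical
best_pair_score=120   # best Read1/Read2 or Read2/Read1 combined score (incl. fragment length penalty)
best_se_read1=40      # best single-end Read1 score
best_se_read2=35      # best single-end Read2 score
sv_penalty=70         # structural variation penalty (default 70)

if [ "$best_pair_score" -lt $((best_se_read1 + best_se_read2 + sv_penalty)) ]; then
    echo "proper pair: report the paired alignments"
else
    echo "no proper pair: report each read in single-end mode"
fi
```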

If Novoalign has a proper pair, the Read1/Read2 and Read2/Read1 lists are combined, removing duplicates and sorting by alignment score. At this point Novoalign has a list of one or more proper pair alignments. This list is passed to reporting, which can report one or more alignments depending on the options.

If there wasn't a proper pair, Novoalign reports alignments for each read in single-end mode, and the reporting options decide whether one or more alignments are reported.

The result of the paired search can be two paired alignments where the pairing is more probable than a structural variation, or it can be two individual alignments, one to each read of the pair.

Given the threshold, gap penalties and reads, it is quite possible for Novoalign to find alignments with gaps at both ends of the reads. There are no design restrictions that prevent this type of result; it depends only on the scoring parameters and threshold. (Cited from the Novoalign manual.)

##### 1. Index reference sequences

```bash
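# Usage: novoindex <index file to create> <reference FASTA...>; here "GRCh38" is the index file that will be written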
novoindex GRCh38 GRCh38.d1.vd1.fa
```

##### 2. Novoalign

```bash
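# $nt is the number of alignment threads and ${sample} the sample name; both are placeholders filled in by the WDL task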
novoalign -d <reference.ndx> -f <read1.fastq.gz> <read2.fastq.gz> -o SAM -c $nt > ${sample}.novoalign.sam
```

##### 3. Convert SAM to BAM and index the BAM

```bash
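# AddOrReplaceReadGroups attaches read-group metadata and, with SO=coordinate, writes a coordinate-sorted BAM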
java -jar picard.jar AddOrReplaceReadGroups I=${sample}.novoalign.sam O=rg_added_sorted.bam SO=coordinate RGID=id RGLB=library RGPL=platform RGPU=machine RGSM=sample

samtools index rg_added_sorted.bam
```

##### 4. Mark duplicates

```bash
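# LocusCollector computes duplicate score information; Dedup (--rmdup) then removes duplicates using that score file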
sentieon driver -t NUMBER_THREADS -i SORTED_BAM \
--algo LocusCollector --fun score_info SCORE.gz
sentieon driver -t NUMBER_THREADS -i SORTED_BAM \
--algo Dedup --rmdup --score_info SCORE.gz \
--metrics DEDUP_METRIC_TXT DEDUPED_BAM
```

##### NIST's settings

```bash
novoalign -d <reference.ndx> -f <read1.fastq.gz> <read2.fastq.gz>
-F STDFQ --Q2Off -t 400 -o SAM -c 10
```

Parameters explanation

`-F` Specifies the format of the read file. Normally Novoalign can detect the format of read files and this option is not required. `STDFQ` means FASTQ format with Sanger coding of quality values, i.e. the quality character is -10log10(Perr) + '!' (ASCII 33).
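
As a quick worked example (not part of the pipeline), a base-call error probability of 0.001 corresponds to Q = 30 and the character '?':

```bash
# Q = -10*log10(Perr), rounded to an integer; the FASTQ character is ASCII(Q + 33), '!' being ASCII 33
awk 'BEGIN { perr = 0.001; q = int(-10 * log(perr) / log(10) + 0.5); printf "Q=%d char=%c\n", q, q + 33 }'
# prints: Q=30 char=?
```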

`--Q2Off` Disables treating Q=2 bases as the Illumina "Read Segment Quality Control Indicator". With Q2 off, Q=2 bases are treated as normal bases with a quality of 2; they are included in quality calibration and may be recalibrated to higher qualities.

`-t` Sets the absolute threshold, i.e. the highest alignment score acceptable for the best alignment.

`-o` Specifies the report format.

`-c` Sets the number of threads to be used. On licensed versions it defaults to the number of CPUs as reported by sysinfo(). On the free version this option is disabled.
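
The WDL files added in this commit wire these steps together. A minimal sketch of how the workflow might be launched, assuming a local Cromwell jar and that the `{{ }}` placeholders in `inputs` have been rendered to a plain JSON file:

```bash
# Hypothetical invocation; adjust paths to your Cromwell jar and rendered inputs file.
java -jar cromwell.jar run workflow.wdl --inputs inputs.json
```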

+20 -0  inputs

@@ -0,0 +1,20 @@
{
"{{ project_name }}.SENTIEON_INSTALL_DIR": "/opt/sentieon-genomics",
"{{ project_name }}.fasta": "GRCh38.d1.vd1.fa",
"{{ project_name }}.PIdocker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/picard:2.20.2",
"{{ project_name }}.platform": "{{ platform }}",
"{{ project_name }}.NVref_dir": "oss://chinese-quartet/quartet-storage-data/reference_data/novoalign/",
"{{ project_name }}.disk_size": "500",
"{{ project_name }}.fastq_1": "{{ fastq_1 }}",
"{{ project_name }}.NVdocker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/novocraft:V3.09.02",
"{{ project_name }}.machine": "{{ machine }}",
"{{ project_name }}.STdocker": "registry.cn-shanghai.aliyuncs.com/pgx-docker-registry/sentieon-genomics:v2018.08.01",
"{{ project_name }}.library": "{{ library }}",
"{{ project_name }}.index": "GRCh38_novoalign",
"{{ project_name }}.fastq_2": "{{ fastq_2 }}",
"{{ project_name }}.cluster_config": "OnDemand ecs.sn1ne.8xlarge img-ubuntu-vpc",
"{{ project_name }}.SAMdocker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/samtools:v1.3.1",
"{{ project_name }}.STref_dir": "oss://chinese-quartet/quartet-storage-data/reference_data/",
"{{ project_name }}.sample": "{{ sample }}",
"{{ project_name }}.id": "{{ id }}"
}

BIN  tasks/.DS_Store


+44 -0  tasks/Dedup.wdl

@@ -0,0 +1,44 @@
task Dedup {

String SENTIEON_INSTALL_DIR
String sample

File sorted_bam
File sorted_bam_index
String STdocker
String cluster_config
String disk_size


command <<<
set -o pipefail
set -e
export SENTIEON_LICENSE=192.168.0.55:8990
sentieon_bam=`basename ${sorted_bam}`
sentieon_bam_index=`basename ${sorted_bam_index}`
cp ${sorted_bam} .
cp ${sorted_bam_index} .
nt=$(nproc)
${SENTIEON_INSTALL_DIR}/bin/sentieon driver -t $nt -i $sentieon_bam --algo LocusCollector --fun score_info ${sample}_score.txt
${SENTIEON_INSTALL_DIR}/bin/sentieon driver -t $nt -i $sentieon_bam --algo Dedup --rmdup --score_info ${sample}_score.txt --metrics ${sample}_dedup_metrics.txt ${sample}.sorted.deduped.bam
>>>
runtime {
docker:STdocker
cluster: cluster_config
systemDisk: "cloud_ssd 40"
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
}

output {
File score = "${sample}_score.txt"
File dedup_metrics = "${sample}_dedup_metrics.txt"
File Dedup_bam = "${sample}.sorted.deduped.bam"
File Dedup_bam_index = "${sample}.sorted.deduped.bam.bai"
}
}







+61 -0  tasks/Metrics.wdl

@@ -0,0 +1,61 @@
task Metrics {


File STref_dir
String SENTIEON_INSTALL_DIR
String sample
String STdocker
String cluster_config

String fasta
File sorted_bam
File sorted_bam_index
String disk_size


command <<<
set -o pipefail
set -e
export SENTIEON_LICENSE=192.168.0.55:8990
nt=$(nproc)
sentieon_bam=`basename ${sorted_bam}`
sentieon_bam_index=`basename ${sorted_bam_index}`
cp ${sorted_bam} .
cp ${sorted_bam_index} .
${SENTIEON_INSTALL_DIR}/bin/sentieon driver -r ${STref_dir}/${fasta} -t $nt -i $sentieon_bam \
--algo GCBias --summary ${sample}_gc_summary.txt ${sample}_gc_metrics.txt \
--algo MeanQualityByCycle ${sample}_mq_metrics.txt \
--algo QualDistribution ${sample}_qd_metrics.txt \
--algo InsertSizeMetricAlgo ${sample}_is_metrics.txt \
--algo AlignmentStat ${sample}_aln_metrics.txt \
--algo CoverageMetrics --omit_base_output ${sample}_coverage_metrics

>>>
runtime {
docker:STdocker
cluster: cluster_config
systemDisk: "cloud_ssd 40"
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
}
output {
File qd_metrics = "${sample}_qd_metrics.txt"
File mq_metrics = "${sample}_mq_metrics.txt"
File is_metrics = "${sample}_is_metrics.txt"
File gc_summary = "${sample}_gc_summary.txt"
File gc_metrics = "${sample}_gc_metrics.txt"
File aln_metrics = "${sample}_aln_metrics.txt"
File coverage_metrics_sample_summary = "${sample}_coverage_metrics.sample_summary"
File coverage_metrics_sample_statistics = "${sample}_coverage_metrics.sample_statistics"
File coverage_metrics_sample_interval_statistics = "${sample}_coverage_metrics.sample_interval_statistics"
File coverage_metrics_sample_cumulative_coverage_proportions = "${sample}_coverage_metrics.sample_cumulative_coverage_proportions"
File coverage_metrics_sample_cumulative_coverage_counts = "${sample}_coverage_metrics.sample_cumulative_coverage_counts"
}

}






+27 -0  tasks/SamToBam.wdl

@@ -0,0 +1,27 @@
task SamToBam {
File aligned_sam
String PIdocker
String sample
String cluster_config
String disk_size
String id
String library
String platform
String machine

command <<<
set -o pipefail
set -e
java -jar /usr/bin/picard/picard.jar AddOrReplaceReadGroups I=${aligned_sam} O=${sample}.bam SO=coordinate RGID=${id} RGLB=${library} RGPL=${platform} RGPU=${machine} RGSM=${sample}
>>>

runtime {
docker:PIdocker
cluster: cluster_config
systemDisk: "cloud_ssd 40"
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
}
output {
File sorted_bam = "${sample}.bam"
}
}

+49 -0  tasks/deduped_Metrics.wdl

@@ -0,0 +1,49 @@
task deduped_Metrics {

File STref_dir
String SENTIEON_INSTALL_DIR
String sample
String fasta
File Dedup_bam
File Dedup_bam_index
String STdocker
String cluster_config
String disk_size


command <<<
set -o pipefail
set -e
export SENTIEON_LICENSE=192.168.0.55:8990
nt=$(nproc)
${SENTIEON_INSTALL_DIR}/bin/sentieon driver -r ${STref_dir}/${fasta} -t $nt -i ${Dedup_bam} \
--algo CoverageMetrics --omit_base_output ${sample}_deduped_coverage_metrics \
--algo MeanQualityByCycle ${sample}_deduped_mq_metrics.txt \
--algo QualDistribution ${sample}_deduped_qd_metrics.txt \
--algo GCBias --summary ${sample}_deduped_gc_summary.txt ${sample}_deduped_gc_metrics.txt \
--algo AlignmentStat ${sample}_deduped_aln_metrics.txt \
--algo InsertSizeMetricAlgo ${sample}_deduped_is_metrics.txt
>>>

runtime {
docker:STdocker
cluster: cluster_config
systemDisk: "cloud_ssd 40"
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
}

output {
File deduped_qd_metrics = "${sample}_deduped_qd_metrics.txt"
File deduped_mq_metrics = "${sample}_deduped_mq_metrics.txt"
File deduped_is_metrics = "${sample}_deduped_is_metrics.txt"
File deduped_gc_summary = "${sample}_deduped_gc_summary.txt"
File deduped_gc_metrics = "${sample}_deduped_gc_metrics.txt"
File deduped_aln_metrics = "${sample}_deduped_aln_metrics.txt"
File deduped_coverage_metrics_sample_summary = "${sample}_deduped_coverage_metrics.sample_summary"
File deduped_coverage_metrics_sample_statistics = "${sample}_deduped_coverage_metrics.sample_statistics"
File deduped_coverage_metrics_sample_interval_statistics = "${sample}_deduped_coverage_metrics.sample_interval_statistics"
File deduped_coverage_metrics_sample_cumulative_coverage_proportions = "${sample}_deduped_coverage_metrics.sample_cumulative_coverage_proportions"
File deduped_coverage_metrics_sample_cumulative_coverage_counts = "${sample}_deduped_coverage_metrics.sample_cumulative_coverage_counts"
}
}

+23 -0  tasks/indexBam.wdl

@@ -0,0 +1,23 @@
task indexBam {
File sorted_bam
String sample
String SAMdocker
String cluster_config
String disk_size

command <<<
set -o pipefail
set -e
/opt/conda/bin/samtools index ${sorted_bam} ${sample}.bam.bai
>>>

runtime {
docker:SAMdocker
cluster: cluster_config
systemDisk: "cloud_ssd 40"
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
}
output {
File sorted_bam_index = "${sample}.bam.bai"
}
}

+28 -0  tasks/novoalign.wdl

@@ -0,0 +1,28 @@
task novoalign {

File NVref_dir
String index
File fastq_1
File fastq_2
String NVdocker
String sample
String cluster_config
String disk_size

command <<<
set -o pipefail
set -e
nt=$(nproc)
novoalign -d ${NVref_dir}/${index} -f ${fastq_1} ${fastq_2} -o SAM -c $nt > ${sample}.novoalign.sam
>>>

runtime {
docker:NVdocker
cluster: cluster_config
systemDisk: "cloud_ssd 40"
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
}
output {
File aligned_sam = "${sample}.novoalign.sam"
}
}

+103 -0  workflow.wdl

@@ -0,0 +1,103 @@
import "./tasks/novoalign.wdl" as novoalign
import "./tasks/SamToBam.wdl" as SamToBam
import "./tasks/indexBam.wdl" as indexBam
import "./tasks/Metrics.wdl" as Metrics
import "./tasks/Dedup.wdl" as Dedup
import "./tasks/deduped_Metrics.wdl" as deduped_Metrics

workflow {{ project_name }} {

File fastq_1
File fastq_2
File NVref_dir
File STref_dir

String index
String SENTIEON_INSTALL_DIR
String sample
String STdocker
String SAMdocker
String NVdocker
String PIdocker
String fasta
String disk_size
String cluster_config

String id
String library
String platform
String machine

call novoalign.novoalign as novoalign {
input:
NVref_dir=NVref_dir,
index=index,
fastq_1=fastq_1,
fastq_2=fastq_2,
NVdocker=NVdocker,
sample=sample,
cluster_config=cluster_config,
disk_size=disk_size
}

call SamToBam.SamToBam as SamToBam {
input:
aligned_sam=novoalign.aligned_sam,
sample=sample,
id=id,
library=library,
platform=platform,
machine=machine,
PIdocker=PIdocker,
disk_size=disk_size,
cluster_config=cluster_config
}

call indexBam.indexBam as indexBam {
input:
sample=sample,
sorted_bam=SamToBam.sorted_bam,
SAMdocker=SAMdocker,
disk_size=disk_size,
cluster_config=cluster_config
}

call Metrics.Metrics as Metrics {
input:
SENTIEON_INSTALL_DIR=SENTIEON_INSTALL_DIR,
fasta=fasta,
STref_dir=STref_dir,
sorted_bam=SamToBam.sorted_bam,
sorted_bam_index=indexBam.sorted_bam_index,
sample=sample,
STdocker=STdocker,
disk_size=disk_size,
cluster_config=cluster_config
}

call Dedup.Dedup as Dedup {
input:
SENTIEON_INSTALL_DIR=SENTIEON_INSTALL_DIR,
sorted_bam=SamToBam.sorted_bam,
sorted_bam_index=indexBam.sorted_bam_index,
sample=sample,
STdocker=STdocker,
disk_size=disk_size,
cluster_config=cluster_config
}

call deduped_Metrics.deduped_Metrics as deduped_Metrics {
input:
SENTIEON_INSTALL_DIR=SENTIEON_INSTALL_DIR,
fasta=fasta,
STref_dir=STref_dir,
Dedup_bam=Dedup.Dedup_bam,
Dedup_bam_index=Dedup.Dedup_bam_index,
sample=sample,
STdocker=STdocker,
disk_size=disk_size,
cluster_config=cluster_config
}

}

