@@ -0,0 +1,67 @@ | |||
### APP简介 | |||
本app用于从FASTQ文件中随机抽取N条read,生成down-sample后的FASTQ文件。 | |||
app仅接受以***.fastq.gz***或***.fq.gz***为结尾的文件,输出文件为GZIP压缩后的FASTQ文件(后缀名为.fastq.gz) | |||
> 注意: | |||
> | |||
> 本app使用***seqkit sample***进行downsample。根据官方文档说明,downsample后FASTQ文件中的read数不一定严格等于down_to指定的目标值。例如:down_to=1000时,最终得到的FASTQ文件中可能仅有991条read。
> | |||
> 对此不能接受者,请不要使用本app。 | |||
### 安装指南 | |||
``` | |||
# 激活choppy环境 | |||
source activate choppy-pipe-0.3.8.dev0 | |||
# 安装 app | |||
choppy install chenziyin/downsample | |||
``` | |||
### 快速使用 | |||
1. 准备样本描述文件:samples.csv | |||
需要包括3列: | |||
- sample_id:每个样本唯一的ID | |||
- fastq:原始fastq地址 | |||
- down_to:downsample后目标read数 | |||
样表如下: | |||
| sample_id | fastq | down_to |
| --------- | ------------------------------------------------------------ | ------- |
| Test1 | oss://choppy-app-example-data/miRNAseq/test_10k_NEXTflex.fastq.gz | 1000 |
2. 批量提交任务 | |||
```bash | |||
choppy batch chenziyin/downsample-latest <SAMPLES_CSV> --project-name <PROJECT_NAME> | |||
``` | |||
### 使用的软件及版本: | |||
- Seqkit: 0.12.0 | |||
@@ -0,0 +1,4 @@ | |||
{ | |||
"cluster_config": "OnDemand ecs.sn1.medium img-ubuntu-vpc", | |||
"disk_size": "200" | |||
} |
@@ -0,0 +1,9 @@ | |||
{ | |||
"{{ project_name }}.sample_id": "{{ sample_id }}", | |||
"{{ project_name }}.in_fastq": "{{ fastq }}", | |||
"{{ project_name }}.N": "{{ down_to }}", | |||
"{{ project_name }}.docker_seqkit": "registry.cn-shanghai.aliyuncs.com/pgx-docker-registry/seqkit:0.12.0", | |||
"{{ project_name }}.cluster_config": "{{ cluster_config }}", | |||
"{{ project_name }}.disk_size": "{{ disk_size }}" | |||
} | |||
@@ -0,0 +1,26 @@ | |||
# Randomly down-samples a gzipped FASTQ file to approximately N reads using
# `seqkit sample`.
#
# Inputs:
#   in_fastq       - input FASTQ; its name is expected to end in .fastq.gz or .fq.gz
#   N              - target read count, passed to `seqkit sample -n`
#   docker         - container image used to run the command
#   cluster_config - value forwarded to the `cluster` runtime attribute
#   disk_size      - data disk size, spliced into the `dataDisk` runtime attribute
#
# Output:
#   out_fastq - "<input basename>_downTo<N>.fastq.gz" in the task working directory
task DownSample {
    File in_fastq
    Int N
    String docker
    String cluster_config
    String disk_size

    # Single source of truth for the output file name. It is used by both the
    # command and the output section, so the file seqkit writes is always the
    # file Cromwell collects. The pattern is anchored and the dots are escaped,
    # so only a trailing ".fastq.gz" / ".fq.gz" suffix is rewritten.
    String out_name = sub(basename(in_fastq), "\\.(fastq|fq)\\.gz$", "_downTo${N}.fastq.gz")

    command <<<
        set -o pipefail
        set -e
        # Paths are quoted so that names containing spaces are handled correctly.
        seqkit sample -n ${N} "${in_fastq}" -o "${out_name}"
        echo "${out_name}"
    >>>

    runtime {
        docker: docker
        cluster: cluster_config
        systemDisk: "cloud_ssd 40"
        dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
    }

    output {
        File out_fastq = out_name
    }
}
@@ -0,0 +1,17 @@ | |||
import "./tasks/DownSample.wdl" as DownSample | |||
# Workflow entry point; its name is filled in from the `project_name` template
# variable. It down-samples one FASTQ file by delegating to the DownSample task.
workflow {{ project_name }} {
    # Sample identifier; declared as a workflow input but not passed to any call here.
    String sample_id
    # FASTQ file to down-sample.
    File in_fastq
    # Target number of reads.
    Int N
    # Container image handed to the task as its `docker` input.
    String docker_seqkit
    # Runtime settings forwarded unchanged to the task.
    String cluster_config
    String disk_size

    call DownSample.DownSample as DownSample {
        input:
            in_fastq = in_fastq,
            N = N,
            docker = docker_seqkit,
            cluster_config = cluster_config,
            disk_size = disk_size
    }
}