преди 3 години · 3884c9e15f
--- a/README.md
+++ b/README.md
@@ -0,0 +1,69 @@
 # fastqc-fastqscreen

 > Author：Liuruimei
 >
 > E-mail： 20110700157@fudan.edu.cn
 >

 ## 安装指南

 ```
 # 激活choppy环境
 open-choppy-env
 # 安装app
 choppy install liuruimei/fastqc-fastqscreen
 ```

 ## App
 This is for basic QC including fastqc and fastqscreen.

 ## 流程与参数
 ###  原始数据质量控制

 #### [Fastqc](<https://www.bioinformatics.babraham.ac.uk/projects/fastqc/>) v0.11.8

 FastQC是一个常用的测序原始数据的质控软件，主要包括12个模块，具体请参考[Fastqc模块详情](<https://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/3%20Analysis%20Modules/>)。

 ```bash
 fastqc -t <threads> -o <output_directory> <fastq_file>
 ```

 #### [Fastq Screen](<https://www.bioinformatics.babraham.ac.uk/projects/fastq_screen/>) 0.12.0

 Fastq Screen是检测测序原始数据中是否引入其他物种，或是接头引物等污染，比如，如果测序样本
 是人类，我们期望99%以上的reads匹配到人类基因组，10%左右的reads匹配到与人类基因组同源性
 较高的小鼠上。如果有过多的reads匹配到Ecoli或者Yeast，要考虑是否在培养细胞的时候细胞系被污
 染，或者建库时文库被污染。

 ```bash
 fastq_screen --aligner <aligner> --conf <config_file> --top <number_of_reads> --threads <threads> <fastq_file>
 ```

 `--conf` config 文件主要输入了多个物种的fasta文件地址，可根据自己的需求下载其他物种的fasta文件加入分析。

 `--top` 一般不需要对整个fastq文件进行检索，只取前100000条reads进行比对即可。

 ## 结果展示与解读

 原始数据质量控制主要通过考察测序数据的基本特征判断数据质量的好坏，比如数据量是否达到要求、reads的重复率是否过多、碱基质量、ATGC四种碱基的分布、GC含量、接头序列含量以及是否有其他物种的污染等等。

 FastQC和Fastq Screen是两个常用的原始数据质量控制软件。

 总结表格 **pre_alignment.txt**

 | 列名                      | 说明                                 |
 | ------------------------- | ------------------------------------ |
 | Sample                    | 样本名，R1结尾为read1，R2结尾为read2 |
 | %Dup                      | % Duplicate reads                    |
 | %GC                       | Average % GC content                 |
 | Total Sequences (million) | Total sequences                      |
 | %Human                    | 比对到人类基因组的比例               |
 | %EColi                    | 比对到大肠杆菌基因组的比例           |
 | %Adapter                  | 比对到接头序列的比例                 |
 | %Vector                   | 比对到载体基因组的比例               |
 | %rRNA                     | 比对到rRNA序列的比例                 |
 | %Virus                    | 比对到病毒基因组的比例               |
 | %Yeast                    | 比对到酵母基因组的比例               |
 | %Mitoch                   | 比对到线粒体序列的比例               |
 | %No hits                  | 没有比对到以上基因组的比例           |

--- a/defaults
+++ b/defaults
@@ -0,0 +1,15 @@
 {

  "fasta": "GRCh38.d1.vd1.fa",
  "BENCHMARKdocker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-hap:latest",
  "disk_size": "200",
  "FASTQCdocker": "registry.cn-shanghai.aliyuncs.com/pgx-docker-registry/fastqc:0.11.8",
  "SMALLcluster_config": "OnDemand bcs.ps.g.xlarge img-ubuntu-vpc",
  "screen_ref_dir": "oss://pgx-reference-data/fastq_screen_reference/",
  "BIGcluster_config": "OnDemand bcs.a2.7xlarge img-ubuntu-vpc",
  "fastq_screen_conf": "oss://pgx-reference-data/fastq_screen_reference/fastq_screen.conf",
  "FASTQSCREENdocker": "registry.cn-shanghai.aliyuncs.com/pgx-docker-registry/fastqscreen:0.12.0",
  "ref_dir": "oss://pgx-reference-data/GRCh38.d1.vd1/"
 }


--- a/inputs
+++ b/inputs
@@ -0,0 +1,13 @@
 {
  "{{ project_name }}.fasta": "{{ fasta }}",
  "{{ project_name }}.disk_size": "{{ disk_size }}",
  "{{ project_name }}.FASTQCdocker": "{{ FASTQCdocker }}",
  "{{ project_name }}.read2": "{{ read2 }}",
  "{{ project_name }}.sample_name": "{{ sample_name }}",
  "{{ project_name }}.read1": "{{ read1 }}",
  "{{ project_name }}.screen_ref_dir": "{{ screen_ref_dir }}",
  "{{ project_name }}.BIGcluster_config": "{{ BIGcluster_config }}",
  "{{ project_name }}.fastq_screen_conf": "{{ fastq_screen_conf }}",
  "{{ project_name }}.FASTQSCREENdocker": "{{ FASTQSCREENdocker }}",
  "{{ project_name }}.ref_dir": "{{ ref_dir }}"
 }
--- a/tasks/fastqc.wdl
+++ b/tasks/fastqc.wdl
@@ -0,0 +1,28 @@
 # Runs FastQC on both mates of a paired-end sample.
 #
 # Inputs:
 #   read1, read2    FASTQ files; the declared output names assume the file
 #                   name ends in ".fastq.gz" or ".fq.gz".
 #   docker          Container image the command runs in.
 #   cluster_config  Value handed to the `cluster` runtime attribute.
 #   disk_size       Size inserted into the `dataDisk` runtime attribute.
 #
 # Outputs:
 #   The FastQC HTML report and zip archive for each mate, written to the
 #   task's working directory.
 task fastqc {
     File read1
     File read2
     String docker
     String cluster_config
     String disk_size

     command <<<
         set -o pipefail
         set -e
         nt=$(nproc)
         # FastQC's -t option is the number of files processed at the same time,
         # so both mates are passed to one invocation to let them run in parallel
         # (two single-file invocations would each use only one worker).
         fastqc -t $nt -o ./ "${read1}" "${read2}"
     >>>

     runtime {
         docker: docker
         cluster: cluster_config
         systemDisk: "cloud_ssd 40"
         dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
     }

     output {
         # The trailing .fastq.gz / .fq.gz of each input name is replaced with
         # the _fastqc.html / _fastqc.zip suffix.
         File read1_html = sub(basename(read1), "\\.(fastq|fq)\\.gz$", "_fastqc.html")
         File read1_zip = sub(basename(read1), "\\.(fastq|fq)\\.gz$", "_fastqc.zip")
         File read2_html = sub(basename(read2), "\\.(fastq|fq)\\.gz$", "_fastqc.html")
         File read2_zip = sub(basename(read2), "\\.(fastq|fq)\\.gz$", "_fastqc.zip")
     }
 }
--- a/tasks/fastqscreen.wdl
+++ b/tasks/fastqscreen.wdl
@@ -0,0 +1,36 @@
 # Runs FastQ Screen (bowtie2 aligner, first 100000 reads) on both mates of a
 # paired-end sample.
 #
 # Inputs:
 #   read1, read2       FASTQ files; output names assume the file name ends in
 #                      ".fastq.gz" or ".fq.gz".
 #   screen_ref_dir     Reference directory; copied under /cromwell_root/tmp/
 #                      before screening.
 #                      NOTE(review): presumably the conf file points at the
 #                      copied location - confirm against fastq_screen.conf.
 #   fastq_screen_conf  Configuration file passed to --conf.
 #   docker             Container image the command runs in.
 #   cluster_config     Value handed to the `cluster` runtime attribute.
 #   disk_size          Size inserted into the `dataDisk` runtime attribute.
 #
 # Outputs:
 #   The *_screen.png, *_screen.txt and *_screen.html files for each mate.
 task fastq_screen {
     File read1
     File read2
     File screen_ref_dir
     File fastq_screen_conf
     # Strip either ".fastq.gz" or ".fq.gz" so the expected output names stay
     # correct for both spellings, matching the handling in the fastqc task.
     String read1name = sub(basename(read1), "\\.(fastq|fq)\\.gz$", "")
     String read2name = sub(basename(read2), "\\.(fastq|fq)\\.gz$", "")
     String docker
     String cluster_config
     String disk_size

     command <<<
         set -o pipefail
         set -e
         nt=$(nproc)
         mkdir -p /cromwell_root/tmp
         cp -r "${screen_ref_dir}" /cromwell_root/tmp/
         fastq_screen --aligner bowtie2 --conf "${fastq_screen_conf}" --top 100000 --threads $nt "${read1}"
         fastq_screen --aligner bowtie2 --conf "${fastq_screen_conf}" --top 100000 --threads $nt "${read2}"
     >>>

     runtime {
         docker: docker
         cluster: cluster_config
         systemDisk: "cloud_ssd 40"
         dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
     }

     output {
         File png1 = "${read1name}_screen.png"
         File txt1 = "${read1name}_screen.txt"
         File html1 = "${read1name}_screen.html"
         File png2 = "${read2name}_screen.png"
         File txt2 = "${read2name}_screen.txt"
         File html2 = "${read2name}_screen.html"
     }
 }
--- a/workflow.wdl
+++ b/workflow.wdl
@@ -0,0 +1,44 @@
 import "./tasks/fastqc.wdl" as fastqc
 import "./tasks/fastqscreen.wdl" as fastqscreen

 # Pre-alignment QC workflow: runs FastQC and FastQ Screen on one paired-end
 # sample. The two calls do not depend on each other.
 workflow {{ project_name }} {

     # Paired-end FASTQ inputs
     File read1
     File read2

     # FastQ Screen reference directory and configuration file
     File screen_ref_dir
     File fastq_screen_conf

     # Container images, one per tool
     String FASTQCdocker
     String FASTQSCREENdocker

     # Compute settings shared by both tasks
     String BIGcluster_config
     String disk_size

     # Declared so the inputs template resolves; not referenced by any call below
     String sample_name
     String fasta
     File ref_dir

     call fastqc.fastqc as fastqc {
         input:
             read1 = read1,
             read2 = read2,
             docker = FASTQCdocker,
             cluster_config = BIGcluster_config,
             disk_size = disk_size
     }

     call fastqscreen.fastq_screen as fastqscreen {
         input:
             read1 = read1,
             read2 = read2,
             screen_ref_dir = screen_ref_dir,
             fastq_screen_conf = fastq_screen_conf,
             docker = FASTQSCREENdocker,
             cluster_config = BIGcluster_config,
             disk_size = disk_size
     }
 }