Przeglądaj źródła

use split gVCF as Mendelian analysis input

master
LUYAO REN 5 lat temu
rodzic
commit
05e1855830
8 zmienionych plików z 154 dodań i 99 usunięć
  1. +35
    -0
      codescripts/get_family.sh
  2. +4
    -3
      codescripts/merge_mendelian_vot.py
  3. +9
    -7
      inputs
  4. +46
    -0
      tasks/mendelian.wdl
  5. +0
    -33
      tasks/merge_chromo.wdl
  6. +35
    -0
      tasks/merge_mendelian.wdl
  7. +0
    -31
      tasks/merge_mendelian_vote.wdl
  8. +25
    -25
      workflow.wdl

+ 35
- 0
codescripts/get_family.sh Wyświetl plik

@@ -0,0 +1,35 @@
# Build one four-sample VCF per sequencing batch from the per-chromosome
# genotype tables (chr*gt) in the current directory.
#
# Every command below does the same thing:
#   1. concatenate all chr*gt files;
#   2. keep columns 1-9 plus four batch-specific sample columns;
#   3. drop the '#CHROM' column-header line(s);
#   4. sort by column 1, then numerically by column 2;
#   5. prepend the contents of the file `header` and write <batch>.vcf.
#
# `cut` always emits fields in input-file order, so it is only used where the
# four sample columns are already ascending. Batches that need the samples in
# a different order use awk with explicit column numbers instead.
# NOTE(review): the sample column numbers are hard-coded for one specific
# merged table layout - confirm them against the #CHROM line before reuse.
#
# awk is run with -F'\t' so it splits on TAB exactly like `cut` does; with the
# default whitespace splitting an empty or space-containing field would shift
# every following column.

cat chr*gt | cut -f1-9,10,41,74,107 | grep -v '#CHROM' | sort -k1,1 -k2,2n | cat header - > Quartet_DNA_BGI_SEQ2000_BGI_1_20180518.vcf
cat chr*gt | awk -F'\t' '{print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t"$11"\t"$52"\t"$85"\t"$12}' | grep -v '#CHROM' | sort -k1,1 -k2,2n | cat header - > Quartet_DNA_BGI_SEQ2000_BGI_2_20180518.vcf
cat chr*gt | cut -f1-9,21,30,63,96 | grep -v '#CHROM' | sort -k1,1 -k2,2n | cat header - > Quartet_DNA_BGI_SEQ2000_BGI_3_20180518.vcf

cat chr*gt | cut -f1-9,88,91,94,98 | grep -v '#CHROM' | sort -k1,1 -k2,2n | cat header - > Quartet_DNA_BGI_T7_WGE_1_20191105.vcf
cat chr*gt | cut -f1-9,89,92,95,99 | grep -v '#CHROM' | sort -k1,1 -k2,2n | cat header - > Quartet_DNA_BGI_T7_WGE_2_20191105.vcf
cat chr*gt | cut -f1-9,90,93,97,100 | grep -v '#CHROM' | sort -k1,1 -k2,2n | cat header - > Quartet_DNA_BGI_T7_WGE_3_20191105.vcf

cat chr*gt | cut -f1-9,61,68,75,81 | grep -v '#CHROM' | sort -k1,1 -k2,2n | cat header - > Quartet_DNA_ILM_Nova_ARD_1_20181108.vcf
cat chr*gt | cut -f1-9,62,69,76,82 | grep -v '#CHROM' | sort -k1,1 -k2,2n | cat header - > Quartet_DNA_ILM_Nova_ARD_2_20181108.vcf
cat chr*gt | cut -f1-9,64,70,77,83 | grep -v '#CHROM' | sort -k1,1 -k2,2n | cat header - > Quartet_DNA_ILM_Nova_ARD_3_20181108.vcf

cat chr*gt | cut -f1-9,65,71,78,84 | grep -v '#CHROM' | sort -k1,1 -k2,2n | cat header - > Quartet_DNA_ILM_Nova_ARD_4_20181108.vcf
cat chr*gt | cut -f1-9,66,72,79,86 | grep -v '#CHROM' | sort -k1,1 -k2,2n | cat header - > Quartet_DNA_ILM_Nova_ARD_5_20181108.vcf
cat chr*gt | cut -f1-9,67,73,80,87 | grep -v '#CHROM' | sort -k1,1 -k2,2n | cat header - > Quartet_DNA_ILM_Nova_ARD_6_20181108.vcf

# The original BRG_1 / BRG_2 commands started with a duplicated
# `cat chr*gt | cat chr*gt |`; the first cat's output was never read.
cat chr*gt | awk -F'\t' '{print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t"$114"\t"$117"\t"$15"\t"$18}' | grep -v '#CHROM' | sort -k1,1 -k2,2n | cat header - > Quartet_DNA_ILM_Nova_BRG_1_20180930.vcf
cat chr*gt | awk -F'\t' '{print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t"$115"\t"$13"\t"$16"\t"$19}' | grep -v '#CHROM' | sort -k1,1 -k2,2n | cat header - > Quartet_DNA_ILM_Nova_BRG_2_20180930.vcf
cat chr*gt | awk -F'\t' '{print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t"$116"\t"$14"\t"$17"\t"$20}' | grep -v '#CHROM' | sort -k1,1 -k2,2n | cat header - > Quartet_DNA_ILM_Nova_BRG_3_20180930.vcf

cat chr*gt | cut -f1-9,101,102,103,104 | grep -v '#CHROM' | sort -k1,1 -k2,2n | cat header - > Quartet_DNA_ILM_Nova_WUX_1_20190917.vcf
cat chr*gt | awk -F'\t' '{print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t"$109"\t"$106"\t"$108"\t"$105}' | grep -v '#CHROM' | sort -k1,1 -k2,2n | cat header - > Quartet_DNA_ILM_Nova_WUX_2_20190917.vcf
cat chr*gt | awk -F'\t' '{print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t"$113"\t"$112"\t"$111"\t"$110}' | grep -v '#CHROM' | sort -k1,1 -k2,2n | cat header - > Quartet_DNA_ILM_Nova_WUX_3_20190917.vcf

cat chr*gt | cut -f1-9,22,25,28,32 | grep -v '#CHROM' | sort -k1,1 -k2,2n | cat header - > Quartet_DNA_ILM_XTen_ARD_1_20170403.vcf
cat chr*gt | cut -f1-9,23,26,29,33 | grep -v '#CHROM' | sort -k1,1 -k2,2n | cat header - > Quartet_DNA_ILM_XTen_ARD_2_20170403.vcf
cat chr*gt | cut -f1-9,24,27,31,34 | grep -v '#CHROM' | sort -k1,1 -k2,2n | cat header - > Quartet_DNA_ILM_XTen_ARD_3_20170403.vcf

cat chr*gt | cut -f1-9,35,38,42,45 | grep -v '#CHROM' | sort -k1,1 -k2,2n | cat header - > Quartet_DNA_ILM_XTen_NVG_1_20170329.vcf
cat chr*gt | cut -f1-9,36,39,43,46 | grep -v '#CHROM' | sort -k1,1 -k2,2n | cat header - > Quartet_DNA_ILM_XTen_NVG_2_20170329.vcf
cat chr*gt | cut -f1-9,37,40,44,47 | grep -v '#CHROM' | sort -k1,1 -k2,2n | cat header - > Quartet_DNA_ILM_XTen_NVG_3_20170329.vcf

cat chr*gt | cut -f1-9,48,51,55,58 | grep -v '#CHROM' | sort -k1,1 -k2,2n | cat header - > Quartet_DNA_ILM_XTen_WUX_1_20170216.vcf
cat chr*gt | cut -f1-9,49,53,56,59 | grep -v '#CHROM' | sort -k1,1 -k2,2n | cat header - > Quartet_DNA_ILM_XTen_WUX_2_20170216.vcf
cat chr*gt | cut -f1-9,50,54,57,60 | grep -v '#CHROM' | sort -k1,1 -k2,2n | cat header - > Quartet_DNA_ILM_XTen_WUX_3_20170216.vcf

+ 4
- 3
codescripts/merge_mendelian_vot.py Wyświetl plik

@@ -1,11 +1,12 @@
import pandas as pd
import sys, argparse, os

men = pd.read_table(sys.argv[1],header=None)
men = pd.read_table(sys.argv[1],header=None,low_memory=False)
vote = pd.read_table(sys.argv[2],low_memory=False)
mut = pd.read_table(sys.argv[3],header=None)

men[1]=men[1].astype(str)
merged_df = pd.merge(vote, men, how='inner', left_on=['CHROM','POS'], right_on = [0,1])
lcl5_dat = merged_df[(merged_df[5]!='./.') & (merged_df[5]!='0/0')]

merged_df['mendelian_check'] = 'MIE'
merged_df.loc[merged_df[2]=='1:1:1','mendelian_check'] = 'MP'
sub = merged_df[['CHROM','POS','LCL5_detected_num','mendelian_check',2]]

+ 9
- 7
inputs Wyświetl plik

@@ -1,8 +1,10 @@
{
"{{ project_name }}.disk_size": "100",
"{{ project_name }}.inputSamplesFile": "{{ inputSamplesFile }}",
"{{ project_name }}.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/high_confidence_call_manuscript:v1.4",
"{{ project_name }}.sample_name": "{{ sample_name }}",
"{{ project_name }}.cluster_config": "OnDemand bcs.a2.xlarge img-ubuntu-vpc",
"{{ project_name }}.mut_file": "oss://pgx-result/renluyao/manuscript/mutation_type"
}
"{{ project_name }}.fasta": "GRCh38.d1.vd1.fa",
"{{ project_name }}.family_vcf": "{{ family_vcf }}",
"{{ project_name }}.disk_size": "500",
"{{ project_name }}.SMALLcluster_config": "OnDemand bcs.ps.g.xlarge img-ubuntu-vpc",
"{{ project_name }}.BIGcluster_config": "OnDemand bcs.a2.7xlarge img-ubuntu-vpc",
"{{ project_name }}.MENDELIANdocker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/vbt:v1.1",
"{{ project_name }}.DIYdocker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/high_confidence_call_manuscript:v1.4",
"{{ project_name }}.ref_dir": "oss://pgx-reference-data/GRCh38.d1.vd1/"
}

+ 46
- 0
tasks/mendelian.wdl Wyświetl plik

@@ -0,0 +1,46 @@
# Runs the VBT `mendelian` command twice on one family VCF: once with LCL5 as
# the child (D5) and once with LCL6 as the child (D6). LCL7 and LCL8 are the
# parents in both pedigrees.
#
# Inputs:
#   family_vcf      multi-sample VCF; passed to VBT as mother, father and child
#   ref_dir         directory holding the reference; the FASTA is ref_dir/fasta
#   fasta           file name of the reference FASTA inside ref_dir
#   docker          image to run in
#   cluster_config  value for the `cluster` runtime attribute
#   disk_size       size used in the `dataDisk` runtime attribute
#
# Outputs:
#   D5_ped / D6_ped              generated pedigree files
#   D5_mendelian / D6_mendelian  every file VBT wrote to VBT_D5 / VBT_D6
#   D5_trio_vcf / D6_trio_vcf    copy of the <family_name>.D5|D6_trio.vcf VBT output
task mendelian {
    File family_vcf
    File ref_dir
    String family_name = basename(family_vcf,".vcf")
    String fasta
    String docker
    String cluster_config
    String disk_size

    command <<<
        # Stop at the first failing command. Without this a failed D5 run is
        # masked by a successful D6 run and the task "succeeds" with an empty
        # D5 VCF.
        set -eo pipefail

        export LD_LIBRARY_PATH=/opt/htslib-1.9
        nt=$(nproc)

        # Pedigree rows are tab-separated; the column order looks like the usual
        # PED layout (family, individual, father, mother, sex, phenotype).
        # NOTE(review): sex code 2 for LCL8 and 1 for LCL7 - confirm against the
        # sample sheet.
        echo -e "${family_name}\tLCL8\t0\t0\t2\t-9\n${family_name}\tLCL7\t0\t0\t1\t-9\n${family_name}\tLCL5\tLCL7\tLCL8\t2\t-9" > ${family_name}.D5.ped

        mkdir VBT_D5
        # The same VCF is given for all three roles; presumably VBT selects the
        # samples by the names in the pedigree file - verify in the VBT docs.
        /opt/VBT-TrioAnalysis/vbt mendelian -ref ${ref_dir}/${fasta} -mother ${family_vcf} -father ${family_vcf} -child ${family_vcf} -pedigree ${family_name}.D5.ped -outDir VBT_D5 -out-prefix ${family_name}.D5 --output-violation-regions -thread-count $nt

        cp VBT_D5/${family_name}.D5_trio.vcf ${family_name}.D5.vcf

        echo -e "${family_name}\tLCL8\t0\t0\t2\t-9\n${family_name}\tLCL7\t0\t0\t1\t-9\n${family_name}\tLCL6\tLCL7\tLCL8\t2\t-9" > ${family_name}.D6.ped

        mkdir VBT_D6
        /opt/VBT-TrioAnalysis/vbt mendelian -ref ${ref_dir}/${fasta} -mother ${family_vcf} -father ${family_vcf} -child ${family_vcf} -pedigree ${family_name}.D6.ped -outDir VBT_D6 -out-prefix ${family_name}.D6 --output-violation-regions -thread-count $nt

        cp VBT_D6/${family_name}.D6_trio.vcf ${family_name}.D6.vcf
    >>>

    runtime {
        docker:docker
        cluster: cluster_config
        systemDisk: "cloud_ssd 40"
        dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
    }
    output {
        File D5_ped = "${family_name}.D5.ped"
        File D6_ped = "${family_name}.D6.ped"
        Array[File] D5_mendelian = glob("VBT_D5/*")
        Array[File] D6_mendelian = glob("VBT_D6/*")
        File D5_trio_vcf = "${family_name}.D5.vcf"
        File D6_trio_vcf = "${family_name}.D6.vcf"
    }
}




+ 0
- 33
tasks/merge_chromo.wdl Wyświetl plik

@@ -1,33 +0,0 @@
# Concatenates the per-shard SNV and INDEL tables into one file each, then
# counts how often every distinct (column 3, column 4) pair occurs in each.
#
# Outputs:
#   mendelian_vote_summary_snv / _indel                `uniq -c` counts of columns 3-4
#   mendelian_vote_summary_snv_detail / _indel_detail  the concatenated tables
task merge_chromo {
    Array[File] mendelian_vote_snv
    Array[File] mendelian_vote_indel
    String sample_name
    String docker
    String cluster_config
    String disk_size

    command <<<
        # Step 1: concatenate the inputs in the order they were supplied.
        cat ${sep=" " mendelian_vote_snv} > ${sample_name}.snv.txt
        cat ${sep=" " mendelian_vote_indel} > ${sample_name}.indel.txt

        # Step 2: the same column-3/4 tally for both variant classes.
        for kind in snv indel; do
            cut -f3,4 < ${sample_name}.$kind.txt | sort | uniq -c > ${sample_name}.$kind.summary.txt
        done
    >>>

    runtime {
        docker:docker
        cluster: cluster_config
        systemDisk: "cloud_ssd 40"
        dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
    }
    output {
        File mendelian_vote_summary_snv = "${sample_name}.snv.summary.txt"
        File mendelian_vote_summary_indel = "${sample_name}.indel.summary.txt"
        File mendelian_vote_summary_snv_detail = "${sample_name}.snv.txt"
        File mendelian_vote_summary_indel_detail = "${sample_name}.indel.txt"
    }
}

+ 35
- 0
tasks/merge_mendelian.wdl Wyświetl plik

@@ -0,0 +1,35 @@
# Strips the '##' meta-header from the two VBT trio VCFs and from the family
# VCF, reduces the family VCF's per-sample columns to their first ':' subfield,
# and hands the three tables to /opt/merge_two_family_with_genotype.py.
#
# Inputs:
#   D5_trio_vcf, D6_trio_vcf  trio VCFs for the LCL5 and LCL6 pedigrees
#   family_vcf                family VCF; its name minus ".family.vcf" is the
#                             prefix of every file written here
#   docker, cluster_config, disk_size  runtime settings
#
# Outputs (presumably written by the Python script - verify against it):
#   project_mendelian          <family_name>.txt
#   project_mendelian_summary  <family_name>.summary.txt
task merge_mendelian {
    File D5_trio_vcf
    File D6_trio_vcf
    File family_vcf
    String family_name = basename(family_vcf,".family.vcf")
    String docker
    String cluster_config
    String disk_size

    command <<<
        # Anchor the pattern: only lines that START with '##' are VCF meta
        # lines. An unanchored '##' would also drop any record that merely
        # contains that text.
        grep -v '^##' ${D5_trio_vcf} > ${family_name}.D5.txt
        grep -v '^##' ${D6_trio_vcf} > ${family_name}.D6.txt

        # Line 1 (the remaining column-header line) is printed unchanged. On
        # later lines with more than two fields, columns 9..NF are cut down to
        # the text before the first ':' and the line is re-joined with tabs.
        grep -v '^##' ${family_vcf} | awk '
            BEGIN { OFS = "\t" }
            NF > 2 && FNR > 1 {
                for ( i=9; i<=NF; i++ ) {
                    split($i,a,":") ;$i = a[1];
                }
            }
            { print }
        ' > ${family_name}.consensus.txt

        python /opt/merge_two_family_with_genotype.py -LCL5 ${family_name}.D5.txt -LCL6 ${family_name}.D6.txt -genotype ${family_name}.consensus.txt -family ${family_name}
    >>>

    runtime {
        docker:docker
        cluster: cluster_config
        systemDisk: "cloud_ssd 40"
        dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
    }
    output {
        File project_mendelian = "${family_name}.txt"
        File project_mendelian_summary = "${family_name}.summary.txt"
    }
}

+ 0
- 31
tasks/merge_mendelian_vote.wdl Wyświetl plik

@@ -1,31 +0,0 @@
# Runs /opt/merge_mendelian_vote.py on one (mendelian, vote, mutation) file
# triple, then splits the merged table by the lengths of fields 6 and 7:
# rows where both are one character long go to <prefix>.snv, rows where either
# is longer go to <prefix>.indel.
#
# Outputs:
#   mendelian_vote        <output_prefix>.mendelian.vote.txt
#   mendelian_vote_snv    <output_prefix>.snv
#   mendelian_vote_indel  <output_prefix>.indel
task merge_mendelian_vote {
    File vote_file
    File mendelian_file
    File mut_file
    String output_prefix
    String docker
    String cluster_config
    String disk_size

    command <<<
        merged=${output_prefix}.mendelian.vote.txt

        python /opt/merge_mendelian_vote.py ${mendelian_file} ${vote_file} ${mut_file} $merged

        # A bare awk pattern prints every matching line.
        cat $merged | awk 'length($6) == 1 && length($7) == 1' > ${output_prefix}.snv
        cat $merged | awk 'length($6) > 1 || length($7) > 1' > ${output_prefix}.indel
    >>>

    runtime {
        docker:docker
        cluster: cluster_config
        systemDisk: "cloud_ssd 40"
        dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
    }
    output {
        File mendelian_vote = "${output_prefix}.mendelian.vote.txt"
        File mendelian_vote_snv = "${output_prefix}.snv"
        File mendelian_vote_indel = "${output_prefix}.indel"
    }
}

+ 25
- 25
workflow.wdl Wyświetl plik

@@ -1,34 +1,34 @@
import "./tasks/merge_mendelian_vote.wdl" as merge_mendelian_vote
import "./tasks/merge_chromo.wdl" as merge_chromo
import "./tasks/mendelian.wdl" as mendelian
import "./tasks/merge_mendelian.wdl" as merge_mendelian

workflow {{ project_name }} {
File inputSamplesFile
Array[Array[File]] inputSamples = read_tsv(inputSamplesFile)
File mut_file
String docker
String sample_name
String cluster_config
File family_vcf
File ref_dir
String fasta
String MENDELIANdocker
String DIYdocker
String BIGcluster_config
String SMALLcluster_config
String disk_size

scatter (sample in inputSamples){
call merge_mendelian_vote.merge_mendelian_vote as merge_mendelian_vote {
input:
vote_file=sample[0],
mendelian_file=sample[1],
output_prefix=sample[2],
mut_file=mut_file,
docker=docker,
cluster_config=cluster_config,
disk_size=disk_size
}
call mendelian.mendelian as mendelian {
input:
family_vcf=family_vcf,
ref_dir=ref_dir,
fasta=fasta,
docker=MENDELIANdocker,
cluster_config=BIGcluster_config,
disk_size=disk_size
}
call merge_chromo.merge_chromo as merge_chromo {

call merge_mendelian.merge_mendelian as merge_mendelian {
input:
mendelian_vote_snv=merge_mendelian_vote.mendelian_vote_snv,
mendelian_vote_indel=merge_mendelian_vote.mendelian_vote_indel,
sample_name=sample_name,
docker=docker,
cluster_config=cluster_config,
D5_trio_vcf=mendelian.D5_trio_vcf,
D6_trio_vcf=mendelian.D6_trio_vcf,
family_vcf=family_vcf,
docker=DIYdocker,
cluster_config=SMALLcluster_config,
disk_size=disk_size
}

}

Ładowanie…
Anuluj
Zapisz