# YaqingLiu / VEP


			
							task VEP {
  # Normalizes a VCF with vcf2vcf.pl, annotates it with VEP and converts the
  # annotated VCF to MAF with vcf2maf.pl.
  #
  # Inputs:
  #   vcf            input VCF; output names are derived from its file name
  #                  with a trailing ".vcf" removed
  #   sample_id      prefix used for the intermediate *.tmp1.vcf / *.tmp2.vcf
  #   ref_dir/fasta  directory and file name of the reference FASTA
  #   vep_path       directory containing the `vep` script
  #   cache          VEP cache directory (passed to --dir_cache)
  #   ncbi_build     assembly name passed to VEP and vcf2maf
  #   species        species name passed to VEP and vcf2maf
  #   vcf2maf_path   directory containing vcf2vcf.pl and vcf2maf.pl
  #   docker, cluster_config, disk_size
  #                  forwarded to the runtime section
  #
  # Outputs: the normalized VCF, the VEP-annotated VCF, the VEP summary HTML
  # and the MAF.
  #
  # NOTE: inside the command section "${...}" is WDL interpolation, so shell
  # variables are deliberately written in the brace-less "$name" form.

  File vcf
  String sample_id
  String basename = basename(vcf,".vcf")
  File ref_dir
  String fasta
  String vep_path
  File cache
  String ncbi_build
  String species
  String vcf2maf_path
  String docker
  String cluster_config
  String disk_size


  command <<<
    set -eo pipefail
    nt=$(nproc)

    source /etc/profile

    # Disabled PASS-only filter, kept for reference:
    #awk -F'\t' '{if(($1~"^#")||($1!~"^#" && $7=="PASS")){print $0}}' ${vcf} > ${sample_id}.vcf

    # Choose the VEP buffer size from the number of variant records.
    # Every non-header, non-blank line is counted, so contigs named without a
    # "chr" prefix (e.g. "1", "X") are counted as well.
    # NOTE(review): large inputs get the *smaller* buffer, presumably to limit
    # memory per fork; confirm this is the intended direction.
    nrow=$(awk '!/^#/ && NF {n++} END {print n+0}' "${vcf}")
    if [ "$nrow" -lt 5000 ]; then
      buffer_size="--buffer_size 5000"
    else
      buffer_size="--buffer_size 50"
    fi


    # Read the sample IDs from columns 10 and 11 of the #CHROM header line.
    sample_col_1=$(awk -F'\t' '$1 ~ /^#CHROM/ {print $10; exit}' "${vcf}")
    sample_col_2=$(awk -F'\t' '$1 ~ /^#CHROM/ {print $11; exit}' "${vcf}")

    # The SAMPLE_* strings are expanded unquoted further down on purpose so
    # that they split into separate command-line arguments.
    if [ -n "$sample_col_2" ]; then
      # Two sample columns: treated as a pair (column 10 normal, column 11 tumor).
      SAMPLE_vcf2maf="--tumor-id $sample_col_2 --normal-id $sample_col_1"
      SAMPLE_vcf2vcf="--vcf-tumor-id $sample_col_2 --vcf-normal-id $sample_col_1"
      cp "${vcf}" "${sample_id}.tmp1.vcf"
    else
      # Tumor-only or normal-only: vcf2vcf is given the same ID for both roles,
      # so column 10 is duplicated into column 11 here and the extra column is
      # removed again after vcf2vcf.
      SAMPLE_vcf2maf="--tumor-id $sample_col_1"
      SAMPLE_vcf2vcf="--vcf-tumor-id $sample_col_1 --vcf-normal-id $sample_col_1"
      awk 'BEGIN {FS = OFS = "\t"} $1 !~ /^##/ && length($11) == 0 {$11 = $10} {print}' \
        "${vcf}" > "${sample_id}.tmp1.vcf"
    fi


    # vcf2vcf: transfer into a standardized format
    echo "Transfer the VCF file into a standardized format..."
    perl "${vcf2maf_path}/vcf2vcf.pl" \
      --input-vcf "${sample_id}.tmp1.vcf" --output-vcf "${sample_id}.tmp2.vcf" \
      $SAMPLE_vcf2vcf \
      --ref-fasta "${ref_dir}/${fasta}"

    if [ -n "$sample_col_2" ]; then
      cp "${sample_id}.tmp2.vcf" "${basename}.norm.vcf"
    else
      # Keep only the first ten columns, dropping the duplicated sample column.
      cut -f 1-10 "${sample_id}.tmp2.vcf" > "${basename}.norm.vcf"
    fi

    # VEP annotation (offline, using the local cache; one fork per CPU
    # reported by nproc).
    echo "VEP annotation..."
    perl "${vep_path}/vep" --format vcf --vcf \
      --input_file "${basename}.norm.vcf" --output_file "${basename}.vep.vcf" \
      --assembly "${ncbi_build}" \
      --species "${species}" \
      --everything --af_exac \
      --offline \
      --cache --dir_cache "${cache}" \
      --fasta "${ref_dir}/${fasta}" \
      $buffer_size \
      --fork "$nt"


    # vcf2maf: the input was annotated by the VEP step above, so vcf2maf is
    # told not to run VEP itself (--inhibit-vep).
    echo "vcf2maf..."
    perl "${vcf2maf_path}/vcf2maf.pl" \
      --inhibit-vep \
      --input-vcf "${basename}.vep.vcf" --output-maf "${basename}.maf" \
      $SAMPLE_vcf2maf \
      --ref-fasta "${ref_dir}/${fasta}" \
      --ncbi-build "${ncbi_build}" \
      --species "${species}"

  >>>
  
  runtime {
    docker: docker
    cluster: cluster_config
    systemDisk: "cloud_ssd 40"
    dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
  }

  output {
    File norm_vcf = "${basename}.norm.vcf"
    File vep_vcf = "${basename}.vep.vcf"
    # Expected to be written by VEP next to its --output_file.
    File vep_vcf_summary = "${basename}.vep.vcf_summary.html"
    File maf = "${basename}.maf"
  }
}