Sfoglia il codice sorgente

first commit

master
LUYAO REN 4 anni fa
commit
c4355a44d6
54 ha cambiato i file con 3501 aggiunte e 0 eliminazioni
  1. BIN
      codescripts/.DS_Store
  2. +60
    -0
      codescripts/D5_D6.py
  3. +37
    -0
      codescripts/Indel_bed.py
  4. +72
    -0
      codescripts/bed_region.py
  5. +67
    -0
      codescripts/cluster.sh
  6. +23
    -0
      codescripts/contig.txt
  7. +65
    -0
      codescripts/extract_multiqc.py
  8. +92
    -0
      codescripts/extract_vcf_information.py
  9. +47
    -0
      codescripts/filter_indel_over_50_cluster.py
  10. +43
    -0
      codescripts/filter_indel_over_50_mendelian.py
  11. +36
    -0
      codescripts/get_filtered_benchmark_vcfinfo.py
  12. +416
    -0
      codescripts/high_confidence_call_vote.py
  13. +42
    -0
      codescripts/high_voted_mendelian_bed.py
  14. +50
    -0
      codescripts/how_many_samples.py
  15. +42
    -0
      codescripts/lcl5_all_called_variants.py
  16. +6
    -0
      codescripts/linux_command.sh
  17. +129
    -0
      codescripts/merge_mendelian_vcfinfo.py
  18. +71
    -0
      codescripts/merge_two_family.py
  19. +115
    -0
      codescripts/merge_two_family_with_genotype.py
  20. +109
    -0
      codescripts/oneClass.py
  21. +144
    -0
      codescripts/reformVCF.py
  22. +227
    -0
      codescripts/replicates_consensus.py
  23. +36
    -0
      codescripts/vcf2bed.py
  24. +295
    -0
      codescripts/voted_by_vcfinfo_mendelianinfo.py
  25. +19
    -0
      inputs
  26. +4
    -0
      rename.txt
  27. +15
    -0
      samples
  28. BIN
      tasks/.DS_Store
  29. +48
    -0
      tasks/BQSR.wdl
  30. +49
    -0
      tasks/D5_D6.wdl
  31. +40
    -0
      tasks/Dedup.wdl
  32. +34
    -0
      tasks/GVCFtyper.wdl
  33. +34
    -0
      tasks/Haplotyper_gVCF.wdl
  34. +53
    -0
      tasks/Metrics.wdl
  35. +41
    -0
      tasks/Realigner.wdl
  36. +105
    -0
      tasks/benchmark.wdl
  37. +44
    -0
      tasks/corealigner.wdl
  38. +37
    -0
      tasks/deduped_Metrics.wdl
  39. +29
    -0
      tasks/extract_multiqc.wdl
  40. +28
    -0
      tasks/fastqc.wdl
  41. +36
    -0
      tasks/fastqscreen.wdl
  42. +34
    -0
      tasks/mapping.wdl
  43. +79
    -0
      tasks/mendelian.wdl
  44. +26
    -0
      tasks/mergeNum.wdl
  45. +40
    -0
      tasks/mergeSentieon.wdl
  46. +58
    -0
      tasks/merge_family.wdl
  47. +72
    -0
      tasks/merge_mendelian.wdl
  48. +56
    -0
      tasks/multiqc.wdl
  49. +27
    -0
      tasks/qualimap.wdl
  50. +26
    -0
      tasks/quartet_mendelian.wdl
  51. +41
    -0
      tasks/sentieon.wdl
  52. +50
    -0
      tasks/split_gvcf_files.wdl
  53. +25
    -0
      tasks/vcfstat.wdl
  54. +127
    -0
      workflow.wdl

BIN
codescripts/.DS_Store Vedi File


+ 60
- 0
codescripts/D5_D6.py Vedi File

@@ -0,0 +1,60 @@
from __future__ import division
import pandas as pd
import sys, argparse, os


# input arguments
parser = argparse.ArgumentParser(description="this script is to calculate reproducibility between Quartet_D5 and Quartet_D6s")

parser.add_argument('-sister', '--sister', type=str, help='sister.txt', required=True)
parser.add_argument('-project', '--project', type=str, help='project name', required=True)


args = parser.parse_args()
sister_file = args.sister
project_name = args.project

# output file
output_name = project_name + '.sister.reproducibility.txt'

output_file = open(output_name,'w')

# input files
sister_dat = pd.read_table(sister_file)

sister_same = 0
sister_diff = 0

for row in sister_dat.itertuples():
# sister
if row[5] == row[6]:
if row[5] == './.':
mendelian = 'noInfo'
sister_count = "no"
elif row[5] == '0/0':
mendelian = 'Ref'
sister_count = "no"
else:
mendelian = '1'
sister_count = "yes_same"
else:
mendelian = '0'
if (row[5] == './.' or row[5] == '0/0') and (row[6] == './.' or row[6] == '0/0'):
sister_count = "no"
else:
sister_count = "yes_diff"
if sister_count == 'yes_same':
sister_same += 1
elif sister_count == 'yes_diff':
sister_diff += 1
else:
pass

sister = sister_same/(sister_same + sister_diff)
outcolumn = 'Project\tReproducibility_D5_D6\n'
outResult = project_name + '\t' + str(sister) + '\n'
output_file.write(outcolumn)
output_file.write(outResult)




+ 37
- 0
codescripts/Indel_bed.py Vedi File

@@ -0,0 +1,37 @@
import pandas as pd
import sys, argparse, os
mut = pd.read_table('/mnt/pgx_src_data_pool_4/home/renluyao/manuscript/MIE/vcf/mutation_type',header=None)
outIndel = open(sys.argv[1],'w')
for row in mut.itertuples():
if ',' in row._4:
alt_seq = row._4.split(',')
alt_len = [len(i) for i in alt_seq]
alt = max(alt_len)
else:
alt = len(row._4)
ref = row._3
pos = row._2
if len(ref) == 1 and alt == 1:
pass
elif len(ref) > alt:
StartPos = int(pos) - 1
EndPos = int(pos) + (len(ref) - 1)
outline_indel = row._1 + '\t' + str(StartPos) + '\t' + str(EndPos) + '\n'
outIndel.write(outline_indel)
elif alt > len(ref):
StartPos = int(pos) - 1
EndPos = int(pos) + (alt - 1)
outline_indel = row._1 + '\t' + str(StartPos) + '\t' + str(EndPos) + '\n'
outIndel.write(outline_indel)
elif len(ref) == alt:
StartPos = int(pos) - 1
EndPos = int(pos) + (alt - 1)
outline_indel = row._1 + '\t' + str(StartPos) + '\t' + str(EndPos) + '\n'
outIndel.write(outline_indel)








+ 72
- 0
codescripts/bed_region.py Vedi File

@@ -0,0 +1,72 @@
import pandas as pd
import sys, argparse, os
mut = mut = pd.read_table('/mnt/pgx_src_data_pool_4/home/renluyao/manuscript/benchmark_calls/vcf/mutation_type',header=None)
vote = pd.read_table('/mnt/pgx_src_data_pool_4/home/renluyao/manuscript/benchmark_calls/all_info/benchmark.vote.mendelian.txt',header=None)
merged_df = pd.merge(vote, mut, how='inner', left_on=[0,1], right_on = [0,1])
outFile = open(sys.argv[1],'w')
outIndel = open(sys.argv[2],'w')
for row in merged_df.itertuples():
#d5
if ',' in row._7:
d5 = row._7.split(',')
d5_len = [len(i) for i in d5]
d5_alt = max(d5_len)
else:
d5_alt = len(row._7)
#d6
if ',' in row._15:
d6 = row._15.split(',')
d6_len = [len(i) for i in d6]
d6_alt = max(d6_len)
else:
d6_alt = len(row._15)
#f7
if ',' in row._23:
f7 = row._23.split(',')
f7_len = [len(i) for i in f7]
f7_alt = max(f7_len)
else:
f7_alt = len(row._23)
#m8
if ',' in row._31:
m8 = row._31.split(',')
m8_len = [len(i) for i in m8]
m8_alt = max(m8_len)
else:
m8_alt = len(row._31)
all_length = [d5_alt,d6_alt,f7_alt,m8_alt]
alt = max(all_length)
ref = row._35
pos = int(row._2)
if len(ref) == 1 and alt == 1:
StartPos = int(pos) -1
EndPos = int(pos)
cate = 'SNV'
elif len(ref) > alt:
StartPos = int(pos) - 1
EndPos = int(pos) + (len(ref) - 1)
cate = 'INDEL'
outline_indel = row._1 + '\t' + str(StartPos) + '\t' + str(EndPos) + '\n'
outIndel.write(outline_indel)
elif alt > len(ref):
StartPos = int(pos) - 1
EndPos = int(pos) + (alt - 1)
cate = 'INDEL'
outline_indel = row._1 + '\t' + str(StartPos) + '\t' + str(EndPos) + '\n'
outIndel.write(outline_indel)
elif len(ref) == alt:
StartPos = int(pos) - 1
EndPos = int(pos) + (alt - 1)
cate = 'INDEL'
outline_indel = row._1 + '\t' + str(StartPos) + '\t' + str(EndPos) + '\n'
outIndel.write(outline_indel)
outline = row._1 + '\t' + str(StartPos) + '\t' + str(EndPos) + '\t' + str(row._2) + '\t' + cate + '\n'
outFile.write(outline)









+ 67
- 0
codescripts/cluster.sh Vedi File

@@ -0,0 +1,67 @@
cat benchmark.men.vote.diffbed.lengthlessthan50.txt | awk '{print $1"\t"$2"\t"".""\t"$35"\t"$7"\t.\t.\t.\tGT\t"$6}' | grep -v '0/0' > LCL5.body
cat benchmark.men.vote.diffbed.lengthlessthan50.txt | awk '{print $1"\t"$2"\t"".""\t"$35"\t"$15"\t.\t.\t.\tGT\t"$14}' | grep -v '0/0' > LCL6.body
cat benchmark.men.vote.diffbed.lengthlessthan50.txt | awk '{print $1"\t"$2"\t"".""\t"$35"\t"$23"\t.\t.\t.\tGT\t"$22}' | grep -v '0/0'> LCL7.body
cat benchmark.men.vote.diffbed.lengthlessthan50.txt | awk '{print $1"\t"$2"\t"".""\t"$35"\t"$31"\t.\t.\t.\tGT\t"$30}'| grep -v '0/0' > LCL8.body
cat header5 LCL5.body > LCL5.beforediffbed.vcf
cat header6 LCL6.body > LCL6.beforediffbed.vcf
cat header7 LCL7.body > LCL7.beforediffbed.vcf
cat header8 LCL8.body > LCL8.beforediffbed.vcf
rtg bgzip *beforediffbed.vcf
rtg index *beforediffbed.vcf.gz

rtg vcffilter -i LCL5.beforediffbed.vcf.gz --exclude-bed=/mnt/pgx_src_data_pool_4/home/renluyao/manuscript/benchmark_calls/MIE/diff.merged.bed -o LCL5.afterfilterdiffbed.vcf.gz
rtg vcffilter -i LCL6.beforediffbed.vcf.gz --exclude-bed=/mnt/pgx_src_data_pool_4/home/renluyao/manuscript/benchmark_calls/MIE/diff.merged.bed -o LCL6.afterfilterdiffbed.vcf.gz
rtg vcffilter -i LCL7.beforediffbed.vcf.gz --exclude-bed=/mnt/pgx_src_data_pool_4/home/renluyao/manuscript/benchmark_calls/MIE/diff.merged.bed -o LCL7.afterfilterdiffbed.vcf.gz
rtg vcffilter -i LCL8.beforediffbed.vcf.gz --exclude-bed=/mnt/pgx_src_data_pool_4/home/renluyao/manuscript/benchmark_calls/MIE/diff.merged.bed -o LCL8.afterfilterdiffbed.vcf.gz

/mnt/pgx_src_data_pool_4/home/renluyao/softwares/annovar/table_annovar.pl LCL5.beforediffbed.vcf.gz /mnt/pgx_src_data_pool_4/home/renluyao/softwares/annovar/humandb \
-buildver hg38 \
-out LCL5 \
-remove \
-protocol 1000g2015aug_all,1000g2015aug_afr,1000g2015aug_amr,1000g2015aug_eas,1000g2015aug_eur,1000g2015aug_sas,clinvar_20190305,gnomad211_genome \
-operation f,f,f,f,f,f,f,f \
-nastring . \
-vcfinput \
--thread 8

rtg vcfeval -b /mnt/pgx_src_data_pool_4/home/renluyao/Quartet/GIAB/NA12878_HG001/HG001_GRCh38_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_highconf_PGandRTGphasetransfer.vcf.gz -c LCL5.afterfilterdiffbed.vcf.gz -o LCL5_NIST -t /mnt/pgx_src_data_pool_4/home/renluyao/annotation/hg38/GRCh38.d1.vd1.sdf/
rtg vcfeval -b /mnt/pgx_src_data_pool_4/home/renluyao/Quartet/GIAB/NA12878_HG001/HG001_GRCh38_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_highconf_PGandRTGphasetransfer.vcf.gz -c LCL6.afterfilterdiffbed.vcf.gz -o LCL6_NIST -t /mnt/pgx_src_data_pool_4/home/renluyao/annotation/hg38/GRCh38.d1.vd1.sdf/
rtg vcfeval -b /mnt/pgx_src_data_pool_4/home/renluyao/Quartet/GIAB/NA12878_HG001/HG001_GRCh38_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_highconf_PGandRTGphasetransfer.vcf.gz -c LCL7.afterfilterdiffbed.vcf.gz -o LCL7_NIST -t /mnt/pgx_src_data_pool_4/home/renluyao/annotation/hg38/GRCh38.d1.vd1.sdf/
rtg vcfeval -b /mnt/pgx_src_data_pool_4/home/renluyao/Quartet/GIAB/NA12878_HG001/HG001_GRCh38_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_highconf_PGandRTGphasetransfer.vcf.gz -c LCL8.afterfilterdiffbed.vcf.gz -o LCL8_NIST -t /mnt/pgx_src_data_pool_4/home/renluyao/annotation/hg38/GRCh38.d1.vd1.sdf/


zcat LCL5.afterfilterdiffbed.vcf.gz | grep -v '#' | awk '{ if ((length($4) == 1) && (length($5) == 1)) { print } }' | wc -l
zcat LCL6.afterfilterdiffbed.vcf.gz | grep -v '#' | awk '{ if ((length($4) == 1) && (length($5) == 1)) { print } }' | wc -l
zcat LCL7.afterfilterdiffbed.vcf.gz | grep -v '#' | awk '{ if ((length($4) == 1) && (length($5) == 1)) { print } }' | wc -l
zcat LCL8.afterfilterdiffbed.vcf.gz | grep -v '#' | awk '{ if ((length($4) == 1) && (length($5) == 1)) { print } }' | wc -l


zcat LCL5.afterfilterdiffbed.vcf.gz | grep -v '#' | awk '{ if ((length($4) == 1) && (length($5) < 11) && (length($5) > 1)) { print } }' | wc -l
zcat LCL6.afterfilterdiffbed.vcf.gz | grep -v '#' | awk '{ if ((length($4) == 1) && (length($5) < 11) && (length($5) > 1)) { print } }' | wc -l
zcat LCL7.afterfilterdiffbed.vcf.gz | grep -v '#' | awk '{ if ((length($4) == 1) && (length($5) < 11) && (length($5) > 1)) { print } }' | wc -l
zcat LCL8.afterfilterdiffbed.vcf.gz | grep -v '#' | awk '{ if ((length($4) == 1) && (length($5) < 11) && (length($5) > 1)) { print } }' | wc -l


zcat LCL5.afterfilterdiffbed.vcf.gz | grep -v '#' | awk '{ if ((length($4) == 1) && (length($5) > 10)) { print } }' | wc -l
zcat LCL6.afterfilterdiffbed.vcf.gz | grep -v '#' | awk '{ if ((length($4) == 1) && (length($5) > 10)) { print } }' | wc -l
zcat LCL7.afterfilterdiffbed.vcf.gz | grep -v '#' | awk '{ if ((length($4) == 1) && (length($5) > 10)) { print } }' | wc -l
zcat LCL8.afterfilterdiffbed.vcf.gz | grep -v '#' | awk '{ if ((length($4) == 1) && (length($5) > 10)) { print } }' | wc -l

bedtools subtract -a LCL5.27.homo_ref.consensus.bed -b /mnt/pgx_src_data_pool_4/home/renluyao/manuscript/benchmark_calls/MIE/diff.merged.bed > LCL5.27.homo_ref.consensus.filtereddiffbed.bed
bedtools subtract -a LCL6.27.homo_ref.consensus.bed -b /mnt/pgx_src_data_pool_4/home/renluyao/manuscript/benchmark_calls/MIE/diff.merged.bed > LCL6.27.homo_ref.consensus.filtereddiffbed.bed
bedtools subtract -a LCL7.27.homo_ref.consensus.bed -b /mnt/pgx_src_data_pool_4/home/renluyao/manuscript/benchmark_calls/MIE/diff.merged.bed > LCL7.27.homo_ref.consensus.filtereddiffbed.bed
bedtools subtract -a LCL8.27.homo_ref.consensus.bed -b /mnt/pgx_src_data_pool_4/home/renluyao/manuscript/benchmark_calls/MIE/diff.merged.bed > LCL8.27.homo_ref.consensus.filtereddiffbed.bed


python vcf2bed.py LCL5.body LCL5.variants.bed
python vcf2bed.py LCL6.body LCL6.variants.bed
python vcf2bed.py LCL7.body LCL7.variants.bed
python vcf2bed.py LCL8.body LCL8.variants.bed



cat /mnt/pgx_src_data_pool_4/home/renluyao/manuscript/benchmark_calls/all_info/LCL5.variants.bed | cut -f1,11,12 | cat - LCL5.27.homo_ref.consensus.filtereddiffbed.bed | sort -k1,1 -k2,2n > LCL5.high.confidence.bed





+ 23
- 0
codescripts/contig.txt Vedi File

@@ -0,0 +1,23 @@
##contig=<ID=chr1,length=248956422>
##contig=<ID=chr2,length=242193529>
##contig=<ID=chr3,length=198295559>
##contig=<ID=chr4,length=190214555>
##contig=<ID=chr5,length=181538259>
##contig=<ID=chr6,length=170805979>
##contig=<ID=chr7,length=159345973>
##contig=<ID=chr8,length=145138636>
##contig=<ID=chr9,length=138394717>
##contig=<ID=chr10,length=133797422>
##contig=<ID=chr11,length=135086622>
##contig=<ID=chr12,length=133275309>
##contig=<ID=chr13,length=114364328>
##contig=<ID=chr14,length=107043718>
##contig=<ID=chr15,length=101991189>
##contig=<ID=chr16,length=90338345>
##contig=<ID=chr17,length=83257441>
##contig=<ID=chr18,length=80373285>
##contig=<ID=chr19,length=58617616>
##contig=<ID=chr20,length=64444167>
##contig=<ID=chr21,length=46709983>
##contig=<ID=chr22,length=50818468>
##contig=<ID=chrX,length=156040895>

+ 65
- 0
codescripts/extract_multiqc.py Vedi File

@@ -0,0 +1,65 @@
import json
import pandas as pd
import sys, argparse, os

parser = argparse.ArgumentParser(description="This script is to get information from multiqc")

parser.add_argument('-fastqc_qualimap', '--fastqc_qualimap', type=str, help='multiqc_general_stats.txt', required=True)
parser.add_argument('-fastqc', '--fastqc', type=str, help='multiqc_fastqc.txt', required=True)
parser.add_argument('-fastqscreen', '--fastqscreen', type=str, help='multiqc_fastq_screen.txt', required=True)
parser.add_argument('-hap', '--happy', type=str, help='multiqc_happy_data.json', required=True)

args = parser.parse_args()

# Rename input:
fastqc_qualimap_file = args.fastqc_qualimap
fastqc_file = args.fastqc
fastqscreen_file = args.fastqscreen
hap_file = args.happy


# fastqc and qualimap
dat = pd.read_table(fastqc_qualimap_file)

fastqc = dat.loc[:, dat.columns.str.startswith('FastQC')]
fastqc.insert(loc=0, column='Sample', value=dat['Sample'])
fastqc_stat = fastqc.dropna()

# qulimap
qualimap = dat.loc[:, dat.columns.str.startswith('QualiMap')]
qualimap.insert(loc=0, column='Sample', value=dat['Sample'])
qualimap_stat = qualimap.dropna()

# fastqc
dat = pd.read_table(fastqc_file)

fastqc_module = dat.loc[:, "per_base_sequence_quality":"kmer_content"]
fastqc_module.insert(loc=0, column='Sample', value=dat['Sample'])
fastqc_all = pd.merge(fastqc_stat,fastqc_module, how='outer', left_on=['Sample'], right_on = ['Sample'])

# fastqscreen
dat = pd.read_table(fastqscreen_file)
fastqscreen = dat.loc[:, dat.columns.str.endswith('percentage')]
dat['Sample'] = [i.replace('_screen','') for i in dat['Sample']]
fastqscreen.insert(loc=0, column='Sample', value=dat['Sample'])

# benchmark
with open(hap_file) as hap_json:
happy = json.load(hap_json)
dat =pd.DataFrame.from_records(happy)
dat = dat.loc[:, dat.columns.str.endswith('ALL')]
dat_transposed = dat.T
benchmark = dat_transposed.loc[:,['sample_id','METRIC.Precision','METRIC.Recall']]
benchmark.columns = ['Sample','Precision','Recall']

#output
fastqc_all.to_csv('fastqc.final.result.txt',sep="\t",index=0)
fastqscreen.to_csv('fastqscreen.final.result.txt',sep="\t",index=0)
qualimap_stat.to_csv('qualimap.final.result.txt',sep="\t",index=0)
benchmark.to_csv('benchmark.final.result.txt',sep="\t",index=0)







+ 92
- 0
codescripts/extract_vcf_information.py Vedi File

@@ -0,0 +1,92 @@
import sys,getopt
import os
import re
import fileinput
import pandas as pd

def usage():
print(
"""
Usage: python extract_vcf_information.py -i input_merged_vcf_file -o parsed_file

This script will extract SNVs and Indels information from the vcf files and output a tab-delimited files.

Input:
-i the selected vcf file

Output:
-o tab-delimited parsed file
""")

# select supported small variants
def process(oneLine):
line = oneLine.rstrip()
strings = line.strip().split('\t')
infoParsed = parse_INFO(strings[7])
formatKeys = strings[8].split(':')
formatValues = strings[9].split(':')
for i in range(0,len(formatKeys) -1) :
if formatKeys[i] == 'AD':
ra = formatValues[i].split(',')
infoParsed['RefDP'] = ra[0]
infoParsed['AltDP'] = ra[1]
if (int(ra[1]) + int(ra[0])) != 0:
infoParsed['af'] = float(int(ra[1])/(int(ra[1]) + int(ra[0])))
else:
pass
else:
infoParsed[formatKeys[i]] = formatValues[i]
infoParsed['chromo'] = strings[0]
infoParsed['pos'] = strings[1]
infoParsed['id'] = strings[2]
infoParsed['ref'] = strings[3]
infoParsed['alt'] = strings[4]
infoParsed['qual'] = strings[5]
return infoParsed


def parse_INFO(info):
strings = info.strip().split(';')
keys = []
values = []
for i in strings:
kv = i.split('=')
if kv[0] == 'DB':
keys.append('DB')
values.append('1')
elif kv[0] == 'AF':
pass
else:
keys.append(kv[0])
values.append(kv[1])
infoDict = dict(zip(keys, values))
return infoDict

opts,args = getopt.getopt(sys.argv[1:],"hi:o:")
for op,value in opts:
if op == "-i":
inputFile=value
elif op == "-o":
outputFile=value
elif op == "-h":
usage()
sys.exit()

if len(sys.argv[1:]) < 3:
usage()
sys.exit()

allDict = []
for line in fileinput.input(inputFile):
m = re.match('^\#',line)
if m is not None:
pass
else:
oneDict = process(line)
allDict.append(oneDict)

allTable = pd.DataFrame(allDict)

allTable.to_csv(outputFile,sep='\t',index=False)


+ 47
- 0
codescripts/filter_indel_over_50_cluster.py Vedi File

@@ -0,0 +1,47 @@
import sys,getopt
from itertools import islice

over_50_outfile = open("indel_lenth_over_50.txt",'w')
less_50_outfile = open("benchmark.men.vote.diffbed.lengthlessthan50.txt","w")

def process(line):
strings = line.strip().split('\t')
#d5
if ',' in strings[6]:
d5 = strings[6].split(',')
d5_len = [len(i) for i in d5]
d5_alt = max(d5_len)
else:
d5_alt = len(strings[6])
#d6
if ',' in strings[14]:
d6 = strings[14].split(',')
d6_len = [len(i) for i in d6]
d6_alt = max(d6_len)
else:
d6_alt = len(strings[14])
#f7
if ',' in strings[22]:
f7 = strings[22].split(',')
f7_len = [len(i) for i in f7]
f7_alt = max(f7_len)
else:
f7_alt = len(strings[22])
#m8
if ',' in strings[30]:
m8 = strings[30].split(',')
m8_len = [len(i) for i in m8]
m8_alt = max(m8_len)
else:
m8_alt = len(strings[30])
#ref
ref_len = len(strings[34])
if (d5_alt > 50) or (d6_alt > 50) or (f7_alt > 50) or (m8_alt > 50) or (ref_len > 50):
over_50_outfile.write(line)
else:
less_50_outfile.write(line)


input_file = open(sys.argv[1])
for line in islice(input_file, 1, None):
process(line)

+ 43
- 0
codescripts/filter_indel_over_50_mendelian.py Vedi File

@@ -0,0 +1,43 @@
from itertools import islice
import fileinput
import sys, argparse, os


# input arguments
parser = argparse.ArgumentParser(description="this script is to exclude indel over 50bp")

parser.add_argument('-i', '--mergedGVCF', type=str, help='merged gVCF txt with only chr, pos, ref, alt and genotypes', required=True)
parser.add_argument('-prefix', '--prefix', type=str, help='prefix of output file', required=True)


args = parser.parse_args()
input_dat = args.mergedGVCF
prefix = args.prefix


# output file
output_name = prefix + '.indel.lessthan50bp.txt'
outfile = open(output_name,'w')


def process(line):
strings = line.strip().split('\t')
#d5
if ',' in strings[3]:
alt = strings[3].split(',')
alt_len = [len(i) for i in alt]
alt_max = max(alt_len)
else:
alt_max = len(strings[3])
#ref
ref_len = len(strings[2])
if (alt_max > 50) or (ref_len > 50):
pass
else:
outfile.write(line)


for line in fileinput.input(input_dat):
process(line)



+ 36
- 0
codescripts/get_filtered_benchmark_vcfinfo.py Vedi File

@@ -0,0 +1,36 @@
from __future__ import division
import pandas as pd
import sys, argparse, os
import fileinput


# input arguments
parser = argparse.ArgumentParser(description="this script is to get filtered and benchmark vcf info")

parser.add_argument('-filtered', '--filtered', type=str, help='filtered position', required=True)
parser.add_argument('-benchmark', '--benchmark', type=str, help='benchmark position', required=True)
parser.add_argument('-vcf', '--vcf', type=str, help='one specific vcf', required=True)
parser.add_argument('-filename', '--filename', type=str, help='output file name', required=True)


args = parser.parse_args()
filtered = args.filtered
benchmark = args.benchmark
vcf = args.vcf
filename = args.filename


# output file
filtered_filename = filename + '.filtered.txt'
benchmark_filename = filename + '.benchmark.txt'

# input files
filtered_dat = pd.read_table(filtered,header=None)
benchmark_dat = pd.read_table(benchmark,header=None)
vcf_dat = pd.read_table(vcf)

filtered_merged_df = pd.merge(filtered_dat, vcf_dat, how='inner',left_on=[0,1], right_on = ['#CHROM','POS'])
benchmark_merged_df = pd.merge(benchmark_dat,vcf_dat, how='inner',left_on=[0,1], right_on = ['#CHROM','POS'])

filtered_merged_df.to_csv(filtered_filename,sep='\t',index=False)
benchmark_merged_df.to_csv(benchmark_filename,sep='\t',index=False)

+ 416
- 0
codescripts/high_confidence_call_vote.py Vedi File

@@ -0,0 +1,416 @@
from __future__ import division
import sys, argparse, os
import fileinput
import re
import pandas as pd
from operator import itemgetter
from collections import Counter
from itertools import islice
from numpy import *
import statistics

# input arguments
parser = argparse.ArgumentParser(description="this script is to count voting number")

parser.add_argument('-vcf', '--multi_sample_vcf', type=str, help='The VCF file you want to count the voting number', required=True)
parser.add_argument('-dup', '--dup_list', type=str, help='Duplication list', required=True)
parser.add_argument('-sample', '--sample_name', type=str, help='which sample of quartet', required=True)
parser.add_argument('-prefix', '--prefix', type=str, help='Prefix of output file name', required=True)

args = parser.parse_args()
multi_sample_vcf = args.multi_sample_vcf
dup_list = args.dup_list
prefix = args.prefix
sample_name = args.sample_name

vcf_header = '''##fileformat=VCFv4.2
##fileDate=20200331
##source=high_confidence_calls_intergration(choppy app)
##reference=GRCh38.d1.vd1
##INFO=<ID=location,Number=1,Type=String,Description="Repeat region">
##INFO=<ID=DETECTED,Number=1,Type=Integer,Description="Number of detected votes">
##INFO=<ID=VOTED,Number=1,Type=Integer,Description="Number of consnesus votes">
##INFO=<ID=FAM,Number=1,Type=Integer,Description="Number mendelian consisitent votes">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Sum depth of all samples">
##FORMAT=<ID=ALT,Number=1,Type=Integer,Description="Sum alternative depth of all samples">
##FORMAT=<ID=AF,Number=1,Type=Float,Description="Allele frequency, sum alternative depth / sum depth">
##FORMAT=<ID=GQ,Number=1,Type=Float,Description="Average genotype quality">
##FORMAT=<ID=QD,Number=1,Type=Float,Description="Average Variant Confidence/Quality by Depth">
##FORMAT=<ID=MQ,Number=1,Type=Float,Description="Average mapping quality">
##FORMAT=<ID=FS,Number=1,Type=Float,Description="Average Phred-scaled p-value using Fisher's exact test to detect strand bias">
##FORMAT=<ID=QUALI,Number=1,Type=Float,Description="Average variant quality">
##contig=<ID=chr1,length=248956422>
##contig=<ID=chr2,length=242193529>
##contig=<ID=chr3,length=198295559>
##contig=<ID=chr4,length=190214555>
##contig=<ID=chr5,length=181538259>
##contig=<ID=chr6,length=170805979>
##contig=<ID=chr7,length=159345973>
##contig=<ID=chr8,length=145138636>
##contig=<ID=chr9,length=138394717>
##contig=<ID=chr10,length=133797422>
##contig=<ID=chr11,length=135086622>
##contig=<ID=chr12,length=133275309>
##contig=<ID=chr13,length=114364328>
##contig=<ID=chr14,length=107043718>
##contig=<ID=chr15,length=101991189>
##contig=<ID=chr16,length=90338345>
##contig=<ID=chr17,length=83257441>
##contig=<ID=chr18,length=80373285>
##contig=<ID=chr19,length=58617616>
##contig=<ID=chr20,length=64444167>
##contig=<ID=chr21,length=46709983>
##contig=<ID=chr22,length=50818468>
##contig=<ID=chrX,length=156040895>
'''

vcf_header_all_sample = '''##fileformat=VCFv4.2
##fileDate=20200331
##reference=GRCh38.d1.vd1
##INFO=<ID=location,Number=1,Type=String,Description="Repeat region">
##INFO=<ID=DUP,Number=1,Type=Flag,Description="Duplicated variant records">
##INFO=<ID=DETECTED,Number=1,Type=Integer,Description="Number of detected votes">
##INFO=<ID=VOTED,Number=1,Type=Integer,Description="Number of consnesus votes">
##INFO=<ID=FAM,Number=1,Type=Integer,Description="Number mendelian consisitent votes">
##INFO=<ID=ALL_ALT,Number=1,Type=Float,Description="Sum of alternative reads of all samples">
##INFO=<ID=ALL_DP,Number=1,Type=Float,Description="Sum of depth of all samples">
##INFO=<ID=ALL_AF,Number=1,Type=Float,Description="Allele frequency of net alternatice reads and net depth">
##INFO=<ID=GQ_MEAN,Number=1,Type=Float,Description="Mean of genotype quality of all samples">
##INFO=<ID=QD_MEAN,Number=1,Type=Float,Description="Average Variant Confidence/Quality by Depth">
##INFO=<ID=MQ_MEAN,Number=1,Type=Float,Description="Mean of mapping quality of all samples">
##INFO=<ID=FS_MEAN,Number=1,Type=Float,Description="Average Phred-scaled p-value using Fisher's exact test to detect strand bias">
##INFO=<ID=QUAL_MEAN,Number=1,Type=Float,Description="Average variant quality">
##INFO=<ID=PCR,Number=1,Type=String,Description="Consensus of PCR votes">
##INFO=<ID=PCR_FREE,Number=1,Type=String,Description="Consensus of PCR-free votes">
##INFO=<ID=CONSENSUS,Number=1,Type=String,Description="Consensus calls">
##INFO=<ID=CONSENSUS_SEQ,Number=1,Type=String,Description="Consensus sequence">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=DP,Number=1,Type=String,Description="Depth">
##FORMAT=<ID=ALT,Number=1,Type=Integer,Description="Alternative Depth">
##FORMAT=<ID=AF,Number=1,Type=String,Description="Allele frequency">
##FORMAT=<ID=GQ,Number=1,Type=String,Description="Genotype quality">
##FORMAT=<ID=MQ,Number=1,Type=String,Description="Mapping quality">
##FORMAT=<ID=TWINS,Number=1,Type=String,Description="1 is twins shared, 0 is twins discordant ">
##FORMAT=<ID=TRIO5,Number=1,Type=String,Description="1 is LCL7, LCL8 and LCL5 mendelian consistent, 0 is mendelian vioaltion">
##FORMAT=<ID=TRIO6,Number=1,Type=String,Description="1 is LCL7, LCL8 and LCL6 mendelian consistent, 0 is mendelian vioaltion">
##contig=<ID=chr1,length=248956422>
##contig=<ID=chr2,length=242193529>
##contig=<ID=chr3,length=198295559>
##contig=<ID=chr4,length=190214555>
##contig=<ID=chr5,length=181538259>
##contig=<ID=chr6,length=170805979>
##contig=<ID=chr7,length=159345973>
##contig=<ID=chr8,length=145138636>
##contig=<ID=chr9,length=138394717>
##contig=<ID=chr10,length=133797422>
##contig=<ID=chr11,length=135086622>
##contig=<ID=chr12,length=133275309>
##contig=<ID=chr13,length=114364328>
##contig=<ID=chr14,length=107043718>
##contig=<ID=chr15,length=101991189>
##contig=<ID=chr16,length=90338345>
##contig=<ID=chr17,length=83257441>
##contig=<ID=chr18,length=80373285>
##contig=<ID=chr19,length=58617616>
##contig=<ID=chr20,length=64444167>
##contig=<ID=chr21,length=46709983>
##contig=<ID=chr22,length=50818468>
##contig=<ID=chrX,length=156040895>
'''
# read in duplication list
dup = pd.read_table(dup_list,header=None)
var_dup = dup[0].tolist()

# output file
benchmark_file_name = prefix + '_voted.vcf'
benchmark_outfile = open(benchmark_file_name,'w')

all_sample_file_name = prefix + '_all_sample_information.vcf'
all_sample_outfile = open(all_sample_file_name,'w')

# write VCF
outputcolumn = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t' + sample_name + '_benchmark_calls\n'
benchmark_outfile.write(vcf_header)
benchmark_outfile.write(outputcolumn)

outputcolumn_all_sample = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t'+ \
'Quartet_DNA_BGI_SEQ2000_BGI_1_20180518\tQuartet_DNA_BGI_SEQ2000_BGI_2_20180530\tQuartet_DNA_BGI_SEQ2000_BGI_3_20180530\t' + \
'Quartet_DNA_BGI_T7_WGE_1_20191105\tQuartet_DNA_BGI_T7_WGE_2_20191105\tQuartet_DNA_BGI_T7_WGE_3_20191105\t' + \
'Quartet_DNA_ILM_Nova_ARD_1_20181108\tQuartet_DNA_ILM_Nova_ARD_2_20181108\tQuartet_DNA_ILM_Nova_ARD_3_20181108\t' + \
'Quartet_DNA_ILM_Nova_ARD_4_20190111\tQuartet_DNA_ILM_Nova_ARD_5_20190111\tQuartet_DNA_ILM_Nova_ARD_6_20190111\t' + \
'Quartet_DNA_ILM_Nova_BRG_1_20180930\tQuartet_DNA_ILM_Nova_BRG_2_20180930\tQuartet_DNA_ILM_Nova_BRG_3_20180930\t' + \
'Quartet_DNA_ILM_Nova_WUX_1_20190917\tQuartet_DNA_ILM_Nova_WUX_2_20190917\tQuartet_DNA_ILM_Nova_WUX_3_20190917\t' + \
'Quartet_DNA_ILM_XTen_ARD_1_20170403\tQuartet_DNA_ILM_XTen_ARD_2_20170403\tQuartet_DNA_ILM_XTen_ARD_3_20170403\t' + \
'Quartet_DNA_ILM_XTen_NVG_1_20170329\tQuartet_DNA_ILM_XTen_NVG_2_20170329\tQuartet_DNA_ILM_XTen_NVG_3_20170329\t' + \
'Quartet_DNA_ILM_XTen_WUX_1_20170216\tQuartet_DNA_ILM_XTen_WUX_2_20170216\tQuartet_DNA_ILM_XTen_WUX_3_20170216\n'
all_sample_outfile.write(vcf_header_all_sample)
all_sample_outfile.write(outputcolumn_all_sample)



#function
def replace_nan(strings_list):
updated_list = []
for i in strings_list:
if i == '.':
updated_list.append('.:.:.:.:.:.:.:.:.:.:.:.')
else:
updated_list.append(i)
return updated_list

def remove_dot(strings_list):
updated_list = []
for i in strings_list:
if i == '.':
pass
else:
updated_list.append(i)
return updated_list

def detected_number(strings):
gt = [x.split(':')[0] for x in strings]
percentage = 27 - gt.count('.')
return(str(percentage))

def vote_number(strings,consensus_call):
gt = [x.split(':')[0] for x in strings]
gt = [x.replace('.','0/0') for x in gt]
gt = list(map(gt_uniform,[i for i in gt]))
vote_num = gt.count(consensus_call)
return(str(vote_num))

def family_vote(strings,consensus_call):
gt = [x.split(':')[0] for x in strings]
gt = [x.replace('.','0/0') for x in gt]
gt = list(map(gt_uniform,[i for i in gt]))
mendelian = [':'.join(x.split(':')[1:4]) for x in strings]
indices = [i for i, x in enumerate(gt) if x == consensus_call]
matched_mendelian = itemgetter(*indices)(mendelian)
mendelian_num = matched_mendelian.count('1:1:1')
return(str(mendelian_num))

def gt_uniform(strings):
uniformed_gt = ''
allele1 = strings.split('/')[0]
allele2 = strings.split('/')[1]
if int(allele1) > int(allele2):
uniformed_gt = allele2 + '/' + allele1
else:
uniformed_gt = allele1 + '/' + allele2
return uniformed_gt

def decide_by_rep(strings):
consensus_rep = ''
mendelian = [':'.join(x.split(':')[1:4]) for x in strings]
gt = [x.split(':')[0] for x in strings]
gt = [x.replace('.','0/0') for x in gt]
# modified gt turn 2/1 to 1/2
gt = list(map(gt_uniform,[i for i in gt]))
# mendelian consistent?
mendelian_dict = Counter(mendelian)
highest_mendelian = mendelian_dict.most_common(1)
candidate_mendelian = highest_mendelian[0][0]
freq_mendelian = highest_mendelian[0][1]
if (candidate_mendelian == '1:1:1') and (freq_mendelian >= 2):
gt_num_dict = Counter(gt)
highest_gt = gt_num_dict.most_common(1)
candidate_gt = highest_gt[0][0]
freq_gt = highest_gt[0][1]
if (candidate_gt != '0/0') and (freq_gt >= 2):
consensus_rep = candidate_gt
elif (candidate_gt == '0/0') and (freq_gt >= 2):
consensus_rep = '0/0'
else:
consensus_rep = 'inconGT'
elif (candidate_mendelian == '') and (freq_mendelian >= 2):
consensus_rep = 'noInfo'
else:
consensus_rep = 'inconMen'
return consensus_rep


def main():
for line in fileinput.input(multi_sample_vcf):
headline = re.match('^\#',line)
if headline is not None:
pass
else:
line = line.strip()
strings = line.split('\t')
variant_id = '_'.join([strings[0],strings[1]])
# check if the variants location is duplicated
if variant_id in var_dup:
strings[7] = strings[7] + ';DUP'
outLine = '\t'.join(strings) + '\n'
all_sample_outfile.write(outLine)
else:
# pre-define
pcr_consensus = '.'
pcr_free_consensus = '.'
consensus_call = '.'
consensus_alt_seq = '.'
# pcr
strings[9:] = replace_nan(strings[9:])
pcr = itemgetter(*[9,10,11,27,28,29,30,31,32,33,34,35])(strings)
SEQ2000 = decide_by_rep(pcr[0:3])
XTen_ARD = decide_by_rep(pcr[3:6])
XTen_NVG = decide_by_rep(pcr[6:9])
XTen_WUX = decide_by_rep(pcr[9:12])
sequence_site = [SEQ2000,XTen_ARD,XTen_NVG,XTen_WUX]
sequence_dict = Counter(sequence_site)
highest_sequence = sequence_dict.most_common(1)
candidate_sequence = highest_sequence[0][0]
freq_sequence = highest_sequence[0][1]
if freq_sequence > 2:
pcr_consensus = candidate_sequence
else:
pcr_consensus = 'inconSequenceSite'
# pcr-free
pcr_free = itemgetter(*[12,13,14,15,16,17,18,19,20,21,22,23,24,25,26])(strings)
T7_WGE = decide_by_rep(pcr_free[0:3])
Nova_ARD_1 = decide_by_rep(pcr_free[3:6])
Nova_ARD_2 = decide_by_rep(pcr_free[6:9])
Nova_BRG = decide_by_rep(pcr_free[9:12])
Nova_WUX = decide_by_rep(pcr_free[12:15])
sequence_site = [T7_WGE,Nova_ARD_1,Nova_ARD_2,Nova_BRG,Nova_WUX]
highest_sequence = sequence_dict.most_common(1)
candidate_sequence = highest_sequence[0][0]
freq_sequence = highest_sequence[0][1]
if freq_sequence > 3:
pcr_free_consensus = candidate_sequence
else:
pcr_free_consensus = 'inconSequenceSite'
# pcr and pcr-free
tag = ['inconGT','noInfo','inconMen','inconSequenceSite']
if (pcr_consensus == pcr_free_consensus) and (pcr_consensus not in tag) and (pcr_consensus != '0/0'):
consensus_call = pcr_consensus
VOTED = vote_number(strings[9:],consensus_call)
strings[7] = strings[7] + ';VOTED=' + VOTED
DETECTED = detected_number(strings[9:])
strings[7] = strings[7] + ';DETECTED=' + DETECTED
FAM = family_vote(strings[9:],consensus_call)
strings[7] = strings[7] + ';FAM=' + FAM
# Delete multiple alternative genotype to necessary expression
alt = strings[4]
alt_gt = alt.split(',')
if len(alt_gt) > 1:
allele1 = consensus_call.split('/')[0]
allele2 = consensus_call.split('/')[1]
if allele1 == '0':
allele2_seq = alt_gt[int(allele2) - 1]
consensus_alt_seq = allele2_seq
consensus_call = '0/1'
else:
allele1_seq = alt_gt[int(allele1) - 1]
allele2_seq = alt_gt[int(allele2) - 1]
if int(allele1) > int(allele2):
consensus_alt_seq = allele2_seq + ',' + allele1_seq
consensus_call = '1/2'
elif int(allele1) < int(allele2):
consensus_alt_seq = allele1_seq + ',' + allele2_seq
consensus_call = '1/2'
else:
consensus_alt_seq = allele1_seq
consensus_call = '1/1'
else:
consensus_alt_seq = alt
# GT:DP:ALT:AF:GQ:QD:MQ:FS:QUAL
# GT:TWINS:TRIO5:TRIO6:DP:ALT:AF:GQ:QD:MQ:FS:QUAL:rawGT
# DP
DP = [x.split(':')[4] for x in strings[9:]]
DP = remove_dot(DP)
DP = [int(x) for x in DP]
ALL_DP = sum(DP)
# AF
ALT = [x.split(':')[5] for x in strings[9:]]
ALT = remove_dot(ALT)
ALT = [int(x) for x in ALT]
ALL_ALT = sum(ALT)
ALL_AF = round(ALL_ALT/ALL_DP,2)
# GQ
GQ = [x.split(':')[7] for x in strings[9:]]
GQ = remove_dot(GQ)
GQ = [int(x) for x in GQ]
GQ_MEAN = round(mean(GQ),2)
# QD
QD = [x.split(':')[8] for x in strings[9:]]
QD = remove_dot(QD)
QD = [float(x) for x in QD]
QD_MEAN = round(mean(QD),2)
# MQ
MQ = [x.split(':')[9] for x in strings[9:]]
MQ = remove_dot(MQ)
MQ = [float(x) for x in MQ]
MQ_MEAN = round(mean(MQ),2)
# FS
FS = [x.split(':')[10] for x in strings[9:]]
FS = remove_dot(FS)
FS = [float(x) for x in FS]
FS_MEAN = round(mean(FS),2)
# QUAL
QUAL = [x.split(':')[11] for x in strings[9:]]
QUAL = remove_dot(QUAL)
QUAL = [float(x) for x in QUAL]
QUAL_MEAN = round(mean(QUAL),2)
# benchmark output
output_format = consensus_call + ':' + str(ALL_DP) + ':' + str(ALL_ALT) + ':' + str(ALL_AF) + ':' + str(GQ_MEAN) + ':' + str(QD_MEAN) + ':' + str(MQ_MEAN) + ':' + str(FS_MEAN) + ':' + str(QUAL_MEAN)
outLine = strings[0] + '\t' + strings[1] + '\t' + strings[2] + '\t' + strings[3] + '\t' + consensus_alt_seq + '\t' + '.' + '\t' + '.' + '\t' + strings[7] + '\t' + 'GT:DP:ALT:AF:GQ:QD:MQ:FS:QUAL' + '\t' + output_format + '\n'
benchmark_outfile.write(outLine)
# all sample output
strings[7] = strings[7] + ';ALL_ALT=' + str(ALL_ALT) + ';ALL_DP=' + str(ALL_DP) + ';ALL_AF=' + str(ALL_AF) \
+ ';GQ_MEAN=' + str(GQ_MEAN) + ';QD_MEAN=' + str(QD_MEAN) + ';MQ_MEAN=' + str(MQ_MEAN) + ';FS_MEAN=' + str(FS_MEAN) \
+ ';QUAL_MEAN=' + str(QUAL_MEAN) + ';PCR=' + consensus_call + ';PCR_FREE=' + consensus_call + ';CONSENSUS=' + consensus_call \
+ ';CONSENSUS_SEQ=' + consensus_alt_seq
all_sample_outLine = '\t'.join(strings) + '\n'
all_sample_outfile.write(all_sample_outLine)
elif (pcr_consensus in tag) and (pcr_free_consensus in tag):
consensus_call = 'filtered'
DETECTED = detected_number(strings[9:])
strings[7] = strings[7] + ';DETECTED=' + DETECTED
strings[7] = strings[7] + ';CONSENSUS=' + consensus_call
all_sample_outLine = '\t'.join(strings) + '\n'
all_sample_outfile.write(all_sample_outLine)
elif ((pcr_consensus == '0/0') or (pcr_consensus in tag)) and ((pcr_free_consensus not in tag) and (pcr_free_consensus != '0/0')):
consensus_call = 'pcr-free-speicifc'
DETECTED = detected_number(strings[9:])
strings[7] = strings[7] + ';DETECTED=' + DETECTED
strings[7] = strings[7] + ';CONSENSUS=' + consensus_call
all_sample_outLine = '\t'.join(strings) + '\n'
all_sample_outfile.write(all_sample_outLine)
elif ((pcr_consensus != '0/0') or (pcr_consensus not in tag)) and ((pcr_free_consensus in tag) and (pcr_free_consensus == '0/0')):
consensus_call = 'pcr-speicifc'
DETECTED = detected_number(strings[9:])
strings[7] = strings[7] + ';DETECTED=' + DETECTED
strings[7] = strings[7] + ';CONSENSUS=' + consensus_call + ';PCR=' + pcr_consensus + ';PCR_FREE=' + pcr_free_consensus
all_sample_outLine = '\t'.join(strings) + '\n'
all_sample_outfile.write(all_sample_outLine)
elif (pcr_consensus == '0/0') and (pcr_free_consensus == '0/0'):
consensus_call = 'confirm for parents'
DETECTED = detected_number(strings[9:])
strings[7] = strings[7] + ';DETECTED=' + DETECTED
strings[7] = strings[7] + ';CONSENSUS=' + consensus_call
all_sample_outLine = '\t'.join(strings) + '\n'
all_sample_outfile.write(all_sample_outLine)
else:
consensus_call = 'filtered'
DETECTED = detected_number(strings[9:])
strings[7] = strings[7] + ';DETECTED=' + DETECTED
strings[7] = strings[7] + ';CONSENSUS=' + consensus_call
all_sample_outLine = '\t'.join(strings) + '\n'
all_sample_outfile.write(all_sample_outLine)

if __name__ == '__main__':
main()













+ 42
- 0
codescripts/high_voted_mendelian_bed.py Vedi File

@@ -0,0 +1,42 @@
import pandas as pd
import sys, argparse, os
mut = pd.read_table(sys.argv[1])
outFile = open(sys.argv[2],'w')
for row in mut.itertuples():
#d5
if ',' in row.V4:
alt = row.V4.split(',')
alt_len = [len(i) for i in alt]
alt_max = max(alt_len)
else:
alt_max = len(row.V4)
#d6
alt = alt_max
ref = row.V3
pos = int(row.V2)
if len(ref) == 1 and alt == 1:
StartPos = int(pos) -1
EndPos = int(pos)
cate = 'SNV'
elif len(ref) > alt:
StartPos = int(pos) - 1
EndPos = int(pos) + (len(ref) - 1)
cate = 'INDEL'
elif alt > len(ref):
StartPos = int(pos) - 1
EndPos = int(pos) + (alt - 1)
cate = 'INDEL'
elif len(ref) == alt:
StartPos = int(pos) - 1
EndPos = int(pos) + (alt - 1)
cate = 'INDEL'
outline = row.V1 + '\t' + str(StartPos) + '\t' + str(EndPos) + '\t' + str(row.V2) + '\t' + cate + '\n'
outFile.write(outline)









+ 50
- 0
codescripts/how_many_samples.py Vedi File

@@ -0,0 +1,50 @@
import pandas as pd
import sys, argparse, os
from operator import itemgetter

parser = argparse.ArgumentParser(description="This script is to get how many samples")

parser.add_argument('-sample', '--sample', type=str, help='quartet_sample', required=True)
parser.add_argument('-rep', '--rep', type=str, help='quartet_rep', required=True)
args = parser.parse_args()

# Rename input:
sample = args.sample
rep = args.rep

quartet_sample = pd.read_table(sample,header=None)
quartet_sample = list(quartet_sample[0])
quartet_rep = pd.read_table(rep,header=None)
quartet_rep = quartet_rep[0]

#tags
sister_tag = 'false'
quartet_tag = 'false'

quartet_rep_unique = list(set(quartet_rep))

single_rep = [i for i in range(len(quartet_rep)) if quartet_rep[i] == quartet_rep_unique[0]]
single_batch_sample = itemgetter(*single_rep)(quartet_sample)

num = len(single_batch_sample)
if num == 1:
sister_tag = 'false'
quartet_tag = 'false'
elif num == 2:
if set(single_batch_sample) == set(['LCL5','LCL6']):
sister_tag = 'true'
quartet_tag = 'false'
elif num == 3:
if ('LCL5' in single_batch_sample) and ('LCL6' in single_batch_sample):
sister_tag = 'true'
quartet_tag = 'false'
elif num == 4:
if set(single_batch_sample) == set(['LCL5','LCL6','LCL7','LCL8']):
sister_tag = 'false'
quartet_tag = 'true'

sister_outfile = open('sister_tag','w')
quartet_outfile = open('quartet_tag','w')

sister_outfile.write(sister_tag)
quartet_outfile.write(quartet_tag)

+ 42
- 0
codescripts/lcl5_all_called_variants.py Vedi File

@@ -0,0 +1,42 @@
from __future__ import division
import sys, argparse, os
import pandas as pd
from collections import Counter

# input arguments
parser = argparse.ArgumentParser(description="this script is to merge mendelian and vcfinfo, and extract high_confidence_calls")
parser.add_argument('-vcf', '--vcf', type=str, help='merged multiple sample vcf', required=True)


args = parser.parse_args()
vcf = args.vcf

lcl5_outfile = open('LCL5_all_variants.txt','w')
filtered_outfile = open('LCL5_filtered_variants.txt','w')

vcf_dat = pd.read_table(vcf)


for row in vcf_dat.itertuples():
lcl5_list = [row.Quartet_DNA_BGI_SEQ2000_BGI_LCL5_1_20180518,row.Quartet_DNA_BGI_SEQ2000_BGI_LCL5_2_20180530,row.Quartet_DNA_BGI_SEQ2000_BGI_LCL5_3_20180530, \
row.Quartet_DNA_BGI_T7_WGE_LCL5_1_20191105,row.Quartet_DNA_BGI_T7_WGE_LCL5_2_20191105,row.Quartet_DNA_BGI_T7_WGE_LCL5_3_20191105, \
row.Quartet_DNA_ILM_Nova_ARD_LCL5_1_20181108,row.Quartet_DNA_ILM_Nova_ARD_LCL5_2_20181108,row.Quartet_DNA_ILM_Nova_ARD_LCL5_3_20181108, \
row.Quartet_DNA_ILM_Nova_ARD_LCL5_4_20190111,row.Quartet_DNA_ILM_Nova_ARD_LCL5_5_20190111,row.Quartet_DNA_ILM_Nova_ARD_LCL5_6_20190111, \
row.Quartet_DNA_ILM_Nova_BRG_LCL5_1_20180930,row.Quartet_DNA_ILM_Nova_BRG_LCL5_2_20180930,row.Quartet_DNA_ILM_Nova_BRG_LCL5_3_20180930, \
row.Quartet_DNA_ILM_Nova_WUX_LCL5_1_20190917,row.Quartet_DNA_ILM_Nova_WUX_LCL5_2_20190917,row.Quartet_DNA_ILM_Nova_WUX_LCL5_3_20190917, \
row.Quartet_DNA_ILM_XTen_ARD_LCL5_1_20170403,row.Quartet_DNA_ILM_XTen_ARD_LCL5_2_20170403,row.Quartet_DNA_ILM_XTen_ARD_LCL5_3_20170403, \
row.Quartet_DNA_ILM_XTen_NVG_LCL5_1_20170329,row.Quartet_DNA_ILM_XTen_NVG_LCL5_2_20170329,row.Quartet_DNA_ILM_XTen_NVG_LCL5_3_20170329, \
row.Quartet_DNA_ILM_XTen_WUX_LCL5_1_20170216,row.Quartet_DNA_ILM_XTen_WUX_LCL5_2_20170216,row.Quartet_DNA_ILM_XTen_WUX_LCL5_3_20170216]
lcl5_vcf_gt = [x.split(':')[0] for x in lcl5_list]
lcl5_gt=[item.replace('./.', '0/0') for item in lcl5_vcf_gt]
gt_dict = Counter(lcl5_gt)
highest_gt = gt_dict.most_common(1)
candidate_gt = highest_gt[0][0]
freq_gt = highest_gt[0][1]
output = row._1 + '\t' + str(row.POS) + '\t' + '\t'.join(lcl5_gt) + '\n'
if (candidate_gt == '0/0') and (freq_gt == 27):
filtered_outfile.write(output)
else:
lcl5_outfile.write(output)



+ 6
- 0
codescripts/linux_command.sh Vedi File

@@ -0,0 +1,6 @@
cat benchmark.men.vote.diffbed.filtered | awk '{print $1"\t"$2"\t"".""\t"$35"\t"$7"\t.\t.\t.\tGT\t"$6}' | grep -v '2_y' > LCL5.body
cat benchmark.men.vote.diffbed.filtered | awk '{print $1"\t"$2"\t"".""\t"$35"\t"$15"\t.\t.\t.\tGT\t"$14}' | grep -v '2_y' > LCL6.body
cat benchmark.men.vote.diffbed.filtered | awk '{print $1"\t"$2"\t"".""\t"$35"\t"$23"\t.\t.\t.\tGT\t"$22}' | grep -v '2_y' > LCL7.body
cat benchmark.men.vote.diffbed.filtered | awk '{print $1"\t"$2"\t"".""\t"$35"\t"$31"\t.\t.\t.\tGT\t"$30}' | grep -v '2_y' > LCL8.body

for i in *txt; do cat $i | awk '{ if ((length($3) == 1) && (length($4) == 1)) { print } }' | grep -v '#' | cut -f3,4 | sort |uniq -c | sed 's/\s\+/\t/g' | cut -f2 > $i.mut; done

+ 129
- 0
codescripts/merge_mendelian_vcfinfo.py Vedi File

@@ -0,0 +1,129 @@
from __future__ import division
import pandas as pd
import sys, argparse, os
import fileinput
import re

# input arguments
parser = argparse.ArgumentParser(description="this script is to get final high confidence calls and information of all replicates")

parser.add_argument('-vcfInfo', '--vcfInfo', type=str, help='The txt file of variants information, this file is named as prefix__variant_quality_location.txt', required=True)
parser.add_argument('-mendelianInfo', '--mendelianInfo', type=str, help='The merged mendelian information of all samples', required=True)
parser.add_argument('-sample', '--sample_name', type=str, help='which sample of quartet', required=True)


args = parser.parse_args()
vcfInfo = args.vcfInfo
mendelianInfo = args.mendelianInfo
sample_name = args.sample_name


#GT:TWINS:TRIO5:TRIO6:DP:AF:GQ:QD:MQ:FS:QUAL

vcf_header = '''##fileformat=VCFv4.2
##fileDate=20200331
##source=high_confidence_calls_intergration(choppy app)
##reference=GRCh38.d1.vd1
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=TWINS,Number=1,Type=Flag,Description="1 for sister consistent, 0 for sister different">
##FORMAT=<ID=TRIO5,Number=1,Type=Flag,Description="1 for LCL7, LCL8 and LCL5 mendelian consistent, 0 for family violation">
##FORMAT=<ID=TRIO6,Number=1,Type=Flag,Description="1 for LCL7, LCL8 and LCL6 mendelian consistent, 0 for family violation">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Depth">
##FORMAT=<ID=ALT,Number=1,Type=Integer,Description="Alternative Depth">
##FORMAT=<ID=AF,Number=1,Type=Float,Description="Allele frequency">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype quality">
##FORMAT=<ID=QD,Number=1,Type=Float,Description="Variant Confidence/Quality by Depth">
##FORMAT=<ID=MQ,Number=1,Type=Float,Description="Mapping quality">
##FORMAT=<ID=FS,Number=1,Type=Float,Description="Phred-scaled p-value using Fisher's exact test to detect strand bias">
##FORMAT=<ID=QUAL,Number=1,Type=Float,Description="variant quality">
##contig=<ID=chr1,length=248956422>
##contig=<ID=chr2,length=242193529>
##contig=<ID=chr3,length=198295559>
##contig=<ID=chr4,length=190214555>
##contig=<ID=chr5,length=181538259>
##contig=<ID=chr6,length=170805979>
##contig=<ID=chr7,length=159345973>
##contig=<ID=chr8,length=145138636>
##contig=<ID=chr9,length=138394717>
##contig=<ID=chr10,length=133797422>
##contig=<ID=chr11,length=135086622>
##contig=<ID=chr12,length=133275309>
##contig=<ID=chr13,length=114364328>
##contig=<ID=chr14,length=107043718>
##contig=<ID=chr15,length=101991189>
##contig=<ID=chr16,length=90338345>
##contig=<ID=chr17,length=83257441>
##contig=<ID=chr18,length=80373285>
##contig=<ID=chr19,length=58617616>
##contig=<ID=chr20,length=64444167>
##contig=<ID=chr21,length=46709983>
##contig=<ID=chr22,length=50818468>
##contig=<ID=chrX,length=156040895>
'''

# output file
file_name = sample_name + '_mendelian_vcfInfo.vcf'
outfile = open(file_name,'w')

outputcolumn = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t' + sample_name + '\n'
outfile.write(vcf_header)
outfile.write(outputcolumn)

# input files
vcf_info = pd.read_table(vcfInfo)
mendelian_info = pd.read_table(mendelianInfo)

merged_df = pd.merge(vcf_info, mendelian_info, how='outer', left_on=['#CHROM','POS'], right_on = ['#CHROM','POS'])
merged_df = merged_df.fillna('.')

#
def parse_INFO(info):
strings = info.strip().split(';')
keys = []
values = []
for i in strings:
kv = i.split('=')
if kv[0] == 'DB':
keys.append('DB')
values.append('1')
else:
keys.append(kv[0])
values.append(kv[1])
infoDict = dict(zip(keys, values))
return infoDict
#
for row in merged_df.itertuples():
if row[18] != '.':
# format
# GT:TWINS:TRIO5:TRIO6:DP:AF:GQ:QD:MQ:FS:QUAL
FORMAT_x = row[10].split(':')
ALT = int(FORMAT_x[1].split(',')[1])
if int(FORMAT_x[2]) != 0:
AF = round(ALT/int(FORMAT_x[2]),2)
else:
AF = '.'
INFO_x = parse_INFO(row.INFO_x)
if FORMAT_x[2] == '0':
INFO_x['QD'] = '.'
else:
pass
FORMAT = row[18] + ':' + FORMAT_x[2] + ':' + str(ALT) + ':' + str(AF) + ':' + FORMAT_x[3] + ':' + INFO_x['QD'] + ':' + INFO_x['MQ'] + ':' + INFO_x['FS'] + ':' + str(row.QUAL_x)
# outline
outline = row._1 + '\t' + str(row.POS) + '\t' + row.ID_x + '\t' + row.REF_y + '\t' + row.ALT_y + '\t' + '.' + '\t' + '.' + '\t' + '.' + '\t' + 'GT:TWINS:TRIO5:TRIO6:DP:ALT:AF:GQ:QD:MQ:FS:QUAL' + '\t' + FORMAT + '\n'
else:
rawGT = row[10].split(':')
FORMAT_x = row[10].split(':')
ALT = int(FORMAT_x[1].split(',')[1])
if int(FORMAT_x[2]) != 0:
AF = round(ALT/int(FORMAT_x[2]),2)
else:
AF = '.'
INFO_x = parse_INFO(row.INFO_x)
if FORMAT_x[2] == '0':
INFO_x['QD'] = '.'
else:
pass
FORMAT = '.:.:.:.' + ':' + FORMAT_x[2] + ':' + str(ALT) + ':' + str(AF) + ':' + FORMAT_x[3] + ':' + INFO_x['QD'] + ':' + INFO_x['MQ'] + ':' + INFO_x['FS'] + ':' + str(row.QUAL_x) + ':' + rawGT[0]
# outline
outline = row._1 + '\t' + str(row.POS) + '\t' + row.ID_x + '\t' + row.REF_x + '\t' + row.ALT_x + '\t' + '.' + '\t' + '.' + '\t' + '.' + '\t' + 'GT:TWINS:TRIO5:TRIO6:DP:ALT:AF:GQ:QD:MQ:FS:QUAL:rawGT' + '\t' + FORMAT + '\n'
outfile.write(outline)

+ 71
- 0
codescripts/merge_two_family.py Vedi File

@@ -0,0 +1,71 @@
from __future__ import division
import pandas as pd
import sys, argparse, os
import fileinput
import re

# input arguments
parser = argparse.ArgumentParser(description="this script is to extract mendelian concordance information")

parser.add_argument('-LCL5', '--LCL5', type=str, help='LCL5 family info', required=True)
parser.add_argument('-LCL6', '--LCL6', type=str, help='LCL6 family info', required=True)
parser.add_argument('-family', '--family', type=str, help='family name', required=True)


args = parser.parse_args()
lcl5 = args.LCL5
lcl6 = args.LCL6
family = args.family


# output file
family_name = family + '.txt'

family_file = open(family_name,'w')

# input files
lcl5_dat = pd.read_table(lcl5)
lcl6_dat = pd.read_table(lcl6)

merged_df = pd.merge(lcl5_dat, lcl6_dat, how='outer', left_on=['#CHROM','POS'], right_on = ['#CHROM','POS'])

def alt_seq(alt, genotype):
if genotype == './.':
seq = './.'
elif genotype == '0/0':
seq = '0/0'
else:
alt = alt.split(',')
genotype = genotype.split('/')
if genotype[0] == '0':
allele2 = alt[int(genotype[1]) - 1]
seq = '0/' + allele2
else:
allele1 = alt[int(genotype[0]) - 1]
allele2 = alt[int(genotype[1]) - 1]
seq = allele1 + '/' + allele2
return seq

for row in merged_df.itertuples():
# correction of multiallele
if pd.isnull(row.INFO_x) == True or pd.isnull(row.INFO_y) == True:
mendelian = '.'
else:
lcl5_seq = alt_seq(row.ALT_x, row.CHILD_x)
lcl6_seq = alt_seq(row.ALT_y, row.CHILD_y)
if lcl5_seq == lcl6_seq:
mendelian = '1'
else:
mendelian = '0'
if pd.isnull(row.INFO_x) == True:
mendelian = mendelian + ':.'
else:
mendelian = mendelian + ':' + row.INFO_x.split('=')[1]
if pd.isnull(row.INFO_y) == True:
mendelian = mendelian + ':.'
else:
mendelian = mendelian + ':' + row.INFO_y.split('=')[1]


outline = row._1 + '\t' + str(row.POS) + '\t' + mendelian + '\n'
family_file.write(outline)

+ 115
- 0
codescripts/merge_two_family_with_genotype.py Vedi File

@@ -0,0 +1,115 @@
from __future__ import division
import pandas as pd
import sys, argparse, os
import fileinput
import re

# input arguments
parser = argparse.ArgumentParser(description="this script is to extract mendelian concordance information")

parser.add_argument('-LCL5', '--LCL5', type=str, help='LCL5 family info', required=True)
parser.add_argument('-LCL6', '--LCL6', type=str, help='LCL6 family info', required=True)
parser.add_argument('-genotype', '--genotype', type=str, help='Genotype information of a set of four family members', required=True)
parser.add_argument('-family', '--family', type=str, help='family name', required=True)


args = parser.parse_args()
lcl5 = args.LCL5
lcl6 = args.LCL6
genotype = args.genotype
family = args.family


# output file
family_name = family + '.txt'

family_file = open(family_name,'w')

summary_name = family + '.summary.txt'

summary_file = open(summary_name,'w')

# input files
lcl5_dat = pd.read_table(lcl5)
lcl6_dat = pd.read_table(lcl6)
genotype_dat = pd.read_table(genotype)
merged_df = pd.merge(lcl5_dat, lcl6_dat, how='outer', left_on=['#CHROM','POS'], right_on = ['#CHROM','POS'])
merged_genotype_df = pd.merge(merged_df, genotype_dat, how='outer', left_on=['#CHROM','POS'], right_on = ['#CHROM','POS'])

merged_genotype_df_sub = merged_genotype_df.iloc[:,[0,1,23,24,29,30,31,32,7,17]]
merged_genotype_df_sub.columns = ['CHROM', 'POS', 'REF', 'ALT','LCL5','LCL6','LCL7','LCL8', 'TRIO5', 'TRIO6']

sister_same = 0
sister_diff = 0
family_all = 0
family_mendelian = 0

for row in merged_genotype_df_sub.itertuples():
# sister
if row.LCL5 == row.LCL6:
if row.LCL5 == './.':
mendelian = 'noInfo'
sister_count = "no"
elif row.LCL5 == '0/0':
mendelian = 'Ref'
sister_count = "no"
else:
mendelian = '1'
sister_count = "yes_same"
else:
mendelian = '0'
if (row.LCL5 == './.' or row.LCL5 == '0/0') and (row.LCL6 == './.' or row.LCL6 == '0/0'):
sister_count = "no"
else:
sister_count = "yes_diff"
if sister_count == 'yes_same':
sister_same += 1
elif sister_count == 'yes_diff':
sister_diff += 1
else:
pass
# family trio5
if row.LCL5 == row. LCL7 == row.LCL8 == './.':
mendelian = mendelian + ':noInfo'
elif row.LCL5 == row. LCL7 == row.LCL8 == '0/0':
mendelian = mendelian + ':Ref'
elif pd.isnull(row.TRIO5) == True:
mendelian = mendelian + ':unVBT'
else:
mendelian = mendelian + ':' + row.TRIO5.split('=')[1]
# family trio6
if row.LCL6 == row.LCL7 == row.LCL8 == './.':
mendelian = mendelian + ':noInfo'
elif row.LCL6 == row. LCL7 == row.LCL8 == '0/0':
mendelian = mendelian + ':Ref'
elif pd.isnull(row.TRIO6) == True:
mendelian = mendelian + ':unVBT'
else:
mendelian = mendelian + ':' + row.TRIO6.split('=')[1]
# not count into family
if (row.LCL5 == './.' or row.LCL5 == '0/0') and (row.LCL6 == './.' or row.LCL6 == '0/0') and (row.LCL7 == './.' or row.LCL7 == '0/0') and (row.LCL8 == './.' or row.LCL8 == '0/0'):
mendelian_count = "no"
else:
mendelian_count = "yes"
outline = row.CHROM + '\t' + str(row.POS) + '\t' + row.REF + '\t' + row.ALT + '\t' + row.LCL5 + '\t' + row.LCL6 + '\t' + row.LCL7 + '\t' + row.LCL8 + '\t' + str(row.TRIO5) + '\t' + str(row.TRIO6) + '\t' + str(mendelian) + '\t' + str(mendelian_count) + '\t' + str(sister_count) + '\n'
family_file.write(outline)
if mendelian_count == 'yes':
family_all += 1
else:
pass
if mendelian == '1:1:1':
family_mendelian += 1
elif mendelian == 'Ref:1:1':
family_mendelian += 1
else:
pass

sister = sister_same/(sister_same + sister_diff)
quartet = family_mendelian/family_all
outcolumn = 'Family\tReproducibility_D5_D6\tMendelian_Concordance_Quartet\n'
outResult = family + '\t' + str(sister) + '\t' + str(quartet) + '\n'
summary_file.write(outcolumn)
summary_file.write(outResult)




+ 109
- 0
codescripts/oneClass.py Vedi File

@@ -0,0 +1,109 @@
# import modules
import numpy as np
import pandas as pd
from sklearn import svm
from sklearn import preprocessing
import sys, argparse, os
from vcf2bed import position_to_bed,padding_region



parser = argparse.ArgumentParser(description="this script is to preform one calss svm on each chromosome")

parser.add_argument('-train', '--trainDataset', type=str, help='training dataset generated from extracting vcf information part, with mutaitons supported by callsets', required=True)
parser.add_argument('-test', '--testDataset', type=str, help='testing dataset generated from extracting vcf information part, with mutaitons not called by all callsets', required=True)
parser.add_argument('-name', '--sampleName', type=str, help='sample name for output file name', required=True)

args = parser.parse_args()

# Rename input:
train_input = args.trainDataset
test_input = args.testDataset
sample_name = args.sampleName

# default columns, which will be included in the included in the calssifier
chromosome = ['chr1','chr2','chr3','chr4','chr5','chr6','chr7','chr8','chr9','chr10','chr11','chr12','chr13','chr14','chr15' ,'chr16','chr17','chr18','chr19','chr20','chr21','chr22','chrX','chrY']
feature_heter_cols = ['AltDP','BaseQRankSum','DB','DP','FS','GQ','MQ','MQRankSum','QD','ReadPosRankSum','RefDP','SOR','af']
feature_homo_cols = ['AltDP','DB','DP','FS','GQ','MQ','QD','RefDP','SOR','af']


# import datasets sepearate the records with or without BaseQRankSum annotation, etc.
def load_dat(dat_file_name):
dat = pd.read_table(dat_file_name)
dat['DB'] = dat['DB'].fillna(0)
dat = dat[dat['DP'] != 0]
dat['af'] = dat['AltDP']/(dat['AltDP'] + dat['RefDP'])
homo_rows = dat[dat['BaseQRankSum'].isnull()]
heter_rows = dat[dat['BaseQRankSum'].notnull()]
return homo_rows,heter_rows


train_homo,train_heter = load_dat(train_input)
test_homo,test_heter = load_dat(test_input)
clf = svm.OneClassSVM(nu=0.05,kernel='rbf', gamma='auto_deprecated',cache_size=500)

def prepare_dat(train_dat,test_dat,feature_cols,chromo):
chr_train = train_dat[train_dat['chromo'] == chromo]
chr_test = test_dat[test_dat['chromo'] == chromo]
train_dat = chr_train.loc[:,feature_cols]
test_dat = chr_test.loc[:,feature_cols]
train_dat_scaled = preprocessing.scale(train_dat)
test_dat_scaled = preprocessing.scale(test_dat)
return chr_test,train_dat_scaled,test_dat_scaled

def oneclass(X_train,X_test,chr_test):
clf.fit(X_train)
y_pred_test = clf.predict(X_test)
test_true_dat = chr_test[y_pred_test == 1]
test_false_dat = chr_test[y_pred_test == -1]
return test_true_dat,test_false_dat

predicted_true = pd.DataFrame(columns=train_homo.columns)
predicted_false = pd.DataFrame(columns=train_homo.columns)

for chromo in chromosome:
# homo datasets
chr_test_homo,X_train_homo,X_test_homo = prepare_dat(train_homo,test_homo,feature_homo_cols,chromo)
test_true_homo,test_false_homo = oneclass(X_train_homo,X_test_homo,chr_test_homo)
predicted_true = predicted_true.append(test_true_homo)
predicted_false = predicted_false.append(test_false_homo)
# heter datasets
chr_test_heter,X_train_heter,X_test_heter = prepare_dat(train_heter,test_heter,feature_heter_cols,chromo)
test_true_heter,test_false_heter = oneclass(X_train_heter,X_test_heter,chr_test_heter)
predicted_true = predicted_true.append(test_true_heter)
predicted_false = predicted_false.append(test_false_heter)

predicted_true_filename = sample_name + '_predicted_true.txt'
predicted_false_filename = sample_name + '_predicted_false.txt'

predicted_true.to_csv(predicted_true_filename,sep='\t',index=False)
predicted_false.to_csv(predicted_false_filename,sep='\t',index=False)

# output the bed file and padding bed region 50bp

predicted_true_bed_filename = sample_name + '_predicted_true.bed'
predicted_false_bed_filename = sample_name + '_predicted_false.bed'
padding_filename = sample_name + '_padding.bed'

predicted_true_bed = open(predicted_true_bed_filename,'w')
predicted_false_bed = open(predicted_false_bed_filename,'w')
padding = open(padding_filename,'w')

#
for index,row in predicted_false.iterrows():
chromo,pos1,pos2 = position_to_bed(row['chromo'],row['pos'],row['ref'],row['alt'])
outline_pos = chromo + '\t' + str(pos1) + '\t' + str(pos2) + '\n'
predicted_false_bed.write(outline_pos)
chromo,pad_pos1,pad_pos2,pad_pos3,pad_pos4 = padding_region(chromo,pos1,pos2,50)
outline_pad_1 = chromo + '\t' + str(pad_pos1) + '\t' + str(pad_pos2) + '\n'
outline_pad_2 = chromo + '\t' + str(pad_pos3) + '\t' + str(pad_pos4) + '\n'
padding.write(outline_pad_1)
padding.write(outline_pad_2)

for index,row in predicted_true.iterrows():
chromo,pos1,pos2 = position_to_bed(row['chromo'],row['pos'],row['ref'],row['alt'])
outline_pos = chromo + '\t' + str(pos1) + '\t' + str(pos2) + '\n'
predicted_true_bed.write(outline_pos)



+ 144
- 0
codescripts/reformVCF.py Vedi File

@@ -0,0 +1,144 @@
# import modules
import sys, argparse, os
import fileinput
import re

parser = argparse.ArgumentParser(description="This script is to split samples in VCF files and rewrite to the right style")

parser.add_argument('-vcf', '--familyVCF', type=str, help='VCF with sister and mendelian infomation', required=True)
parser.add_argument('-name', '--familyName', type=str, help='Family name of the VCF file', required=True)

args = parser.parse_args()

# Rename input:
inputFile = args.familyVCF
family_name = args.familyName

# output filename
LCL5_name = family_name + '.LCL5.vcf'
LCL5file = open(LCL5_name,'w')
LCL6_name = family_name + '.LCL6.vcf'
LCL6file = open(LCL6_name,'w')
LCL7_name = family_name + '.LCL7.vcf'
LCL7file = open(LCL7_name,'w')
LCL8_name = family_name + '.LCL8.vcf'
LCL8file = open(LCL8_name,'w')
family_filename = family_name + '.vcf'
familyfile = open(family_filename,'w')

# default columns, which will be included in the included in the calssifier
vcfheader = '''##fileformat=VCFv4.2
##FILTER=<ID=PASS,Description="the same genotype between twin sister and mendelian consistent in 578 and 678">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=TWINS,Number=0,Type=Flag,Description="0 for sister consistent, 1 for sister inconsistent">
##FORMAT=<ID=TRIO5,Number=0,Type=Flag,Description="0 for trio consistent, 1 for trio inconsistent">
##FORMAT=<ID=TRIO6,Number=0,Type=Flag,Description="0 for trio consistent, 1 for trio inconsistent">
##contig=<ID=chr1,length=248956422>
##contig=<ID=chr2,length=242193529>
##contig=<ID=chr3,length=198295559>
##contig=<ID=chr4,length=190214555>
##contig=<ID=chr5,length=181538259>
##contig=<ID=chr6,length=170805979>
##contig=<ID=chr7,length=159345973>
##contig=<ID=chr8,length=145138636>
##contig=<ID=chr9,length=138394717>
##contig=<ID=chr10,length=133797422>
##contig=<ID=chr11,length=135086622>
##contig=<ID=chr12,length=133275309>
##contig=<ID=chr13,length=114364328>
##contig=<ID=chr14,length=107043718>
##contig=<ID=chr15,length=101991189>
##contig=<ID=chr16,length=90338345>
##contig=<ID=chr17,length=83257441>
##contig=<ID=chr18,length=80373285>
##contig=<ID=chr19,length=58617616>
##contig=<ID=chr20,length=64444167>
##contig=<ID=chr21,length=46709983>
##contig=<ID=chr22,length=50818468>
##contig=<ID=chrX,length=156040895>
'''
# write VCF
LCL5colname = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t'+family_name+'_LCL5'+'\n'
LCL5file.write(vcfheader)
LCL5file.write(LCL5colname)

LCL6colname = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t'+family_name+'_LCL6'+'\n'
LCL6file.write(vcfheader)
LCL6file.write(LCL6colname)

LCL7colname = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t'+family_name+'_LCL7'+'\n'
LCL7file.write(vcfheader)
LCL7file.write(LCL7colname)

LCL8colname = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t'+family_name+'_LCL8'+'\n'
LCL8file.write(vcfheader)
LCL8file.write(LCL8colname)

familycolname = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t'+'LCL5\t'+'LCL6\t'+'LCL7\t'+'LCL8'+'\n'
familyfile.write(vcfheader)
familyfile.write(familycolname)

# reform VCF
def process(oneLine):
line = oneLine.rstrip()
strings = line.strip().split('\t')
# replace .
# LCL6 uniq
if strings[11] == '.':
strings[11] = '0/0'
strings[9] = strings[12]
strings[10] = strings[13]
else:
pass
# LCL5 uniq
if strings[14] == '.':
strings[14] = '0/0'
strings[12] = strings[9]
strings[13] = strings[10]
else:
pass
# sister
if strings[11] == strings[14]:
add_format = ":1"
else:
add_format = ":0"
# trioLCL5
if strings[15] == 'MD=1':
add_format = add_format + ":1"
else:
add_format = add_format + ":0"
# trioLCL6
if strings[7] == 'MD=1':
add_format = add_format + ":1"
else:
add_format = add_format + ":0"
# filter
if (strings[11] == strings[14]) and (strings[15] == 'MD=1') and (strings[7] == 'MD=1'):
strings[6] = 'PASS'
else:
strings[6] = '.'
# output LCL5
LCL5outLine = strings[0]+'\t'+strings[1]+'\t'+strings[2]+'\t'+strings[3]+'\t'+strings[4]+'\t'+'.'+'\t'+strings[6]+'\t'+ '.' +'\t'+ 'GT:TWINS:TRIO5:TRIO6' + '\t' + strings[14] + add_format + '\n'
LCL5file.write(LCL5outLine)
# output LCL6
LCL6outLine = strings[0]+'\t'+strings[1]+'\t'+strings[2]+'\t'+strings[3]+'\t'+strings[4]+'\t'+'.'+'\t'+strings[6]+'\t'+ '.' +'\t'+ 'GT:TWINS:TRIO5:TRIO6' + '\t' + strings[11] + add_format + '\n'
LCL6file.write(LCL6outLine)
# output LCL7
LCL7outLine = strings[0]+'\t'+strings[1]+'\t'+strings[2]+'\t'+strings[3]+'\t'+strings[4]+'\t'+'.'+'\t'+strings[6]+'\t'+ '.' +'\t'+ 'GT:TWINS:TRIO5:TRIO6' + '\t' + strings[10] + add_format + '\n'
LCL7file.write(LCL7outLine)
# output LCL8
LCL8outLine = strings[0]+'\t'+strings[1]+'\t'+strings[2]+'\t'+strings[3]+'\t'+strings[4]+'\t'+'.'+'\t'+strings[6]+'\t'+ '.' +'\t'+ 'GT:TWINS:TRIO5:TRIO6' + '\t' + strings[9] + add_format + '\n'
LCL8file.write(LCL8outLine)
# output family
familyoutLine = strings[0]+'\t'+strings[1]+'\t'+strings[2]+'\t'+strings[3]+'\t'+strings[4]+'\t'+ '.'+'\t'+strings[6]+'\t'+ '.' +'\t'+ 'GT:TWINS:TRIO5:TRIO6' + '\t' + strings[14] + add_format +'\t' + strings[11] + add_format + '\t' + strings[10] + add_format +'\t' + strings[9] + add_format + '\n'
familyfile.write(familyoutLine)


for line in fileinput.input(inputFile):
m = re.match('^\#',line)
if m is not None:
pass
else:
process(line)


+ 227
- 0
codescripts/replicates_consensus.py Vedi File

@@ -0,0 +1,227 @@
from __future__ import division
from glob import glob
import sys, argparse, os
import fileinput
import re
import pandas as pd
from operator import itemgetter
from collections import Counter
from itertools import islice
from numpy import *
import statistics

# input arguments
parser = argparse.ArgumentParser(description="this script is to merge mendelian and vcfinfo, and extract high_confidence_calls")

parser.add_argument('-prefix', '--prefix', type=str, help='prefix of output file', required=True)
parser.add_argument('-vcf', '--vcf', type=str, help='merged multiple sample vcf', required=True)


args = parser.parse_args()
prefix = args.prefix
vcf = args.vcf

# input files
vcf_dat = pd.read_table(vcf)

# all info
all_file_name = prefix + "_all_summary.txt"
all_sample_outfile = open(all_file_name,'w')
all_info_col = 'CHROM\tPOS\tREF\tALT\tLCL5_consensus_calls\tLCL5_detect_number\tLCL5_same_diff\tLCL6_consensus_calls\tLCL6_detect_number\tLCl6_same_diff\tLCL7_consensus_calls\tLCL7_detect_number\tLCL7_same_diff\tLCL8_consensus_calls\tLCL8_detect_number\tLCL8_same_diff\n'
all_sample_outfile.write(all_info_col)

# filtered info
vcf_header = '''##fileformat=VCFv4.2
##fileDate=20200501
##source=high_confidence_calls_intergration(choppy app)
##reference=GRCh38.d1.vd1
##INFO=<ID=VOTED,Number=1,Type=Integer,Description="Number mendelian consisitent votes">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##contig=<ID=chr1,length=248956422>
##contig=<ID=chr2,length=242193529>
##contig=<ID=chr3,length=198295559>
##contig=<ID=chr4,length=190214555>
##contig=<ID=chr5,length=181538259>
##contig=<ID=chr6,length=170805979>
##contig=<ID=chr7,length=159345973>
##contig=<ID=chr8,length=145138636>
##contig=<ID=chr9,length=138394717>
##contig=<ID=chr10,length=133797422>
##contig=<ID=chr11,length=135086622>
##contig=<ID=chr12,length=133275309>
##contig=<ID=chr13,length=114364328>
##contig=<ID=chr14,length=107043718>
##contig=<ID=chr15,length=101991189>
##contig=<ID=chr16,length=90338345>
##contig=<ID=chr17,length=83257441>
##contig=<ID=chr18,length=80373285>
##contig=<ID=chr19,length=58617616>
##contig=<ID=chr20,length=64444167>
##contig=<ID=chr21,length=46709983>
##contig=<ID=chr22,length=50818468>
##contig=<ID=chrX,length=156040895>
'''
consensus_file_name = prefix + "_consensus.vcf"
consensus_outfile = open(consensus_file_name,'w')
consensus_col = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tLCL5_consensus_call\tLCL6_consensus_call\tLCL7_consensus_call\tLCL8_consensus_call\n'
consensus_outfile.write(vcf_header)
consensus_outfile.write(consensus_col)

# function
def decide_by_rep(vcf_list):
consensus_rep = ''
gt = [x.split(':')[0] for x in vcf_list]
gt_num_dict = Counter(gt)
highest_gt = gt_num_dict.most_common(1)
candidate_gt = highest_gt[0][0]
freq_gt = highest_gt[0][1]
if freq_gt >= 2:
consensus_rep = candidate_gt
else:
consensus_rep = 'inconGT'
return consensus_rep

def consensus_call(vcf_info_list):
consensus_call = '.'
detect_number = '.'
same_diff = '.'
# pcr
SEQ2000 = decide_by_rep(vcf_info_list[0:3])
XTen_ARD = decide_by_rep(vcf_info_list[18:21])
XTen_NVG = decide_by_rep(vcf_info_list[21:24])
XTen_WUX = decide_by_rep(vcf_info_list[24:27])
Nova_WUX = decide_by_rep(vcf_info_list[15:18])
pcr_sequence_site = [SEQ2000,XTen_ARD,XTen_NVG,XTen_WUX,Nova_WUX]
pcr_sequence_dict = Counter(pcr_sequence_site)
pcr_highest_sequence = pcr_sequence_dict.most_common(1)
pcr_candidate_sequence = pcr_highest_sequence[0][0]
pcr_freq_sequence = pcr_highest_sequence[0][1]
if pcr_freq_sequence > 3:
pcr_consensus = pcr_candidate_sequence
else:
pcr_consensus = 'inconSequenceSite'
# pcr-free
T7_WGE = decide_by_rep(vcf_info_list[3:6])
Nova_ARD_1 = decide_by_rep(vcf_info_list[6:9])
Nova_ARD_2 = decide_by_rep(vcf_info_list[9:12])
Nova_BRG = decide_by_rep(vcf_info_list[12:15])
sequence_site = [T7_WGE,Nova_ARD_1,Nova_ARD_2,Nova_BRG]
sequence_dict = Counter(sequence_site)
highest_sequence = sequence_dict.most_common(1)
candidate_sequence = highest_sequence[0][0]
freq_sequence = highest_sequence[0][1]
if freq_sequence > 2:
pcr_free_consensus = candidate_sequence
else:
pcr_free_consensus = 'inconSequenceSite'
gt = [x.split(':')[0] for x in vcf_info_list]
gt = [x.replace('./.','.') for x in gt]
detected_num = 27 - gt.count('.')
gt_remain = [e for e in gt if e not in {'.'}]
gt_set = set(gt_remain)
if len(gt_set) == 1:
same_diff = 'same'
else:
same_diff = 'diff'
tag = ['inconGT','inconSequenceSite']
if (pcr_consensus == pcr_free_consensus) and (pcr_consensus not in tag):
consensus_call = pcr_consensus
elif (pcr_consensus in tag) and (pcr_free_consensus in tag):
consensus_call = 'notAgree'

else:
consensus_call = 'notConsensus'
return consensus_call, detected_num, same_diff

elif (pcr_consensus in tag) and (pcr_free_consensus in tag):
consensus_call = 'filtered'
elif ((pcr_consensus == './.') or (pcr_consensus in tag)) and ((pcr_free_consensus not in tag) and (pcr_free_consensus != './.')):
consensus_call = 'pcr-free-speicifc'
elif ((pcr_consensus != './.') or (pcr_consensus not in tag)) and ((pcr_free_consensus in tag) and (pcr_free_consensus == './.')):
consensus_call = 'pcr-speicifc'
elif (pcr_consensus == '0/0') and (pcr_free_consensus == '0/0'):
consensus_call = '0/0'
else:
consensus_call = 'filtered'


for row in vcf_dat.itertuples():
# length
#alt
if ',' in row.ALT:
alt = row.ALT.split(',')
alt_len = [len(i) for i in alt]
alt_max = max(alt_len)
else:
alt_max = len(row.ALT)
#ref
ref_len = len(row.REF)
if (alt_max > 50) or (ref_len > 50):
pass
else:
# consensus
lcl5_list = [row.Quartet_DNA_BGI_SEQ2000_BGI_LCL5_1_20180518,row.Quartet_DNA_BGI_SEQ2000_BGI_LCL5_2_20180530,row.Quartet_DNA_BGI_SEQ2000_BGI_LCL5_3_20180530, \
row.Quartet_DNA_BGI_T7_WGE_LCL5_1_20191105,row.Quartet_DNA_BGI_T7_WGE_LCL5_2_20191105,row.Quartet_DNA_BGI_T7_WGE_LCL5_3_20191105, \
row.Quartet_DNA_ILM_Nova_ARD_LCL5_1_20181108,row.Quartet_DNA_ILM_Nova_ARD_LCL5_2_20181108,row.Quartet_DNA_ILM_Nova_ARD_LCL5_3_20181108, \
row.Quartet_DNA_ILM_Nova_ARD_LCL5_4_20190111,row.Quartet_DNA_ILM_Nova_ARD_LCL5_5_20190111,row.Quartet_DNA_ILM_Nova_ARD_LCL5_6_20190111, \
row.Quartet_DNA_ILM_Nova_BRG_LCL5_1_20180930,row.Quartet_DNA_ILM_Nova_BRG_LCL5_2_20180930,row.Quartet_DNA_ILM_Nova_BRG_LCL5_3_20180930, \
row.Quartet_DNA_ILM_Nova_WUX_LCL5_1_20190917,row.Quartet_DNA_ILM_Nova_WUX_LCL5_2_20190917,row.Quartet_DNA_ILM_Nova_WUX_LCL5_3_20190917, \
row.Quartet_DNA_ILM_XTen_ARD_LCL5_1_20170403,row.Quartet_DNA_ILM_XTen_ARD_LCL5_2_20170403,row.Quartet_DNA_ILM_XTen_ARD_LCL5_3_20170403, \
row.Quartet_DNA_ILM_XTen_NVG_LCL5_1_20170329,row.Quartet_DNA_ILM_XTen_NVG_LCL5_2_20170329,row.Quartet_DNA_ILM_XTen_NVG_LCL5_3_20170329, \
row.Quartet_DNA_ILM_XTen_WUX_LCL5_1_20170216,row.Quartet_DNA_ILM_XTen_WUX_LCL5_2_20170216,row.Quartet_DNA_ILM_XTen_WUX_LCL5_3_20170216]
lcl6_list = [row.Quartet_DNA_BGI_SEQ2000_BGI_LCL6_1_20180518,row.Quartet_DNA_BGI_SEQ2000_BGI_LCL6_2_20180530,row.Quartet_DNA_BGI_SEQ2000_BGI_LCL6_3_20180530, \
row.Quartet_DNA_BGI_T7_WGE_LCL6_1_20191105,row.Quartet_DNA_BGI_T7_WGE_LCL6_2_20191105,row.Quartet_DNA_BGI_T7_WGE_LCL6_3_20191105, \
row.Quartet_DNA_ILM_Nova_ARD_LCL6_1_20181108,row.Quartet_DNA_ILM_Nova_ARD_LCL6_2_20181108,row.Quartet_DNA_ILM_Nova_ARD_LCL6_3_20181108, \
row.Quartet_DNA_ILM_Nova_ARD_LCL6_4_20190111,row.Quartet_DNA_ILM_Nova_ARD_LCL6_5_20190111,row.Quartet_DNA_ILM_Nova_ARD_LCL6_6_20190111, \
row.Quartet_DNA_ILM_Nova_BRG_LCL6_1_20180930,row.Quartet_DNA_ILM_Nova_BRG_LCL6_2_20180930,row.Quartet_DNA_ILM_Nova_BRG_LCL6_3_20180930, \
row.Quartet_DNA_ILM_Nova_WUX_LCL6_1_20190917,row.Quartet_DNA_ILM_Nova_WUX_LCL6_2_20190917,row.Quartet_DNA_ILM_Nova_WUX_LCL6_3_20190917, \
row.Quartet_DNA_ILM_XTen_ARD_LCL6_1_20170403,row.Quartet_DNA_ILM_XTen_ARD_LCL6_2_20170403,row.Quartet_DNA_ILM_XTen_ARD_LCL6_3_20170403, \
row.Quartet_DNA_ILM_XTen_NVG_LCL6_1_20170329,row.Quartet_DNA_ILM_XTen_NVG_LCL6_2_20170329,row.Quartet_DNA_ILM_XTen_NVG_LCL6_3_20170329, \
row.Quartet_DNA_ILM_XTen_WUX_LCL6_1_20170216,row.Quartet_DNA_ILM_XTen_WUX_LCL6_2_20170216,row.Quartet_DNA_ILM_XTen_WUX_LCL6_3_20170216]
lcl7_list = [row.Quartet_DNA_BGI_SEQ2000_BGI_LCL7_1_20180518,row.Quartet_DNA_BGI_SEQ2000_BGI_LCL7_2_20180530,row.Quartet_DNA_BGI_SEQ2000_BGI_LCL7_3_20180530, \
row.Quartet_DNA_BGI_T7_WGE_LCL7_1_20191105,row.Quartet_DNA_BGI_T7_WGE_LCL7_2_20191105,row.Quartet_DNA_BGI_T7_WGE_LCL7_3_20191105, \
row.Quartet_DNA_ILM_Nova_ARD_LCL7_1_20181108,row.Quartet_DNA_ILM_Nova_ARD_LCL7_2_20181108,row.Quartet_DNA_ILM_Nova_ARD_LCL7_3_20181108, \
row.Quartet_DNA_ILM_Nova_ARD_LCL7_4_20190111,row.Quartet_DNA_ILM_Nova_ARD_LCL7_5_20190111,row.Quartet_DNA_ILM_Nova_ARD_LCL7_6_20190111, \
row.Quartet_DNA_ILM_Nova_BRG_LCL7_1_20180930,row.Quartet_DNA_ILM_Nova_BRG_LCL7_2_20180930,row.Quartet_DNA_ILM_Nova_BRG_LCL7_3_20180930, \
row.Quartet_DNA_ILM_Nova_WUX_LCL7_1_20190917,row.Quartet_DNA_ILM_Nova_WUX_LCL7_2_20190917,row.Quartet_DNA_ILM_Nova_WUX_LCL7_3_20190917, \
row.Quartet_DNA_ILM_XTen_ARD_LCL7_1_20170403,row.Quartet_DNA_ILM_XTen_ARD_LCL7_2_20170403,row.Quartet_DNA_ILM_XTen_ARD_LCL7_3_20170403, \
row.Quartet_DNA_ILM_XTen_NVG_LCL7_1_20170329,row.Quartet_DNA_ILM_XTen_NVG_LCL7_2_20170329,row.Quartet_DNA_ILM_XTen_NVG_LCL7_3_20170329, \
row.Quartet_DNA_ILM_XTen_WUX_LCL7_1_20170216,row.Quartet_DNA_ILM_XTen_WUX_LCL7_2_20170216,row.Quartet_DNA_ILM_XTen_WUX_LCL7_3_20170216]
lcl8_list = [row.Quartet_DNA_BGI_SEQ2000_BGI_LCL8_1_20180518,row.Quartet_DNA_BGI_SEQ2000_BGI_LCL8_2_20180530,row.Quartet_DNA_BGI_SEQ2000_BGI_LCL8_3_20180530, \
row.Quartet_DNA_BGI_T7_WGE_LCL8_1_20191105,row.Quartet_DNA_BGI_T7_WGE_LCL8_2_20191105,row.Quartet_DNA_BGI_T7_WGE_LCL8_3_20191105, \
row.Quartet_DNA_ILM_Nova_ARD_LCL8_1_20181108,row.Quartet_DNA_ILM_Nova_ARD_LCL8_2_20181108,row.Quartet_DNA_ILM_Nova_ARD_LCL8_3_20181108, \
row.Quartet_DNA_ILM_Nova_ARD_LCL8_4_20190111,row.Quartet_DNA_ILM_Nova_ARD_LCL8_5_20190111,row.Quartet_DNA_ILM_Nova_ARD_LCL8_6_20190111, \
row.Quartet_DNA_ILM_Nova_BRG_LCL8_1_20180930,row.Quartet_DNA_ILM_Nova_BRG_LCL8_2_20180930,row.Quartet_DNA_ILM_Nova_BRG_LCL8_3_20180930, \
row.Quartet_DNA_ILM_Nova_WUX_LCL8_1_20190917,row.Quartet_DNA_ILM_Nova_WUX_LCL8_2_20190917,row.Quartet_DNA_ILM_Nova_WUX_LCL8_3_20190917, \
row.Quartet_DNA_ILM_XTen_ARD_LCL8_1_20170403,row.Quartet_DNA_ILM_XTen_ARD_LCL8_2_20170403,row.Quartet_DNA_ILM_XTen_ARD_LCL8_3_20170403, \
row.Quartet_DNA_ILM_XTen_NVG_LCL8_1_20170329,row.Quartet_DNA_ILM_XTen_NVG_LCL8_2_20170329,row.Quartet_DNA_ILM_XTen_NVG_LCL8_3_20170329, \
row.Quartet_DNA_ILM_XTen_WUX_LCL8_1_20170216,row.Quartet_DNA_ILM_XTen_WUX_LCL8_2_20170216,row.Quartet_DNA_ILM_XTen_WUX_LCL8_3_20170216]
# LCL5
LCL5_consensus_call, LCL5_detected_num, LCL5_same_diff = consensus_call(lcl5_list)
# LCL6
LCL6_consensus_call, LCL6_detected_num, LCL6_same_diff = consensus_call(lcl6_list)
# LCL7
LCL7_consensus_call, LCL7_detected_num, LCL7_same_diff = consensus_call(lcl7_list)
# LCL8
LCL8_consensus_call, LCL8_detected_num, LCL8_same_diff = consensus_call(lcl8_list)
# all data
all_output = row._1 + '\t' + str(row.POS) + '\t' + row.REF + '\t' + row.ALT + '\t' + LCL5_consensus_call + '\t' + str(LCL5_detected_num) + '\t' + LCL5_same_diff + '\t' +\
LCL6_consensus_call + '\t' + str(LCL6_detected_num) + '\t' + LCL6_same_diff + '\t' +\
LCL7_consensus_call + '\t' + str(LCL7_detected_num) + '\t' + LCL7_same_diff + '\t' +\
LCL8_consensus_call + '\t' + str(LCL8_detected_num) + '\t' + LCL8_same_diff + '\n'
all_sample_outfile.write(all_output)
#consensus vcf
one_position = [LCL5_consensus_call,LCL6_consensus_call,LCL7_consensus_call,LCL8_consensus_call]
if ('notConsensus' in one_position) or (((len(set(one_position)) == 1) and ('./.' in set(one_position))) or ((len(set(one_position)) == 1) and ('0/0' in set(one_position))) or ((len(set(one_position)) == 2) and ('0/0' in set(one_position) and ('./.' in set(one_position))))):
pass
else:
consensus_output = row._1 + '\t' + str(row.POS) + '\t' + '.' + '\t' + row.REF + '\t' + row.ALT + '\t' + '.' + '\t' + '.' + '\t' +'.' + '\t' + 'GT' + '\t' + LCL5_consensus_call + '\t' + LCL6_consensus_call + '\t' + LCL7_consensus_call + '\t' + LCL8_consensus_call +'\n'
consensus_outfile.write(consensus_output)









+ 36
- 0
codescripts/vcf2bed.py Vedi File

@@ -0,0 +1,36 @@
import re

def position_to_bed(chromo,pos,ref,alt):
# snv
# Start cooridinate BED = start coordinate VCF - 1
# End cooridinate BED = start coordinate VCF

if len(ref) == 1 and len(alt) == 1:
StartPos = int(pos) -1
EndPos = int(pos)
# deletions
# Start cooridinate BED = start coordinate VCF - 1
# End cooridinate BED = start coordinate VCF + (reference length - alternate length)

elif len(ref) > 1 and len(alt) == 1:
StartPos = int(pos) - 1
EndPos = int(pos) + (len(ref) - 1)
#insertions
# For insertions:
# Start cooridinate BED = start coordinate VCF - 1
# End cooridinate BED = start coordinate VCF + (alternate length - reference length)

else:
StartPos = int(pos) - 1
EndPos = int(pos) + (len(alt) - 1)

return chromo,StartPos,EndPos

def padding_region(chromo,pos1,pos2,padding):
StartPos1 = pos1 - padding
EndPos1 = pos1
StartPos2 = pos2
EndPos2 = pos2 + padding
return chromo,StartPos1,EndPos1,StartPos2,EndPos2

+ 295
- 0
codescripts/voted_by_vcfinfo_mendelianinfo.py Vedi File

@@ -0,0 +1,295 @@
from __future__ import division
from glob import glob
import sys, argparse, os
import fileinput
import re
import pandas as pd
from operator import itemgetter
from collections import Counter
from itertools import islice
from numpy import *
import statistics

# input arguments
parser = argparse.ArgumentParser(description="this script is to merge mendelian and vcfinfo, and extract high_confidence_calls")

parser.add_argument('-folder', '--folder', type=str, help='directory that holds all the mendelian info', required=True)
parser.add_argument('-vcf', '--vcf', type=str, help='merged multiple sample vcf', required=True)


args = parser.parse_args()
folder = args.folder
vcf = args.vcf

# input files
folder = folder + '/*.txt'
filenames = glob(folder)
dataframes = []
for filename in filenames:
dataframes.append(pd.read_table(filename,header=None))

dfs = [df.set_index([0, 1]) for df in dataframes]
merged_mendelian = pd.concat(dfs, axis=1).reset_index()
family_name = [i.split('/')[-1].replace('.txt','') for i in filenames]
columns = ['CHROM','POS'] + family_name
merged_mendelian.columns = columns

vcf_dat = pd.read_table(vcf)

merged_df = pd.merge(merged_mendelian, vcf_dat, how='outer', left_on=['CHROM','POS'], right_on = ['#CHROM','POS'])
merged_df = merged_df.fillna('nan')

vcf_header = '''##fileformat=VCFv4.2
##fileDate=20200501
##source=high_confidence_calls_intergration(choppy app)
##reference=GRCh38.d1.vd1
##INFO=<ID=VOTED,Number=1,Type=Integer,Description="Number mendelian consisitent votes">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Sum depth of all samples">
##FORMAT=<ID=ALT,Number=1,Type=Integer,Description="Sum alternative depth of all samples">
##contig=<ID=chr1,length=248956422>
##contig=<ID=chr2,length=242193529>
##contig=<ID=chr3,length=198295559>
##contig=<ID=chr4,length=190214555>
##contig=<ID=chr5,length=181538259>
##contig=<ID=chr6,length=170805979>
##contig=<ID=chr7,length=159345973>
##contig=<ID=chr8,length=145138636>
##contig=<ID=chr9,length=138394717>
##contig=<ID=chr10,length=133797422>
##contig=<ID=chr11,length=135086622>
##contig=<ID=chr12,length=133275309>
##contig=<ID=chr13,length=114364328>
##contig=<ID=chr14,length=107043718>
##contig=<ID=chr15,length=101991189>
##contig=<ID=chr16,length=90338345>
##contig=<ID=chr17,length=83257441>
##contig=<ID=chr18,length=80373285>
##contig=<ID=chr19,length=58617616>
##contig=<ID=chr20,length=64444167>
##contig=<ID=chr21,length=46709983>
##contig=<ID=chr22,length=50818468>
##contig=<ID=chrX,length=156040895>
'''
# output files
benchmark_LCL5 = open('LCL5_voted.vcf','w')
benchmark_LCL6 = open('LCL6_voted.vcf','w')
benchmark_LCL7 = open('LCL7_voted.vcf','w')
benchmark_LCL8 = open('LCL8_voted.vcf','w')

all_sample_outfile = open('all_sample_information.txt','w')

# write VCF
LCL5_col = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tLCL5_benchmark_calls\n'
LCL6_col = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tLCL6_benchmark_calls\n'
LCL7_col = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tLCL7_benchmark_calls\n'
LCL8_col = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tLCL8_benchmark_calls\n'

benchmark_LCL5.write(vcf_header)
benchmark_LCL5.write(LCL5_col)
benchmark_LCL6.write(vcf_header)
benchmark_LCL6.write(LCL6_col)
benchmark_LCL7.write(vcf_header)
benchmark_LCL7.write(LCL7_col)
benchmark_LCL8.write(vcf_header)
benchmark_LCL8.write(LCL8_col)

# all info
all_info_col = 'CHROM\tPOS\tLCL5_pcr_consensus\tLCL5_pcr_free_consensus\tLCL5_mendelian_num\tLCL5_consensus_call\tLCL5_consensus_alt_seq\tLCL5_alt\tLCL5_dp\tLCL5_detected_num\tLCL6_pcr_consensus\tLCL6_pcr_free_consensus\tLCL6_mendelian_num\tLCL6_consensus_call\tLCL6_consensus_alt_seq\tLCL6_alt\tLCL6_dp\tLCL6_detected_num\tLCL7_pcr_consensus\tLCL7_pcr_free_consensus\tLCL7_mendelian_num\t LCL7_consensus_call\tLCL7_consensus_alt_seq\tLCL7_alt\tLCL7_dp\tLCL7_detected_num\tLCL8_pcr_consensus\tLCL8_pcr_free_consensus\tLCL8_mendelian_num\tLCL8_consensus_call\tLCL8_consensus_alt_seq\tLCL8_alt\tLCL8_dp\tLCL8_detected_num\n'
all_sample_outfile.write(all_info_col)

# function
def decide_by_rep(vcf_list,mendelian_list):
consensus_rep = ''
gt = [x.split(':')[0] for x in vcf_list]
# mendelian consistent?
mendelian_dict = Counter(mendelian_list)
highest_mendelian = mendelian_dict.most_common(1)
candidate_mendelian = highest_mendelian[0][0]
freq_mendelian = highest_mendelian[0][1]
if (candidate_mendelian == '1:1:1') and (freq_mendelian >= 2):
con_loc = [i for i in range(len(mendelian_list)) if mendelian_list[i] == '1:1:1']
gt_con = itemgetter(*con_loc)(gt)
gt_num_dict = Counter(gt_con)
highest_gt = gt_num_dict.most_common(1)
candidate_gt = highest_gt[0][0]
freq_gt = highest_gt[0][1]
if (candidate_gt != './.') and (freq_gt >= 2):
consensus_rep = candidate_gt
elif (candidate_gt == './.') and (freq_gt >= 2):
consensus_rep = 'noGTInfo'
else:
consensus_rep = 'inconGT'
elif (candidate_mendelian == 'nan') and (freq_mendelian >= 2):
consensus_rep = 'noMenInfo'
else:
consensus_rep = 'inconMen'
return consensus_rep

def consensus_call(vcf_info_list,mendelian_list,alt_seq):
pcr_consensus = '.'
pcr_free_consensus = '.'
mendelian_num = '.'
consensus_call = '.'
consensus_alt_seq = '.'
# pcr
SEQ2000 = decide_by_rep(vcf_info_list[0:3],mendelian_list[0:3])
XTen_ARD = decide_by_rep(vcf_info_list[18:21],mendelian_list[18:21])
XTen_NVG = decide_by_rep(vcf_info_list[21:24],mendelian_list[21:24])
XTen_WUX = decide_by_rep(vcf_info_list[24:27],mendelian_list[24:27])
pcr_sequence_site = [SEQ2000,XTen_ARD,XTen_NVG,XTen_WUX]
pcr_sequence_dict = Counter(pcr_sequence_site)
pcr_highest_sequence = pcr_sequence_dict.most_common(1)
pcr_candidate_sequence = pcr_highest_sequence[0][0]
pcr_freq_sequence = pcr_highest_sequence[0][1]
if pcr_freq_sequence > 2:
pcr_consensus = pcr_candidate_sequence
else:
pcr_consensus = 'inconSequenceSite'
# pcr-free
T7_WGE = decide_by_rep(vcf_info_list[3:6],mendelian_list[3:6])
Nova_ARD_1 = decide_by_rep(vcf_info_list[6:9],mendelian_list[6:9])
Nova_ARD_2 = decide_by_rep(vcf_info_list[9:12],mendelian_list[9:12])
Nova_BRG = decide_by_rep(vcf_info_list[12:15],mendelian_list[12:15])
Nova_WUX = decide_by_rep(vcf_info_list[15:18],mendelian_list[15:18])
sequence_site = [T7_WGE,Nova_ARD_1,Nova_ARD_2,Nova_BRG,Nova_WUX]
sequence_dict = Counter(sequence_site)
highest_sequence = sequence_dict.most_common(1)
candidate_sequence = highest_sequence[0][0]
freq_sequence = highest_sequence[0][1]
if freq_sequence > 3:
pcr_free_consensus = candidate_sequence
else:
pcr_free_consensus = 'inconSequenceSite'
# net alt, dp
# alt
AD = [x.split(':')[1] for x in vcf_info_list]
ALT = [x.split(',')[1] for x in AD]
ALT = [int(x) for x in ALT]
ALL_ALT = sum(ALT)
# dp
DP = [x.split(':')[2] for x in vcf_info_list]
DP = [int(x) for x in DP]
ALL_DP = sum(DP)
# detected number
gt = [x.split(':')[0] for x in vcf_info_list]
gt = [x.replace('0/0','.') for x in gt]
gt = [x.replace('./.','.') for x in gt]
detected_num = 27 - gt.count('.')
# decide consensus calls
tag = ['inconGT','noMenInfo','inconMen','inconSequenceSite','noGTInfo']
if (pcr_consensus != '0/0') and (pcr_consensus == pcr_free_consensus) and (pcr_consensus not in tag):
consensus_call = pcr_consensus
gt = [x.split(':')[0] for x in vcf_info_list]
indices = [i for i, x in enumerate(gt) if x == consensus_call]
matched_mendelian = itemgetter(*indices)(mendelian_list)
mendelian_num = matched_mendelian.count('1:1:1')
# Delete multiple alternative genotype to necessary expression
alt_gt = alt_seq.split(',')
if len(alt_gt) > 1:
allele1 = consensus_call.split('/')[0]
allele2 = consensus_call.split('/')[1]
if allele1 == '0':
allele2_seq = alt_gt[int(allele2) - 1]
consensus_alt_seq = allele2_seq
consensus_call = '0/1'
else:
allele1_seq = alt_gt[int(allele1) - 1]
allele2_seq = alt_gt[int(allele2) - 1]
if int(allele1) > int(allele2):
consensus_alt_seq = allele2_seq + ',' + allele1_seq
consensus_call = '1/2'
elif int(allele1) < int(allele2):
consensus_alt_seq = allele1_seq + ',' + allele2_seq
consensus_call = '1/2'
else:
consensus_alt_seq = allele1_seq
consensus_call = '1/1'
else:
consensus_alt_seq = alt_seq
elif (pcr_consensus in tag) and (pcr_free_consensus in tag):
consensus_call = 'filtered'
elif ((pcr_consensus == './.') or (pcr_consensus in tag)) and ((pcr_free_consensus not in tag) and (pcr_free_consensus != './.')):
consensus_call = 'pcr-free-speicifc'
elif ((pcr_consensus != './.') or (pcr_consensus not in tag)) and ((pcr_free_consensus in tag) and (pcr_free_consensus == './.')):
consensus_call = 'pcr-speicifc'
elif (pcr_consensus == '0/0') and (pcr_free_consensus == '0/0'):
consensus_call = '0/0'
else:
consensus_call = 'filtered'
return pcr_consensus, pcr_free_consensus, mendelian_num, consensus_call, consensus_alt_seq, ALL_ALT, ALL_DP, detected_num


for row in merged_df.itertuples():
mendelian_list = [row.Quartet_DNA_BGI_SEQ2000_BGI_1_20180518,row.Quartet_DNA_BGI_SEQ2000_BGI_2_20180530,row.Quartet_DNA_BGI_SEQ2000_BGI_3_20180530, \
row.Quartet_DNA_BGI_T7_WGE_1_20191105,row.Quartet_DNA_BGI_T7_WGE_2_20191105,row.Quartet_DNA_BGI_T7_WGE_3_20191105, \
row.Quartet_DNA_ILM_Nova_ARD_1_20181108,row.Quartet_DNA_ILM_Nova_ARD_2_20181108,row.Quartet_DNA_ILM_Nova_ARD_3_20181108, \
row.Quartet_DNA_ILM_Nova_ARD_4_20190111,row.Quartet_DNA_ILM_Nova_ARD_5_20190111,row.Quartet_DNA_ILM_Nova_ARD_6_20190111, \
row.Quartet_DNA_ILM_Nova_BRG_1_20180930,row.Quartet_DNA_ILM_Nova_BRG_2_20180930,row.Quartet_DNA_ILM_Nova_BRG_3_20180930, \
row.Quartet_DNA_ILM_Nova_WUX_1_20190917,row.Quartet_DNA_ILM_Nova_WUX_2_20190917,row.Quartet_DNA_ILM_Nova_WUX_3_20190917, \
row.Quartet_DNA_ILM_XTen_ARD_1_20170403,row.Quartet_DNA_ILM_XTen_ARD_2_20170403,row.Quartet_DNA_ILM_XTen_ARD_3_20170403, \
row.Quartet_DNA_ILM_XTen_NVG_1_20170329,row.Quartet_DNA_ILM_XTen_NVG_2_20170329,row.Quartet_DNA_ILM_XTen_NVG_3_20170329, \
row.Quartet_DNA_ILM_XTen_WUX_1_20170216,row.Quartet_DNA_ILM_XTen_WUX_2_20170216,row.Quartet_DNA_ILM_XTen_WUX_3_20170216]
lcl5_list = [row.Quartet_DNA_BGI_SEQ2000_BGI_LCL5_1_20180518,row.Quartet_DNA_BGI_SEQ2000_BGI_LCL5_2_20180530,row.Quartet_DNA_BGI_SEQ2000_BGI_LCL5_3_20180530, \
row.Quartet_DNA_BGI_T7_WGE_LCL5_1_20191105,row.Quartet_DNA_BGI_T7_WGE_LCL5_2_20191105,row.Quartet_DNA_BGI_T7_WGE_LCL5_3_20191105, \
row.Quartet_DNA_ILM_Nova_ARD_LCL5_1_20181108,row.Quartet_DNA_ILM_Nova_ARD_LCL5_2_20181108,row.Quartet_DNA_ILM_Nova_ARD_LCL5_3_20181108, \
row.Quartet_DNA_ILM_Nova_ARD_LCL5_4_20190111,row.Quartet_DNA_ILM_Nova_ARD_LCL5_5_20190111,row.Quartet_DNA_ILM_Nova_ARD_LCL5_6_20190111, \
row.Quartet_DNA_ILM_Nova_BRG_LCL5_1_20180930,row.Quartet_DNA_ILM_Nova_BRG_LCL5_2_20180930,row.Quartet_DNA_ILM_Nova_BRG_LCL5_3_20180930, \
row.Quartet_DNA_ILM_Nova_WUX_LCL5_1_20190917,row.Quartet_DNA_ILM_Nova_WUX_LCL5_2_20190917,row.Quartet_DNA_ILM_Nova_WUX_LCL5_3_20190917, \
row.Quartet_DNA_ILM_XTen_ARD_LCL5_1_20170403,row.Quartet_DNA_ILM_XTen_ARD_LCL5_2_20170403,row.Quartet_DNA_ILM_XTen_ARD_LCL5_3_20170403, \
row.Quartet_DNA_ILM_XTen_NVG_LCL5_1_20170329,row.Quartet_DNA_ILM_XTen_NVG_LCL5_2_20170329,row.Quartet_DNA_ILM_XTen_NVG_LCL5_3_20170329, \
row.Quartet_DNA_ILM_XTen_WUX_LCL5_1_20170216,row.Quartet_DNA_ILM_XTen_WUX_LCL5_2_20170216,row.Quartet_DNA_ILM_XTen_WUX_LCL5_3_20170216]
lcl6_list = [row.Quartet_DNA_BGI_SEQ2000_BGI_LCL6_1_20180518,row.Quartet_DNA_BGI_SEQ2000_BGI_LCL6_2_20180530,row.Quartet_DNA_BGI_SEQ2000_BGI_LCL6_3_20180530, \
row.Quartet_DNA_BGI_T7_WGE_LCL6_1_20191105,row.Quartet_DNA_BGI_T7_WGE_LCL6_2_20191105,row.Quartet_DNA_BGI_T7_WGE_LCL6_3_20191105, \
row.Quartet_DNA_ILM_Nova_ARD_LCL6_1_20181108,row.Quartet_DNA_ILM_Nova_ARD_LCL6_2_20181108,row.Quartet_DNA_ILM_Nova_ARD_LCL6_3_20181108, \
row.Quartet_DNA_ILM_Nova_ARD_LCL6_4_20190111,row.Quartet_DNA_ILM_Nova_ARD_LCL6_5_20190111,row.Quartet_DNA_ILM_Nova_ARD_LCL6_6_20190111, \
row.Quartet_DNA_ILM_Nova_BRG_LCL6_1_20180930,row.Quartet_DNA_ILM_Nova_BRG_LCL6_2_20180930,row.Quartet_DNA_ILM_Nova_BRG_LCL6_3_20180930, \
row.Quartet_DNA_ILM_Nova_WUX_LCL6_1_20190917,row.Quartet_DNA_ILM_Nova_WUX_LCL6_2_20190917,row.Quartet_DNA_ILM_Nova_WUX_LCL6_3_20190917, \
row.Quartet_DNA_ILM_XTen_ARD_LCL6_1_20170403,row.Quartet_DNA_ILM_XTen_ARD_LCL6_2_20170403,row.Quartet_DNA_ILM_XTen_ARD_LCL6_3_20170403, \
row.Quartet_DNA_ILM_XTen_NVG_LCL6_1_20170329,row.Quartet_DNA_ILM_XTen_NVG_LCL6_2_20170329,row.Quartet_DNA_ILM_XTen_NVG_LCL6_3_20170329, \
row.Quartet_DNA_ILM_XTen_WUX_LCL6_1_20170216,row.Quartet_DNA_ILM_XTen_WUX_LCL6_2_20170216,row.Quartet_DNA_ILM_XTen_WUX_LCL6_3_20170216]
lcl7_list = [row.Quartet_DNA_BGI_SEQ2000_BGI_LCL7_1_20180518,row.Quartet_DNA_BGI_SEQ2000_BGI_LCL7_2_20180530,row.Quartet_DNA_BGI_SEQ2000_BGI_LCL7_3_20180530, \
row.Quartet_DNA_BGI_T7_WGE_LCL7_1_20191105,row.Quartet_DNA_BGI_T7_WGE_LCL7_2_20191105,row.Quartet_DNA_BGI_T7_WGE_LCL7_3_20191105, \
row.Quartet_DNA_ILM_Nova_ARD_LCL7_1_20181108,row.Quartet_DNA_ILM_Nova_ARD_LCL7_2_20181108,row.Quartet_DNA_ILM_Nova_ARD_LCL7_3_20181108, \
row.Quartet_DNA_ILM_Nova_ARD_LCL7_4_20190111,row.Quartet_DNA_ILM_Nova_ARD_LCL7_5_20190111,row.Quartet_DNA_ILM_Nova_ARD_LCL7_6_20190111, \
row.Quartet_DNA_ILM_Nova_BRG_LCL7_1_20180930,row.Quartet_DNA_ILM_Nova_BRG_LCL7_2_20180930,row.Quartet_DNA_ILM_Nova_BRG_LCL7_3_20180930, \
row.Quartet_DNA_ILM_Nova_WUX_LCL7_1_20190917,row.Quartet_DNA_ILM_Nova_WUX_LCL7_2_20190917,row.Quartet_DNA_ILM_Nova_WUX_LCL7_3_20190917, \
row.Quartet_DNA_ILM_XTen_ARD_LCL7_1_20170403,row.Quartet_DNA_ILM_XTen_ARD_LCL7_2_20170403,row.Quartet_DNA_ILM_XTen_ARD_LCL7_3_20170403, \
row.Quartet_DNA_ILM_XTen_NVG_LCL7_1_20170329,row.Quartet_DNA_ILM_XTen_NVG_LCL7_2_20170329,row.Quartet_DNA_ILM_XTen_NVG_LCL7_3_20170329, \
row.Quartet_DNA_ILM_XTen_WUX_LCL7_1_20170216,row.Quartet_DNA_ILM_XTen_WUX_LCL7_2_20170216,row.Quartet_DNA_ILM_XTen_WUX_LCL7_3_20170216]
lcl8_list = [row.Quartet_DNA_BGI_SEQ2000_BGI_LCL8_1_20180518,row.Quartet_DNA_BGI_SEQ2000_BGI_LCL8_2_20180530,row.Quartet_DNA_BGI_SEQ2000_BGI_LCL8_3_20180530, \
row.Quartet_DNA_BGI_T7_WGE_LCL8_1_20191105,row.Quartet_DNA_BGI_T7_WGE_LCL8_2_20191105,row.Quartet_DNA_BGI_T7_WGE_LCL8_3_20191105, \
row.Quartet_DNA_ILM_Nova_ARD_LCL8_1_20181108,row.Quartet_DNA_ILM_Nova_ARD_LCL8_2_20181108,row.Quartet_DNA_ILM_Nova_ARD_LCL8_3_20181108, \
row.Quartet_DNA_ILM_Nova_ARD_LCL8_4_20190111,row.Quartet_DNA_ILM_Nova_ARD_LCL8_5_20190111,row.Quartet_DNA_ILM_Nova_ARD_LCL8_6_20190111, \
row.Quartet_DNA_ILM_Nova_BRG_LCL8_1_20180930,row.Quartet_DNA_ILM_Nova_BRG_LCL8_2_20180930,row.Quartet_DNA_ILM_Nova_BRG_LCL8_3_20180930, \
row.Quartet_DNA_ILM_Nova_WUX_LCL8_1_20190917,row.Quartet_DNA_ILM_Nova_WUX_LCL8_2_20190917,row.Quartet_DNA_ILM_Nova_WUX_LCL8_3_20190917, \
row.Quartet_DNA_ILM_XTen_ARD_LCL8_1_20170403,row.Quartet_DNA_ILM_XTen_ARD_LCL8_2_20170403,row.Quartet_DNA_ILM_XTen_ARD_LCL8_3_20170403, \
row.Quartet_DNA_ILM_XTen_NVG_LCL8_1_20170329,row.Quartet_DNA_ILM_XTen_NVG_LCL8_2_20170329,row.Quartet_DNA_ILM_XTen_NVG_LCL8_3_20170329, \
row.Quartet_DNA_ILM_XTen_WUX_LCL8_1_20170216,row.Quartet_DNA_ILM_XTen_WUX_LCL8_2_20170216,row.Quartet_DNA_ILM_XTen_WUX_LCL8_3_20170216]
# LCL5
LCL5_pcr_consensus, LCL5_pcr_free_consensus, LCL5_mendelian_num, LCL5_consensus_call, LCL5_consensus_alt_seq, LCL5_alt, LCL5_dp, LCL5_detected_num = consensus_call(lcl5_list,mendelian_list,row.ALT)
if LCL5_mendelian_num != '.':
LCL5_output = row.CHROM + '\t' + str(row.POS) + '\t' + '.' + '\t' + row.REF + '\t' + LCL5_consensus_alt_seq + '\t' + '.' + '\t' + '.' + '\t' +'VOTED=' + str(LCL5_mendelian_num) + '\t' + 'GT:ALT:DP' + '\t' + LCL5_consensus_call + ':' + str(LCL5_alt) + ':' + str(LCL5_dp) + '\n'
benchmark_LCL5.write(LCL5_output)
# LCL6
LCL6_pcr_consensus, LCL6_pcr_free_consensus, LCL6_mendelian_num, LCL6_consensus_call, LCL6_consensus_alt_seq, LCL6_alt, LCL6_dp, LCL6_detected_num = consensus_call(lcl6_list,mendelian_list,row.ALT)
if LCL6_mendelian_num != '.':
LCL6_output = row.CHROM + '\t' + str(row.POS) + '\t' + '.' + '\t' + row.REF + '\t' + LCL6_consensus_alt_seq + '\t' + '.' + '\t' + '.' + '\t' +'VOTED=' + str(LCL6_mendelian_num) + '\t' + 'GT:ALT:DP' + '\t' + LCL6_consensus_call + ':' + str(LCL6_alt) + ':' + str(LCL6_dp) + '\n'
benchmark_LCL6.write(LCL6_output)
# LCL7
LCL7_pcr_consensus, LCL7_pcr_free_consensus, LCL7_mendelian_num, LCL7_consensus_call, LCL7_consensus_alt_seq, LCL7_alt, LCL7_dp, LCL7_detected_num = consensus_call(lcl7_list,mendelian_list,row.ALT)
if LCL7_mendelian_num != '.':
LCL7_output = row.CHROM + '\t' + str(row.POS) + '\t' + '.' + '\t' + row.REF + '\t' + LCL7_consensus_alt_seq + '\t' + '.' + '\t' + '.' + '\t' +'VOTED=' + str(LCL7_mendelian_num) + '\t' + 'GT:ALT:DP' + '\t' + LCL7_consensus_call + ':' + str(LCL7_alt) + ':' + str(LCL7_dp) + '\n'
benchmark_LCL7.write(LCL7_output)
# LCL8
LCL8_pcr_consensus, LCL8_pcr_free_consensus, LCL8_mendelian_num, LCL8_consensus_call, LCL8_consensus_alt_seq, LCL8_alt, LCL8_dp, LCL8_detected_num = consensus_call(lcl8_list,mendelian_list,row.ALT)
if LCL8_mendelian_num != '.':
LCL8_output = row.CHROM + '\t' + str(row.POS) + '\t' + '.' + '\t' + row.REF + '\t' + LCL8_consensus_alt_seq + '\t' + '.' + '\t' + '.' + '\t' +'VOTED=' + str(LCL8_mendelian_num) + '\t' + 'GT:ALT:DP' + '\t' + LCL8_consensus_call + ':' + str(LCL8_alt) + ':' + str(LCL8_dp) + '\n'
benchmark_LCL8.write(LCL8_output)
# all data
all_output = row.CHROM + '\t' + str(row.POS) + '\t' + LCL5_pcr_consensus + '\t' + LCL5_pcr_free_consensus + '\t' + str(LCL5_mendelian_num) + '\t' + LCL5_consensus_call + '\t' + LCL5_consensus_alt_seq + '\t' + str(LCL5_alt) + '\t' + str(LCL5_dp) + '\t' + str(LCL5_detected_num) + '\t' +\
LCL6_pcr_consensus + '\t' + LCL6_pcr_free_consensus + '\t' + str(LCL6_mendelian_num) + '\t' + LCL6_consensus_call + '\t' + LCL6_consensus_alt_seq + '\t' + str(LCL6_alt) + '\t' + str(LCL6_dp) + '\t' + str(LCL6_detected_num) + '\t' +\
LCL7_pcr_consensus + '\t' + LCL7_pcr_free_consensus + '\t' + str(LCL7_mendelian_num) + '\t' + LCL7_consensus_call + '\t' + LCL7_consensus_alt_seq + '\t' + str(LCL7_alt) + '\t' + str(LCL7_dp) + '\t' + str(LCL7_detected_num) + '\t' +\
LCL8_pcr_consensus + '\t' + LCL8_pcr_free_consensus + '\t' + str(LCL8_mendelian_num) + '\t' + LCL8_consensus_call + '\t' + LCL8_consensus_alt_seq + '\t' + str(LCL8_alt) + '\t' + str(LCL8_dp) + '\t' + str(LCL8_detected_num) + '\n'
all_sample_outfile.write(all_output)


+ 19
- 0
inputs Vedi File

@@ -0,0 +1,19 @@
{
"{{ project_name }}.benchmarking_dir": "oss://pgx-result/renluyao/manuscript_v3.0/reference_datasets_v202103/",
"{{ project_name }}.fasta": "GRCh38.d1.vd1.fa",
"{{ project_name }}.BENCHMARKdocker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-hap:latest",
"{{ project_name }}.LCL8": "{{ LCL8 }}",
"{{ project_name }}.disk_size": "500",
"{{ project_name }}.LCL5": "{{ LCL5 }}",
"{{ project_name }}.project": "{{ project }}",
"{{ project_name }}.SMALLcluster_config": "OnDemand bcs.ps.g.xlarge img-ubuntu-vpc",
"{{ project_name }}.bed": "oss://pgx-result/renluyao/manuscript_v3.0/reference_datasets_v202105/benchmark_regions.filtered.bed",
"{{ project_name }}.BIGcluster_config": "OnDemand bcs.ps.g.2xlarge img-ubuntu-vpc",
"{{ project_name }}.LCL6": "{{ LCL6 }}",
"{{ project_name }}.sdf": "oss://pgx-reference-data/GRCh38.d1.vd1/GRCh38.d1.vd1.sdf/",
"{{ project_name }}.contig": "oss://pgx-result/renluyao/NCTR/contig.txt",
"{{ project_name }}.MENDELIANdocker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/vbt:v1.1",
"{{ project_name }}.LCL7": "{{ LCL7 }}",
"{{ project_name }}.DIYdocker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/high_confidence_call_manuscript:v1.4",
"{{ project_name }}.ref_dir": "oss://pgx-reference-data/GRCh38.d1.vd1/"
}

+ 4
- 0
rename.txt Vedi File

@@ -0,0 +1,4 @@
LCL5_10X_1 LCL5
LCL6_10X LCL6
LCL7_10X LCL7
LCL8_10X LCL8

+ 15
- 0
samples Vedi File

@@ -0,0 +1,15 @@
gvcf_list,project,sample_id,gvcf_idx_list
oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-0/Quartet_DNA_BGI_SEQ2000_BGI_LCL5_1_20180518_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-1/Quartet_DNA_BGI_SEQ2000_BGI_LCL5_2_20180530_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-10/Quartet_DNA_BGI_SEQ2000_BGI_LCL8_2_20180530_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-11/Quartet_DNA_BGI_SEQ2000_BGI_LCL8_3_20180530_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-2/Quartet_DNA_BGI_SEQ2000_BGI_LCL5_3_20180530_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-3/Quartet_DNA_BGI_SEQ2000_BGI_LCL6_1_20180518_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-4/Quartet_DNA_BGI_SEQ2000_BGI_LCL6_2_20180530_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-5/Quartet_DNA_BGI_SEQ2000_BGI_LCL6_3_20180530_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-6/Quartet_DNA_BGI_SEQ2000_BGI_LCL7_1_20180518_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-7/Quartet_DNA_BGI_SEQ2000_BGI_LCL7_2_20180530_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-8/Quartet_DNA_BGI_SEQ2000_BGI_LCL7_3_20180530_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-9/Quartet_DNA_BGI_SEQ2000_BGI_LCL8_1_20180518_hc.g.vcf,Quartet_DNA_BGI_SEQ2000_BGI_20180530,1,oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-0/Quartet_DNA_BGI_SEQ2000_BGI_LCL5_1_20180518_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-1/Quartet_DNA_BGI_SEQ2000_BGI_LCL5_2_20180530_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-10/Quartet_DNA_BGI_SEQ2000_BGI_LCL8_2_20180530_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-11/Quartet_DNA_BGI_SEQ2000_BGI_LCL8_3_20180530_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-2/Quartet_DNA_BGI_SEQ2000_BGI_LCL5_3_20180530_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-3/Quartet_DNA_BGI_SEQ2000_BGI_LCL6_1_20180518_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-4/Quartet_DNA_BGI_SEQ2000_BGI_LCL6_2_20180530_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-5/Quartet_DNA_BGI_SEQ2000_BGI_LCL6_3_20180530_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-6/Quartet_DNA_BGI_SEQ2000_BGI_LCL7_1_20180518_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-7/Quartet_DNA_BGI_SEQ2000_BGI_LCL7_2_20180530_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-8/Quartet_DNA_BGI_SEQ2000_BGI_LCL7_3_20180530_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-9/Quartet_DNA_BGI_SEQ2000_BGI_LCL8_1_20180518_hc.g.vcf.idx
oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-72/Quartet_DNA_BGI_T7_WGE_LCL5_1_20191105_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-73/Quartet_DNA_BGI_T7_WGE_LCL5_2_20191105_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-74/Quartet_DNA_BGI_T7_WGE_LCL5_3_20191105_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-75/Quartet_DNA_BGI_T7_WGE_LCL6_1_20191105_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-76/Quartet_DNA_BGI_T7_WGE_LCL6_2_20191105_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-77/Quartet_DNA_BGI_T7_WGE_LCL6_3_20191105_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-78/Quartet_DNA_BGI_T7_WGE_LCL7_1_20191105_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-79/Quartet_DNA_BGI_T7_WGE_LCL7_2_20191105_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-80/Quartet_DNA_BGI_T7_WGE_LCL7_3_20191105_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-81/Quartet_DNA_BGI_T7_WGE_LCL8_1_20191105_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-82/Quartet_DNA_BGI_T7_WGE_LCL8_2_20191105_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-83/Quartet_DNA_BGI_T7_WGE_LCL8_3_20191105_hc.g.vcf,Quartet_DNA_BGI_T7_WGE_20191105,2,oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-72/Quartet_DNA_BGI_T7_WGE_LCL5_1_20191105_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-73/Quartet_DNA_BGI_T7_WGE_LCL5_2_20191105_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-74/Quartet_DNA_BGI_T7_WGE_LCL5_3_20191105_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-75/Quartet_DNA_BGI_T7_WGE_LCL6_1_20191105_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-76/Quartet_DNA_BGI_T7_WGE_LCL6_2_20191105_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-77/Quartet_DNA_BGI_T7_WGE_LCL6_3_20191105_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-78/Quartet_DNA_BGI_T7_WGE_LCL7_1_20191105_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-79/Quartet_DNA_BGI_T7_WGE_LCL7_2_20191105_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-80/Quartet_DNA_BGI_T7_WGE_LCL7_3_20191105_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-81/Quartet_DNA_BGI_T7_WGE_LCL8_1_20191105_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-82/Quartet_DNA_BGI_T7_WGE_LCL8_2_20191105_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-83/Quartet_DNA_BGI_T7_WGE_LCL8_3_20191105_hc.g.vcf.idx
oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-48/Quartet_DNA_ILM_Nova_ARD_LCL5_1_20181108_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-49/Quartet_DNA_ILM_Nova_ARD_LCL5_2_20181108_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-50/Quartet_DNA_ILM_Nova_ARD_LCL5_3_20181108_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-54/Quartet_DNA_ILM_Nova_ARD_LCL6_1_20181108_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-55/Quartet_DNA_ILM_Nova_ARD_LCL6_2_20181108_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-56/Quartet_DNA_ILM_Nova_ARD_LCL6_3_20181108_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-60/Quartet_DNA_ILM_Nova_ARD_LCL7_1_20181108_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-61/Quartet_DNA_ILM_Nova_ARD_LCL7_2_20181108_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-62/Quartet_DNA_ILM_Nova_ARD_LCL7_3_20181108_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-66/Quartet_DNA_ILM_Nova_ARD_LCL8_1_20181108_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-67/Quartet_DNA_ILM_Nova_ARD_LCL8_2_20181108_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-68/Quartet_DNA_ILM_Nova_ARD_LCL8_3_20181108_hc.g.vcf,Quartet_DNA_ILM_Nova_ARD_20181108,3,oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-48/Quartet_DNA_ILM_Nova_ARD_LCL5_1_20181108_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-49/Quartet_DNA_ILM_Nova_ARD_LCL5_2_20181108_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-50/Quartet_DNA_ILM_Nova_ARD_LCL5_3_20181108_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-54/Quartet_DNA_ILM_Nova_ARD_LCL6_1_20181108_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-55/Quartet_DNA_ILM_Nova_ARD_LCL6_2_20181108_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-56/Quartet_DNA_ILM_Nova_ARD_LCL6_3_20181108_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-60/Quartet_DNA_ILM_Nova_ARD_LCL7_1_20181108_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-61/Quartet_DNA_ILM_Nova_ARD_LCL7_2_20181108_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-62/Quartet_DNA_ILM_Nova_ARD_LCL7_3_20181108_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-66/Quartet_DNA_ILM_Nova_ARD_LCL8_1_20181108_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-67/Quartet_DNA_ILM_Nova_ARD_LCL8_2_20181108_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-68/Quartet_DNA_ILM_Nova_ARD_LCL8_3_20181108_hc.g.vcf.idx
oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-51/Quartet_DNA_ILM_Nova_ARD_LCL5_4_20190111_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-52/Quartet_DNA_ILM_Nova_ARD_LCL5_5_20190111_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-53/Quartet_DNA_ILM_Nova_ARD_LCL5_6_20190111_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-57/Quartet_DNA_ILM_Nova_ARD_LCL6_4_20190111_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-58/Quartet_DNA_ILM_Nova_ARD_LCL6_5_20190111_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-59/Quartet_DNA_ILM_Nova_ARD_LCL6_6_20190111_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-63/Quartet_DNA_ILM_Nova_ARD_LCL7_4_20190111_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-64/Quartet_DNA_ILM_Nova_ARD_LCL7_5_20190111_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-65/Quartet_DNA_ILM_Nova_ARD_LCL7_6_20190111_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-69/Quartet_DNA_ILM_Nova_ARD_LCL8_4_20190111_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-70/Quartet_DNA_ILM_Nova_ARD_LCL8_5_20190111_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-71/Quartet_DNA_ILM_Nova_ARD_LCL8_6_20190111_hc.g.vcf,Quartet_DNA_ILM_Nova_ARD_20190111,4,oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-51/Quartet_DNA_ILM_Nova_ARD_LCL5_4_20190111_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-52/Quartet_DNA_ILM_Nova_ARD_LCL5_5_20190111_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-53/Quartet_DNA_ILM_Nova_ARD_LCL5_6_20190111_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-57/Quartet_DNA_ILM_Nova_ARD_LCL6_4_20190111_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-58/Quartet_DNA_ILM_Nova_ARD_LCL6_5_20190111_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-59/Quartet_DNA_ILM_Nova_ARD_LCL6_6_20190111_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-63/Quartet_DNA_ILM_Nova_ARD_LCL7_4_20190111_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-64/Quartet_DNA_ILM_Nova_ARD_LCL7_5_20190111_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-65/Quartet_DNA_ILM_Nova_ARD_LCL7_6_20190111_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-69/Quartet_DNA_ILM_Nova_ARD_LCL8_4_20190111_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-70/Quartet_DNA_ILM_Nova_ARD_LCL8_5_20190111_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-71/Quartet_DNA_ILM_Nova_ARD_LCL8_6_20190111_hc.g.vcf.idx
oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-96/Quartet_DNA_ILM_Nova_BRG_LCL5_1_20180930_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-97/Quartet_DNA_ILM_Nova_BRG_LCL5_2_20180930_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-98/Quartet_DNA_ILM_Nova_BRG_LCL5_3_20180930_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-99/Quartet_DNA_ILM_Nova_BRG_LCL6_1_20180930_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-100/Quartet_DNA_ILM_Nova_BRG_LCL6_2_20180930_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-101/Quartet_DNA_ILM_Nova_BRG_LCL6_3_20180930_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-102/Quartet_DNA_ILM_Nova_BRG_LCL7_1_20180930_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-103/Quartet_DNA_ILM_Nova_BRG_LCL7_2_20180930_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-104/Quartet_DNA_ILM_Nova_BRG_LCL7_3_20180930_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-105/Quartet_DNA_ILM_Nova_BRG_LCL8_1_20180930_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-106/Quartet_DNA_ILM_Nova_BRG_LCL8_2_20180930_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-107/Quartet_DNA_ILM_Nova_BRG_LCL8_3_20180930_hc.g.vcf,Quartet_DNA_ILM_Nova_BRG_20180930,5,oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-96/Quartet_DNA_ILM_Nova_BRG_LCL5_1_20180930_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-97/Quartet_DNA_ILM_Nova_BRG_LCL5_2_20180930_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-98/Quartet_DNA_ILM_Nova_BRG_LCL5_3_20180930_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-99/Quartet_DNA_ILM_Nova_BRG_LCL6_1_20180930_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-100/Quartet_DNA_ILM_Nova_BRG_LCL6_2_20180930_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-101/Quartet_DNA_ILM_Nova_BRG_LCL6_3_20180930_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-102/Quartet_DNA_ILM_Nova_BRG_LCL7_1_20180930_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-103/Quartet_DNA_ILM_Nova_BRG_LCL7_2_20180930_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-104/Quartet_DNA_ILM_Nova_BRG_LCL7_3_20180930_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-105/Quartet_DNA_ILM_Nova_BRG_LCL8_1_20180930_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-106/Quartet_DNA_ILM_Nova_BRG_LCL8_2_20180930_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-107/Quartet_DNA_ILM_Nova_BRG_LCL8_3_20180930_hc.g.vcf.idx
oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-84/Quartet_DNA_ILM_Nova_WUX_LCL5_1_20190917_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-91/Quartet_DNA_ILM_Nova_WUX_LCL5_2_20190917_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-95/Quartet_DNA_ILM_Nova_WUX_LCL5_3_20190917_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-85/Quartet_DNA_ILM_Nova_WUX_LCL6_1_20190917_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-89/Quartet_DNA_ILM_Nova_WUX_LCL6_2_20190917_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-94/Quartet_DNA_ILM_Nova_WUX_LCL6_3_20190917_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-86/Quartet_DNA_ILM_Nova_WUX_LCL7_1_20190917_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-90/Quartet_DNA_ILM_Nova_WUX_LCL7_2_20190917_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-93/Quartet_DNA_ILM_Nova_WUX_LCL7_3_20190917_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-87/Quartet_DNA_ILM_Nova_WUX_LCL8_1_20190917_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-88/Quartet_DNA_ILM_Nova_WUX_LCL8_2_20190917_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-92/Quartet_DNA_ILM_Nova_WUX_LCL8_3_20190917_hc.g.vcf,Quartet_DNA_ILM_Nova_WUX_20190917,6,oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-84/Quartet_DNA_ILM_Nova_WUX_LCL5_1_20190917_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-91/Quartet_DNA_ILM_Nova_WUX_LCL5_2_20190917_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-95/Quartet_DNA_ILM_Nova_WUX_LCL5_3_20190917_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-85/Quartet_DNA_ILM_Nova_WUX_LCL6_1_20190917_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-89/Quartet_DNA_ILM_Nova_WUX_LCL6_2_20190917_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-94/Quartet_DNA_ILM_Nova_WUX_LCL6_3_20190917_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-86/Quartet_DNA_ILM_Nova_WUX_LCL7_1_20190917_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-90/Quartet_DNA_ILM_Nova_WUX_LCL7_2_20190917_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-93/Quartet_DNA_ILM_Nova_WUX_LCL7_3_20190917_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-87/Quartet_DNA_ILM_Nova_WUX_LCL8_1_20190917_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-88/Quartet_DNA_ILM_Nova_WUX_LCL8_2_20190917_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-92/Quartet_DNA_ILM_Nova_WUX_LCL8_3_20190917_hc.g.vcf.idx
oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-12/Quartet_DNA_ILM_XTen_ARD_LCL5_1_20170403_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-13/Quartet_DNA_ILM_XTen_ARD_LCL5_2_20170403_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-14/Quartet_DNA_ILM_XTen_ARD_LCL5_3_20170403_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-15/Quartet_DNA_ILM_XTen_ARD_LCL6_1_20170403_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-16/Quartet_DNA_ILM_XTen_ARD_LCL6_2_20170403_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-17/Quartet_DNA_ILM_XTen_ARD_LCL6_3_20170403_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-18/Quartet_DNA_ILM_XTen_ARD_LCL7_1_20170403_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-19/Quartet_DNA_ILM_XTen_ARD_LCL7_2_20170403_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-20/Quartet_DNA_ILM_XTen_ARD_LCL7_3_20170403_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-21/Quartet_DNA_ILM_XTen_ARD_LCL8_1_20170403_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-22/Quartet_DNA_ILM_XTen_ARD_LCL8_2_20170403_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-23/Quartet_DNA_ILM_XTen_ARD_LCL8_3_20170403_hc.g.vcf,Quartet_DNA_ILM_XTen_ARD_20170403,7,oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-12/Quartet_DNA_ILM_XTen_ARD_LCL5_1_20170403_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-13/Quartet_DNA_ILM_XTen_ARD_LCL5_2_20170403_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-14/Quartet_DNA_ILM_XTen_ARD_LCL5_3_20170403_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-15/Quartet_DNA_ILM_XTen_ARD_LCL6_1_20170403_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-16/Quartet_DNA_ILM_XTen_ARD_LCL6_2_20170403_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-17/Quartet_DNA_ILM_XTen_ARD_LCL6_3_20170403_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-18/Quartet_DNA_ILM_XTen_ARD_LCL7_1_20170403_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-19/Quartet_DNA_ILM_XTen_ARD_LCL7_2_20170403_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-20/Quartet_DNA_ILM_XTen_ARD_LCL7_3_20170403_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-21/Quartet_DNA_ILM_XTen_ARD_LCL8_1_20170403_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-22/Quartet_DNA_ILM_XTen_ARD_LCL8_2_20170403_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-23/Quartet_DNA_ILM_XTen_ARD_LCL8_3_20170403_hc.g.vcf.idx
oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-24/Quartet_DNA_ILM_XTen_NVG_LCL5_1_20170329_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-25/Quartet_DNA_ILM_XTen_NVG_LCL5_2_20170329_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-26/Quartet_DNA_ILM_XTen_NVG_LCL5_3_20170329_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-27/Quartet_DNA_ILM_XTen_NVG_LCL6_1_20170329_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-28/Quartet_DNA_ILM_XTen_NVG_LCL6_2_20170329_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-29/Quartet_DNA_ILM_XTen_NVG_LCL6_3_20170329_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-30/Quartet_DNA_ILM_XTen_NVG_LCL7_1_20170329_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-31/Quartet_DNA_ILM_XTen_NVG_LCL7_2_20170329_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-32/Quartet_DNA_ILM_XTen_NVG_LCL7_3_20170329_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-33/Quartet_DNA_ILM_XTen_NVG_LCL8_1_20170329_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-34/Quartet_DNA_ILM_XTen_NVG_LCL8_2_20170329_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-35/Quartet_DNA_ILM_XTen_NVG_LCL8_3_20170329_hc.g.vcf,Quartet_DNA_ILM_XTen_NVG_20170329,8,oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-24/Quartet_DNA_ILM_XTen_NVG_LCL5_1_20170329_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-25/Quartet_DNA_ILM_XTen_NVG_LCL5_2_20170329_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-26/Quartet_DNA_ILM_XTen_NVG_LCL5_3_20170329_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-27/Quartet_DNA_ILM_XTen_NVG_LCL6_1_20170329_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-28/Quartet_DNA_ILM_XTen_NVG_LCL6_2_20170329_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-29/Quartet_DNA_ILM_XTen_NVG_LCL6_3_20170329_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-30/Quartet_DNA_ILM_XTen_NVG_LCL7_1_20170329_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-31/Quartet_DNA_ILM_XTen_NVG_LCL7_2_20170329_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-32/Quartet_DNA_ILM_XTen_NVG_LCL7_3_20170329_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-33/Quartet_DNA_ILM_XTen_NVG_LCL8_1_20170329_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-34/Quartet_DNA_ILM_XTen_NVG_LCL8_2_20170329_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-35/Quartet_DNA_ILM_XTen_NVG_LCL8_3_20170329_hc.g.vcf.idx
oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-36/Quartet_DNA_ILM_XTen_WUX_LCL5_1_20170216_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-37/Quartet_DNA_ILM_XTen_WUX_LCL5_2_20170216_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-38/Quartet_DNA_ILM_XTen_WUX_LCL5_3_20170216_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-39/Quartet_DNA_ILM_XTen_WUX_LCL6_1_20170216_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-40/Quartet_DNA_ILM_XTen_WUX_LCL6_2_20170216_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-41/Quartet_DNA_ILM_XTen_WUX_LCL6_3_20170216_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-42/Quartet_DNA_ILM_XTen_WUX_LCL7_1_20170216_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-43/Quartet_DNA_ILM_XTen_WUX_LCL7_2_20170216_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-44/Quartet_DNA_ILM_XTen_WUX_LCL7_3_20170216_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-45/Quartet_DNA_ILM_XTen_WUX_LCL8_1_20170216_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-46/Quartet_DNA_ILM_XTen_WUX_LCL8_2_20170216_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-47/Quartet_DNA_ILM_XTen_WUX_LCL8_3_20170216_hc.g.vcf,Quartet_DNA_ILM_XTen_WUX_20170216,9,oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-36/Quartet_DNA_ILM_XTen_WUX_LCL5_1_20170216_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-37/Quartet_DNA_ILM_XTen_WUX_LCL5_2_20170216_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-38/Quartet_DNA_ILM_XTen_WUX_LCL5_3_20170216_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-39/Quartet_DNA_ILM_XTen_WUX_LCL6_1_20170216_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-40/Quartet_DNA_ILM_XTen_WUX_LCL6_2_20170216_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-41/Quartet_DNA_ILM_XTen_WUX_LCL6_3_20170216_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-42/Quartet_DNA_ILM_XTen_WUX_LCL7_1_20170216_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-43/Quartet_DNA_ILM_XTen_WUX_LCL7_2_20170216_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-44/Quartet_DNA_ILM_XTen_WUX_LCL7_3_20170216_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-45/Quartet_DNA_ILM_XTen_WUX_LCL8_1_20170216_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-46/Quartet_DNA_ILM_XTen_WUX_LCL8_2_20170216_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-47/Quartet_DNA_ILM_XTen_WUX_LCL8_3_20170216_hc.g.vcf.idx






BIN
tasks/.DS_Store Vedi File


+ 48
- 0
tasks/BQSR.wdl Vedi File

@@ -0,0 +1,48 @@
task BQSR {
File ref_dir
File dbsnp_dir
File dbmills_dir
String sample
String SENTIEON_INSTALL_DIR
String fasta
String dbsnp
String db_mills
File realigned_bam
File realigned_bam_index
String docker
String cluster_config
String disk_size

command <<<
set -o pipefail
set -e
export SENTIEON_LICENSE=192.168.0.55:8990
nt=$(nproc)

${SENTIEON_INSTALL_DIR}/bin/sentieon driver -r ${ref_dir}/${fasta} -t $nt -i ${realigned_bam} --algo QualCal -k ${dbsnp_dir}/${dbsnp} -k ${dbmills_dir}/${db_mills} ${sample}_recal_data.table

${SENTIEON_INSTALL_DIR}/bin/sentieon driver -r ${ref_dir}/${fasta} -t $nt -i ${realigned_bam} -q ${sample}_recal_data.table --algo QualCal -k ${dbsnp_dir}/${dbsnp} -k ${dbmills_dir}/${db_mills} ${sample}_recal_data.table.post --algo ReadWriter ${sample}.sorted.deduped.realigned.recaled.bam

${SENTIEON_INSTALL_DIR}/bin/sentieon driver -t $nt --algo QualCal --plot --before ${sample}_recal_data.table --after ${sample}_recal_data.table.post ${sample}_recal_data.csv

${SENTIEON_INSTALL_DIR}/bin/sentieon plot QualCal -o ${sample}_bqsrreport.pdf ${sample}_recal_data.csv

>>>
runtime {
docker:docker
cluster: cluster_config
systemDisk: "cloud_ssd 40"
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
}

output {
File recal_table = "${sample}_recal_data.table"
File recal_post = "${sample}_recal_data.table.post"
File recaled_bam = "${sample}.sorted.deduped.realigned.recaled.bam"
File recaled_bam_index = "${sample}.sorted.deduped.realigned.recaled.bam.bai"
File recal_csv = "${sample}_recal_data.csv"
File bqsrreport_pdf = "${sample}_bqsrreport.pdf"
}
}

+ 49
- 0
tasks/D5_D6.wdl Vedi File

@@ -0,0 +1,49 @@
task D5_D6 {
Array[File] splited_vcf
String project
String docker
String cluster_config
String disk_size
command <<<

mkdir -p /cromwell_root/tmp/vcf

cp ${sep=" " splited_vcf} /cromwell_root/tmp/vcf

for i in /cromwell_root/tmp/vcf/*vcf
do
for j in /cromwell_root/tmp/vcf/*vcf
do
sample_i=$(echo $i | cut -f7 -d_)
sample_j=$(echo $j | cut -f7 -d_)
rep_i=$(echo $i | cut -f8 -d_)
rep_j=$(echo $j | cut -f8 -d_)
if [[ $sample_i == "LCL5" ] && [ $sample_j == "LCL6" ] && [ $rep_i == $rep_j ]];then
cat $i | grep -v '##' | cut -f1,2,4,5,10 | cut -d ':' -f1 > LCL5.txt
cat $j | grep -v '##' | cut -f10 | cut -d ':' -f1 > LCL6.txt
paste LCL5.txt LCL6.txt > sister.txt
python /opt/D5_D6.py -sister sister.txt -project ${project}.$rep_i
fi
done
done

for i in *.reproducibility.txt
do
cat $i | sed -n '2,2p' >> sister.summary
done
sed '1i\Family\tReproducibility_D5_D6' sister.summary > ${project}.sister.txt

>>>

runtime {
docker:docker
cluster: cluster_config
systemDisk: "cloud_ssd 40"
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
}
output {
Array[File] sister_file = glob("*.reproducibility.txt")
File sister_summary = "${project}.sister.txt"
}
}

+ 40
- 0
tasks/Dedup.wdl Vedi File

@@ -0,0 +1,40 @@
task Dedup {

String SENTIEON_INSTALL_DIR
String sample

File sorted_bam
File sorted_bam_index
String docker
String cluster_config
String disk_size


command <<<
set -o pipefail
set -e
export SENTIEON_LICENSE=192.168.0.55:8990
nt=$(nproc)
${SENTIEON_INSTALL_DIR}/bin/sentieon driver -t $nt -i ${sorted_bam} --algo LocusCollector --fun score_info ${sample}_score.txt
${SENTIEON_INSTALL_DIR}/bin/sentieon driver -t $nt -i ${sorted_bam} --algo Dedup --rmdup --score_info ${sample}_score.txt --metrics ${sample}_dedup_metrics.txt ${sample}.sorted.deduped.bam
>>>
runtime {
docker:docker
cluster: cluster_config
systemDisk: "cloud_ssd 40"
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
}

output {
File score = "${sample}_score.txt"
File dedup_metrics = "${sample}_dedup_metrics.txt"
File Dedup_bam = "${sample}.sorted.deduped.bam"
File Dedup_bam_index = "${sample}.sorted.deduped.bam.bai"
}
}







+ 34
- 0
tasks/GVCFtyper.wdl Vedi File

@@ -0,0 +1,34 @@
task GVCFtyper {
File ref_dir
String SENTIEON_INSTALL_DIR
String fasta
Array[File] vcf
Array[File] vcf_idx
String project
String docker
String cluster_config
String disk_size

command <<<
set -o pipefail
set -e
export SENTIEON_LICENSE=192.168.0.55:8990
nt=$(nproc)
${SENTIEON_INSTALL_DIR}/bin/sentieon driver -r ${ref_dir}/${fasta} --algo GVCFtyper ${project}.joint.g.vcf ${sep=" " vcf}
>>>
runtime {
docker:docker
cluster: cluster_config
systemDisk: "cloud_ssd 40"
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
}

output {
File merged_gvcf = "${project}.joint.g.vcf"
File merged_gvcf_idx = "${project}.joint.g.vcf.idx"
}
}



+ 34
- 0
tasks/Haplotyper_gVCF.wdl Vedi File

@@ -0,0 +1,34 @@
task Haplotyper_gVCF {
File ref_dir
String SENTIEON_INSTALL_DIR
String fasta
File recaled_bam
File recaled_bam_index
String sample
String docker
String cluster_config
String disk_size

command <<<
set -o pipefail
set -e
export SENTIEON_LICENSE=192.168.0.55:8990
nt=$(nproc)
${SENTIEON_INSTALL_DIR}/bin/sentieon driver -r ${ref_dir}/${fasta} -t $nt -i ${recaled_bam} --algo Haplotyper --emit_mode gvcf ${sample}_hc.g.vcf
>>>
runtime {
docker:docker
cluster: cluster_config
systemDisk: "cloud_ssd 40"
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
}

output {
File vcf = "${sample}_hc.g.vcf"
File vcf_idx = "${sample}_hc.g.vcf.idx"
}
}



+ 53
- 0
tasks/Metrics.wdl Vedi File

@@ -0,0 +1,53 @@
task Metrics {

File ref_dir
String SENTIEON_INSTALL_DIR
String sample
String docker
String cluster_config

String fasta
File sorted_bam
File sorted_bam_index
String disk_size

command <<<
set -o pipefail
set -e
export SENTIEON_LICENSE=192.168.0.55:8990
nt=$(nproc)
${SENTIEON_INSTALL_DIR}/bin/sentieon driver -r ${ref_dir}/${fasta} -t $nt -i ${sorted_bam} --algo MeanQualityByCycle ${sample}_mq_metrics.txt --algo QualDistribution ${sample}_qd_metrics.txt --algo GCBias --summary ${sample}_gc_summary.txt ${sample}_gc_metrics.txt --algo AlignmentStat ${sample}_aln_metrics.txt --algo InsertSizeMetricAlgo ${sample}_is_metrics.txt --algo CoverageMetrics --omit_base_output ${sample}_coverage_metrics

${SENTIEON_INSTALL_DIR}/bin/sentieon plot metrics -o ${sample}_metrics_report.pdf gc=${sample}_gc_metrics.txt qd=${sample}_qd_metrics.txt mq=${sample}_mq_metrics.txt isize=${sample}_is_metrics.txt
>>>
runtime {
docker:docker
cluster: cluster_config
systemDisk: "cloud_ssd 40"
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
}
output {
File qd_metrics = "${sample}_qd_metrics.txt"
File qd_metrics_pdf = "${sample}_qd_metrics.pdf"
File mq_metrics = "${sample}_mq_metrics.txt"
File mq_metrics_pdf = "${sample}_mq_metrics.pdf"
File is_metrics = "${sample}_is_metrics.txt"
File is_metrics_pdf = "${sample}_is_metrics.pdf"
File gc_summary = "${sample}_gc_summary.txt"
File gc_metrics = "${sample}_gc_metrics.txt"
File gc_metrics_pdf = "${sample}_gc_metrics.pdf"
File aln_metrics = "${sample}_aln_metrics.txt"
File coverage_metrics_sample_summary = "${sample}_coverage_metrics.sample_summary"
File coverage_metrics_sample_statistics = "${sample}_coverage_metrics.sample_statistics"
File coverage_metrics_sample_interval_statistics = "${sample}_coverage_metrics.sample_interval_statistics"
File coverage_metrics_sample_cumulative_coverage_proportions = "${sample}_coverage_metrics.sample_cumulative_coverage_proportions"
File coverage_metrics_sample_cumulative_coverage_counts = "${sample}_coverage_metrics.sample_cumulative_coverage_counts"
}

}






+ 41
- 0
tasks/Realigner.wdl Vedi File

@@ -0,0 +1,41 @@
task Realigner {

File ref_dir
File dbmills_dir

String SENTIEON_INSTALL_DIR
String sample
String fasta

File Dedup_bam
File Dedup_bam_index
String db_mills
String docker
String cluster_config
String disk_size


command <<<
set -o pipefail
set -e
export SENTIEON_LICENSE=192.168.0.55:8990
nt=$(nproc)
${SENTIEON_INSTALL_DIR}/bin/sentieon driver -r ${ref_dir}/${fasta} -t $nt -i ${Dedup_bam} --algo Realigner -k ${dbmills_dir}/${db_mills} ${sample}.sorted.deduped.realigned.bam
>>>

runtime {
docker:docker
cluster: cluster_config
systemDisk: "cloud_ssd 40"
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
}

output {
File realigner_bam = "${sample}.sorted.deduped.realigned.bam"
File realigner_bam_index = "${sample}.sorted.deduped.realigned.bam.bai"

}
}



+ 105
- 0
tasks/benchmark.wdl Vedi File

@@ -0,0 +1,105 @@
task benchmark {
File vcf
File benchmarking_dir
File contig
File sdf
String project
String sample = basename(vcf,".raw.vcf.gz")
String docker
String cluster_config
String disk_size


command <<<
set -o pipefail
set -e
nt=$(nproc)
mkdir -p /cromwell_root/tmp
cp -r ${benchmarking_dir} /cromwell_root/tmp/
if [[ ${sample} =~ "gz" ]];then
gunzip ${vcf} -c > unzip.vcf
else
cp ${vcf} unzip.vcf
fi

cat unzip.vcf | grep '#CHROM' > name
cat unzip.vcf | grep -v '#' > body
cat body | grep -w '^chr1\|^chr2\|^chr3\|^chr4\|^chr5\|^chr6\|^chr7\|^chr8\|^chr9\|^chr10\|^chr11\|^chr12\|^chr13\|^chr14\|^chr15\|^chr16\|^chr17\|^chr18\|^chr19\|^chr20\|^chr21\|^chr22\|^chrX' > body.filtered

if grep -q PASS "body.filtered"; then
cat body.filtered | grep -v '0/0' | grep 'PASS' > body.filtered.gt
cat unzip.vcf | grep '##' | grep -v 'contig' | cat - ${contig} name body.filtered.gt > filtered.vcf
else
cat body.filtered | grep -v '0/0' > body.filtered.gt
cat unzip.vcf | grep '##' | grep -v 'contig' | cat - ${contig} name body.filtered.gt > filtered.vcf
fi

echo -e "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tLCL5" > LCL5_name
echo -e "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tLCL6" > LCL6_name
echo -e "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tLCL7" > LCL7_name
echo -e "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tLCL8" > LCL8_name


/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg bgzip filtered.vcf -c > ${project}.${sample}.rtg.vcf.gz
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg index -f vcf ${project}.${sample}.rtg.vcf.gz
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg vcffilter -i ${project}.${sample}.rtg.vcf.gz -o ${sample}.rtg.SNV.vcf.gz --snps-only
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg vcffilter -i ${project}.${sample}.rtg.vcf.gz -o ${sample}.rtg.INDEL.vcf.gz --non-snps-only


if [[ ${sample} =~ "LCL5" ]];then
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg vcfeval -b /cromwell_root/tmp/reference_datasets_v202103/LCL5.high.confidence.calls.SNV.vcf.gz -c ${sample}.rtg.SNV.vcf.gz -o ${sample}_SNV -t ${sdf} --evaluation-regions=/cromwell_root/tmp/reference_datasets_v202103/Quartet.high.confidence.region.v202103.bed
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg vcfeval -b /cromwell_root/tmp/reference_datasets_v202103/LCL5.high.confidence.calls.INDEL.vcf.gz -c ${sample}.rtg.INDEL.vcf.gz -o ${sample}_INDEL -t ${sdf} --evaluation-regions=/cromwell_root/tmp/reference_datasets_v202103/Quartet.high.confidence.region.v202103.bed
cat filtered.vcf | grep '##' > header
cat filtered.vcf | grep -v '#' > body
cat header LCL5_name body > LCL5.vcf
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg bgzip LCL5.vcf -c > ${project}.${sample}.reformed.vcf.gz
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg index -f vcf ${project}.${sample}.reformed.vcf.gz
elif [[ ${sample} =~ "LCL6" ]]; then
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg vcfeval -b /cromwell_root/tmp/reference_datasets_v202103/LCL6.high.confidence.calls.SNV.vcf.gz -c ${sample}.rtg.SNV.vcf.gz -o ${sample}_SNV -t ${sdf} --evaluation-regions=/cromwell_root/tmp/reference_datasets_v202103/Quartet.high.confidence.region.v202103.bed
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg vcfeval -b /cromwell_root/tmp/reference_datasets_v202103/LCL6.high.confidence.calls.INDEL.vcf.gz -c ${sample}.rtg.INDEL.vcf.gz -o ${sample}_INDEL -t ${sdf} --evaluation-regions=/cromwell_root/tmp/reference_datasets_v202103/Quartet.high.confidence.region.v202103.bed
cat filtered.vcf | grep '##' > header
cat filtered.vcf | grep -v '#' > body
cat header LCL6_name body > LCL6.vcf
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg bgzip LCL6.vcf -c > ${project}.${sample}.reformed.vcf.gz
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg index -f vcf ${project}.${sample}.reformed.vcf.gz
elif [[ ${sample} =~ "LCL7" ]]; then
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg vcfeval -b /cromwell_root/tmp/reference_datasets_v202103/LCL7.high.confidence.calls.SNV.vcf.gz -c ${sample}.rtg.SNV.vcf.gz -o ${sample}_SNV -t ${sdf} --evaluation-regions=/cromwell_root/tmp/reference_datasets_v202103/Quartet.high.confidence.region.v202103.bed
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg vcfeval -b /cromwell_root/tmp/reference_datasets_v202103/LCL7.high.confidence.calls.INDEL.vcf.gz -c ${sample}.rtg.INDEL.vcf.gz -o ${sample}_INDEL -t ${sdf} --evaluation-regions=/cromwell_root/tmp/reference_datasets_v202103/Quartet.high.confidence.region.v202103.bed
cat filtered.vcf | grep '##' > header
cat filtered.vcf | grep -v '#' > body
cat header LCL7_name body > LCL7.vcf
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg bgzip LCL7.vcf -c > ${project}.${sample}.reformed.vcf.gz
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg index -f vcf ${project}.${sample}.reformed.vcf.gz
elif [[ ${sample} =~ "LCL8" ]]; then
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg vcfeval -b /cromwell_root/tmp/reference_datasets_v202103/LCL8.high.confidence.calls.SNV.vcf.gz -c ${sample}.rtg.SNV.vcf.gz -o ${sample}_SNV -t ${sdf} --evaluation-regions=/cromwell_root/tmp/reference_datasets_v202103/Quartet.high.confidence.region.v202103.bed
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg vcfeval -b /cromwell_root/tmp/reference_datasets_v202103/LCL8.high.confidence.calls.INDEL.vcf.gz -c ${sample}.rtg.INDEL.vcf.gz -o ${sample}_INDEL -t ${sdf} --evaluation-regions=/cromwell_root/tmp/reference_datasets_v202103/Quartet.high.confidence.region.v202103.bed
cat filtered.vcf | grep '##' > header
cat filtered.vcf | grep -v '#' > body
cat header LCL8_name body > LCL8.vcf
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg bgzip LCL8.vcf -c > ${project}.${sample}.reformed.vcf.gz
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg index -f vcf ${project}.${sample}.reformed.vcf.gz
else
echo "only for quartet samples"
fi


cat ${sample}_SNV/summary.txt > ${project}.${sample}_SNV_precision_recall.txt
cat ${sample}_INDEL/summary.txt > ${project}.${sample}_INDEL_precision_recall.txt
>>>

runtime {
docker:docker
cluster:cluster_config
systemDisk:"cloud_ssd 40"
dataDisk:"cloud_ssd " + disk_size + " /cromwell_root/"
}

output {
File rtg_vcf = "${project}.${sample}.reformed.vcf.gz"
File rtg_vcf_index = "${project}.${sample}.reformed.vcf.gz.tbi"
File SNV_result = "${project}.${sample}_SNV_precision_recall.txt"
File Indel_result = "${project}.${sample}_INDEL_precision_recall.txt"
}
}

+ 44
- 0
tasks/corealigner.wdl Vedi File

@@ -0,0 +1,44 @@
task corealigner {
File ref_dir
File dbsnp_dir
File dbmills_dir

String sample
String SENTIEON_INSTALL_DIR
String docker
String cluster_config
String fasta

String dbsnp
String db_mills
File tumor_recaled_bam
File tumor_recaled_bam_index
File normal_recaled_bam
File normal_recaled_bam_index
String disk_size


command <<<
set -o pipefail
set -e
export SENTIEON_LICENSE=192.168.0.55:8990
nt=$(nproc)
${SENTIEON_INSTALL_DIR}/bin/sentieon driver -r ${ref_dir}/${fasta} -t $nt -i ${tumor_recaled_bam} -i ${normal_recaled_bam} --algo Realigner -k ${db_mills} -k ${dbsnp} ${sample}_corealigned.bam
>>>
runtime {
docker:docker
cluster: cluster_config
systemDisk: "cloud_ssd 40"
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
}

output {
File corealigner_bam = "${sample}_corealigned.bam"
File corealigner_bam_index = "${sample}_corealigned.bam.bai"
}
}




+ 37
- 0
tasks/deduped_Metrics.wdl Vedi File

@@ -0,0 +1,37 @@
task deduped_Metrics {

File ref_dir
String SENTIEON_INSTALL_DIR
String sample
String fasta
File Dedup_bam
File Dedup_bam_index
String docker
String cluster_config
String disk_size


command <<<
set -o pipefail
set -e
export SENTIEON_LICENSE=192.168.0.55:8990
nt=$(nproc)
${SENTIEON_INSTALL_DIR}/bin/sentieon driver -r ${ref_dir}/${fasta} -t $nt -i ${Dedup_bam} --algo CoverageMetrics --omit_base_output ${sample}_deduped_coverage_metrics --algo MeanQualityByCycle ${sample}_deduped_mq_metrics.txt --algo QualDistribution ${sample}_deduped_qd_metrics.txt --algo GCBias --summary ${sample}_deduped_gc_summary.txt ${sample}_deduped_gc_metrics.txt --algo AlignmentStat ${sample}_deduped_aln_metrics.txt --algo InsertSizeMetricAlgo ${sample}_deduped_is_metrics.txt
>>>

runtime {
docker:docker
cluster: cluster_config
systemDisk: "cloud_ssd 40"
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
}

output {
File deduped_coverage_metrics_sample_summary = "${sample}_deduped_coverage_metrics.sample_summary"
File deduped_coverage_metrics_sample_statistics = "${sample}_deduped_coverage_metrics.sample_statistics"
File deduped_coverage_metrics_sample_interval_statistics = "${sample}_deduped_coverage_metrics.sample_interval_statistics"
File deduped_coverage_metrics_sample_cumulative_coverage_proportions = "${sample}_deduped_coverage_metrics.sample_cumulative_coverage_proportions"
File deduped_coverage_metrics_sample_cumulative_coverage_counts = "${sample}_deduped_coverage_metrics.sample_cumulative_coverage_counts"
}
}

+ 29
- 0
tasks/extract_multiqc.wdl Vedi File

@@ -0,0 +1,29 @@
task extract_multiqc {

File fastqc_qualimap
File fastqc
File fastqscreen
File hap

String docker
String cluster_config
String disk_size

command <<<
python /opt/extract_multiqc.py -fastqc_qualimap ${fastqc_qualimap} -fastqc ${fastqc} -fastqscreen ${fastqscreen} -hap ${hap}
>>>

runtime {
docker:docker
cluster:cluster_config
systemDisk:"cloud_ssd 40"
dataDisk:"cloud_ssd " + disk_size + " /cromwell_root/"
}

output {
File fastqc_result = "fastqc.final.result.txt"
File fastqscreen_result = "fastqscreen.final.result.txt"
File qualimap_result = "qualimap.final.result.txt"
File hap_result = "benchmark.final.result.txt"
}
}

+ 28
- 0
tasks/fastqc.wdl Vedi File

@@ -0,0 +1,28 @@
task fastqc {
File read1
File read2
String docker
String cluster_config
String disk_size

command <<<
set -o pipefail
set -e
nt=$(nproc)
fastqc -t $nt -o ./ ${read1}
fastqc -t $nt -o ./ ${read2}
>>>

runtime {
docker:docker
cluster: cluster_config
systemDisk: "cloud_ssd 40"
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
}
output {
File read1_html = sub(basename(read1), "\\.(fastq|fq)\\.gz$", "_fastqc.html")
File read1_zip = sub(basename(read1), "\\.(fastq|fq)\\.gz$", "_fastqc.zip")
File read2_html = sub(basename(read2), "\\.(fastq|fq)\\.gz$", "_fastqc.html")
File read2_zip = sub(basename(read2), "\\.(fastq|fq)\\.gz$", "_fastqc.zip")
}
}

+ 36
- 0
tasks/fastqscreen.wdl Vedi File

@@ -0,0 +1,36 @@
task fastq_screen {
File read1
File read2
File screen_ref_dir
File fastq_screen_conf
String read1name = basename(read1,".fastq.gz")
String read2name = basename(read2,".fastq.gz")
String docker
String cluster_config
String disk_size

command <<<
set -o pipefail
set -e
nt=$(nproc)
mkdir -p /cromwell_root/tmp
cp -r ${screen_ref_dir} /cromwell_root/tmp/
fastq_screen --aligner bowtie2 --conf ${fastq_screen_conf} --top 100000 --threads $nt ${read1}
fastq_screen --aligner bowtie2 --conf ${fastq_screen_conf} --top 100000 --threads $nt ${read2}
>>>

runtime {
docker:docker
cluster: cluster_config
systemDisk: "cloud_ssd 40"
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
}
output {
File png1 = "${read1name}_screen.png"
File txt1 = "${read1name}_screen.txt"
File html1 = "${read1name}_screen.html"
File png2 = "${read2name}_screen.png"
File txt2 = "${read2name}_screen.txt"
File html2 = "${read2name}_screen.html"
}
}

+ 34
- 0
tasks/mapping.wdl Vedi File

@@ -0,0 +1,34 @@
task mapping {

File ref_dir
String fasta
File fastq_1
File fastq_2

String SENTIEON_INSTALL_DIR
String group
String sample
String pl
String docker
String cluster_config
String disk_size

command <<<
set -o pipefail
set -e
export SENTIEON_LICENSE=192.168.0.55:8990
nt=$(nproc)
${SENTIEON_INSTALL_DIR}/bin/bwa mem -M -R "@RG\tID:${group}\tSM:${sample}\tPL:${pl}" -t $nt -K 10000000 ${ref_dir}/${fasta} ${fastq_1} ${fastq_2} | ${SENTIEON_INSTALL_DIR}/bin/sentieon util sort -o ${sample}.sorted.bam -t $nt --sam2bam -i -
>>>

runtime {
docker:docker
cluster: cluster_config
systemDisk: "cloud_ssd 40"
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
}
output {
File sorted_bam = "${sample}.sorted.bam"
File sorted_bam_index = "${sample}.sorted.bam.bai"
}
}

+ 79
- 0
tasks/mendelian.wdl Vedi File

@@ -0,0 +1,79 @@
task mendelian {
File family_vcf
File family_vcf_inside
File family_vcf_outside
File ref_dir
String family_name = basename(family_vcf,".family.vcf")
String fasta
String docker
String cluster_config
String disk_size
command <<<
export LD_LIBRARY_PATH=/opt/htslib-1.9
nt=$(nproc)

echo -e "${family_name}\tLCL8\t0\t0\t2\t-9\n${family_name}\tLCL7\t0\t0\t1\t-9\n${family_name}\tLCL5\tLCL7\tLCL8\t2\t-9" > ${family_name}.D5.ped

mkdir VBT_D5
/opt/VBT-TrioAnalysis/vbt mendelian -ref ${ref_dir}/${fasta} -mother ${family_vcf} -father ${family_vcf} -child ${family_vcf} -pedigree ${family_name}.D5.ped -outDir VBT_D5 -out-prefix ${family_name}.D5 --output-violation-regions -thread-count $nt

cat VBT_D5/${family_name}.D5_trio.vcf > ${family_name}.D5.vcf

mkdir VBT_D5_inside
/opt/VBT-TrioAnalysis/vbt mendelian -ref ${ref_dir}/${fasta} -mother ${family_vcf_inside} -father ${family_vcf_inside} -child ${family_vcf_inside} -pedigree ${family_name}.D5.ped -outDir VBT_D5_inside -out-prefix ${family_name}.D5.inside --output-violation-regions -thread-count $nt

cat VBT_D5_inside/${family_name}.D5.inside_trio.vcf > ${family_name}.D5.inside.vcf


mkdir VBT_D5_outside
/opt/VBT-TrioAnalysis/vbt mendelian -ref ${ref_dir}/${fasta} -mother ${family_vcf_outside} -father ${family_vcf_outside} -child ${family_vcf_outside} -pedigree ${family_name}.D5.ped -outDir VBT_D5_outside -out-prefix ${family_name}.D5.outside --output-violation-regions -thread-count $nt

cat VBT_D5_inside/${family_name}.D5.outside_trio.vcf > ${family_name}.D5.outside.vcf


echo -e "${family_name}\tLCL8\t0\t0\t2\t-9\n${family_name}\tLCL7\t0\t0\t1\t-9\n${family_name}\tLCL6\tLCL7\tLCL8\t2\t-9" > ${family_name}.D6.ped

mkdir VBT_D6
/opt/VBT-TrioAnalysis/vbt mendelian -ref ${ref_dir}/${fasta} -mother ${family_vcf} -father ${family_vcf} -child ${family_vcf} -pedigree ${family_name}.D6.ped -outDir VBT_D6 -out-prefix ${family_name}.D6 --output-violation-regions -thread-count $nt

cat VBT_D6/${family_name}.D6_trio.vcf > ${family_name}.D6.vcf

mkdir VBT_D6_inside
/opt/VBT-TrioAnalysis/vbt mendelian -ref ${ref_dir}/${fasta} -mother ${family_vcf_inside} -father ${family_vcf_inside} -child ${family_vcf_inside} -pedigree ${family_name}.D6.ped -outDir VBT_D6_inside -out-prefix ${family_name}.D6.inside --output-violation-regions -thread-count $nt

cat VBT_D6_inside/${family_name}.D6.inside_trio.vcf > ${family_name}.D6.inside.vcf


mkdir VBT_D6_outside
/opt/VBT-TrioAnalysis/vbt mendelian -ref ${ref_dir}/${fasta} -mother ${family_vcf_outside} -father ${family_vcf_outside} -child ${family_vcf_outside} -pedigree ${family_name}.D6.ped -outDir VBT_D6_outside -out-prefix ${family_name}.D6.outside --output-violation-regions -thread-count $nt

cat VBT_D6_inside/${family_name}.D6.outside_trio.vcf > ${family_name}.D6.outside.vcf
>>>

runtime {
docker:docker
cluster: cluster_config
systemDisk: "cloud_ssd 40"
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
}
output {
File D5_ped = "${family_name}.D5.ped"
File D6_ped = "${family_name}.D6.ped"
Array[File] D5_mendelian = glob("VBT_D5/*")
Array[File] D6_mendelian = glob("VBT_D6/*")
File D5_trio_vcf = "${family_name}.D5.vcf"
File D6_trio_vcf = "${family_name}.D6.vcf"
Array[File] D5_mendelian_inside = glob("VBT_D5_inside/*")
Array[File] D6_mendelian_inside = glob("VBT_D6_inside/*")
File D5_trio_vcf_inside = "${family_name}.D5.inside.vcf"
File D6_trio_vcf_inside = "${family_name}.D6.inside.vcf"
Array[File] D5_mendelian_outside = glob("VBT_D5_outside/*")
Array[File] D6_mendelian_outside = glob("VBT_D6_outside/*")
File D5_trio_vcf_outside = "${family_name}.D5.outside.vcf"
File D6_trio_vcf_outside = "${family_name}.D6.outside.vcf"
}
}




+ 26
- 0
tasks/mergeNum.wdl Vedi File

@@ -0,0 +1,26 @@
task mergeNum {
Array[File] vcfnumber
String docker
String cluster_config
String disk_size

command <<<
set -o pipefail
set -e
for i in ${sep=" " vcfnumber}
do
cat $i | cut -d':' -f2 | tr '\n' '\t' | sed s'/\t$/\n/g' >> vcfstats
done
sed '1i\File\tFailed Filters\tPassed Filters\tSNPs\tMNPs\tInsertions\tDeletions\tIndels\tSame as reference\tSNP Transitions/Transversions\tTotal Het/Hom ratio\tSNP Het/Hom ratio\tMNP Het/Hom ratio\tInsertion Het/Hom ratio\tDeletion Het/Hom ratio\tIndel Het/Hom ratio\tInsertion/Deletion ratio\tIndel/SNP+MNP ratio' vcfstats > vcfstats.txt
>>>

runtime {
docker:docker
cluster:cluster_config
systemDisk:"cloud_ssd 40"
dataDisk:"cloud_ssd " + disk_size + " /cromwell_root/"
}
output {
File vcfstat="vcfstats.txt"
}
}

+ 40
- 0
tasks/mergeSentieon.wdl Vedi File

@@ -0,0 +1,40 @@
task mergeSentieon {
Array[File] aln_metrics_header
Array[File] aln_metrics_data

Array[File] dedup_metrics_header
Array[File] dedup_metrics_data

Array[File] is_metrics_header
Array[File] is_metrics_data

Array[File] deduped_coverage_header
Array[File] deduped_coverage_data

String docker
String cluster_config
String disk_size

command <<<
set -o pipefail
set -e
cat ${sep=" " aln_metrics_header} | sed -n '1,1p' | cat - ${sep=" " aln_metrics_data} > aln_metrics.txt
cat ${sep=" " dedup_metrics_header} | sed -n '1,1p' | cat - ${sep=" " dedup_metrics_data} > dedup_metrics.txt
cat ${sep=" " is_metrics_header} | sed -n '1,1p' | cat - ${sep=" " is_metrics_data} > is_metrics.txt
cat ${sep=" " deduped_coverage_header} | sed -n '1,1p' | cat - ${sep=" " deduped_coverage_data} > deduped_coverage.txt
>>>

runtime {
docker:docker
cluster:cluster_config
systemDisk:"cloud_ssd 40"
dataDisk:"cloud_ssd " + disk_size + " /cromwell_root/"
}

output {
File aln_metrics_merge = "aln_metrics.txt"
File dedup_metrics_merge = "dedup_metrics.txt"
File is_metrics_merge = "is_metrics.txt"
File deduped_coverage_merge = "deduped_coverage.txt"
}
}

+ 58
- 0
tasks/merge_family.wdl Vedi File

@@ -0,0 +1,58 @@
task merge_family {
File LCL5_vcf_gz
File LCL5_vcf_idx
File LCL6_vcf_gz
File LCL6_vcf_idx
File LCL7_vcf_gz
File LCL7_vcf_idx
File LCL8_vcf_gz
File LCL8_vcf_idx
File bed
String project
String docker
String cluster_config
String disk_size
command <<<

/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg vcfmerge --force-merge-all -o ${project}.family.vcf.gz ${LCL5_vcf_gz} ${LCL6_vcf_gz} ${LCL7_vcf_gz} ${LCL8_vcf_gz}

gunzip ${project}.family.vcf.gz


/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg vcffilter -i ${LCL5_vcf_gz} --include-bed=${bed} -o LCL5.inside.vcf.gz
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg vcffilter -i ${LCL6_vcf_gz} --include-bed=${bed} -o LCL6.inside.vcf.gz
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg vcffilter -i ${LCL7_vcf_gz} --include-bed=${bed} -o LCL7.inside.vcf.gz
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg vcffilter -i ${LCL8_vcf_gz} --include-bed=${bed} -o LCL8.inside.vcf.gz

/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg vcfmerge --force-merge-all -o ${project}.family.inside.vcf.gz LCL5.inside.vcf.gz LCL6.inside.vcf.gz LCL7.inside.vcf.gz LCL8.inside.vcf.gz

gunzip ${project}.family.inside.vcf.gz

/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg vcffilter -i ${LCL5_vcf_gz} --exclude-bed=${bed} -o LCL5.outside.vcf.gz
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg vcffilter -i ${LCL6_vcf_gz} --exclude-bed=${bed} -o LCL6.outside.vcf.gz
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg vcffilter -i ${LCL7_vcf_gz} --exclude-bed=${bed} -o LCL7.outside.vcf.gz
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg vcffilter -i ${LCL8_vcf_gz} --exclude-bed=${bed} -o LCL8.outside.vcf.gz

/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg vcfmerge --force-merge-all -o ${project}.family.outside.vcf.gz LCL5.outside.vcf.gz LCL6.outside.vcf.gz LCL7.outside.vcf.gz LCL8.outside.vcf.gz

gunzip ${project}.family.outside.vcf.gz



>>>

runtime {
docker:docker
cluster: cluster_config
systemDisk: "cloud_ssd 40"
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
}
output {
File merged_vcf = "${project}.family.vcf"
File outside_vcf = "${project}.family.outside.vcf"
File inside_vcf = "${project}.family.inside.vcf"
}
}



+ 72
- 0
tasks/merge_mendelian.wdl Vedi File

@@ -0,0 +1,72 @@
task merge_mendelian {
File D5_trio_vcf
File D6_trio_vcf
File family_vcf
File D5_trio_vcf_inside
File D6_trio_vcf_inside
File family_vcf_inside
File D5_trio_vcf_outside
File D6_trio_vcf_outside
File family_vcf_outside
String family_name = basename(family_vcf,".family.vcf")
String docker
String cluster_config
String disk_size

command <<<
cat ${D5_trio_vcf} | grep -v '##' > ${family_name}.D5.txt
cat ${D6_trio_vcf} | grep -v '##' > ${family_name}.D6.txt
cat ${family_vcf} | grep -v '##' | awk '
BEGIN { OFS = "\t" }
NF > 2 && FNR > 1 {
for ( i=9; i<=NF; i++ ) {
split($i,a,":") ;$i = a[1];
}
}
{ print }
' > ${family_name}.consensus.txt
python /opt/merge_two_family_with_genotype.py -LCL5 ${family_name}.D5.txt -LCL6 ${family_name}.D6.txt -genotype ${family_name}.consensus.txt -family ${family_name}

cat ${D5_trio_vcf_inside} | grep -v '##' > ${family_name}.D5.inside.txt
cat ${D6_trio_vcf_inside} | grep -v '##' > ${family_name}.D6.inside.txt
cat ${family_vcf_inside} | grep -v '##' | awk '
BEGIN { OFS = "\t" }
NF > 2 && FNR > 1 {
for ( i=9; i<=NF; i++ ) {
split($i,a,":") ;$i = a[1];
}
}
{ print }
' > ${family_name}.consensus.inside.txt
python /opt/merge_two_family_with_genotype.py -LCL5 ${family_name}.D5.inside.txt -LCL6 ${family_name}.D6.inside.txt -genotype ${family_name}.consensus.inside.txt -family ${family_name}.inside

cat ${D5_trio_vcf_outside} | grep -v '##' > ${family_name}.D5.outside.txt
cat ${D6_trio_vcf_outside} | grep -v '##' > ${family_name}.D6.outside.txt
cat ${family_vcf_outside} | grep -v '##' | awk '
BEGIN { OFS = "\t" }
NF > 2 && FNR > 1 {
for ( i=9; i<=NF; i++ ) {
split($i,a,":") ;$i = a[1];
}
}
{ print }
' > ${family_name}.consensus.outside.txt
python /opt/merge_two_family_with_genotype.py -LCL5 ${family_name}.D5.outside.txt -LCL6 ${family_name}.D6.outside.txt -genotype ${family_name}.consensus.outside.txt -family ${family_name}.outside
>>>

runtime {
docker:docker
cluster: cluster_config
systemDisk: "cloud_ssd 40"
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
}
output {
File project_mendelian = "${family_name}.txt"
File project_mendelian_summary = "${family_name}.summary.txt"
File project_mendelian_inside = "${family_name}.inside.txt"
File project_mendelian_summary_inside = "${family_name}.inside.summary.txt"
File project_mendelian_outside = "${family_name}.outside.txt"
File project_mendelian_summary_outside = "${family_name}.outside.summary.txt"
}
}

+ 56
- 0
tasks/multiqc.wdl Vedi File

@@ -0,0 +1,56 @@
task multiqc {

Array[File] read1_zip
Array[File] read2_zip

Array[File] txt1
Array[File] txt2

Array[File] zip
Array[File] summary

String docker
String cluster_config
String disk_size

command <<<
set -o pipefail
set -e
mkdir -p /cromwell_root/tmp/fastqc
mkdir -p /cromwell_root/tmp/fastqscreen
mkdir -p /cromwell_root/tmp/bamqc
mkdir -p /cromwell_root/tmp/benchmark

cp ${sep=" " summary} /cromwell_root/tmp/benchmark
cp ${sep=" " read1_zip} ${sep=" " read2_zip} /cromwell_root/tmp/fastqc
cp ${sep=" " txt1} ${sep=" " txt2} /cromwell_root/tmp/fastqscreen
for i in ${sep=" " zip}
do
tar -zxvf $i -C /cromwell_root/tmp/bamqc
done

multiqc /cromwell_root/tmp/

cat multiqc_data/multiqc_general_stats.txt > multiqc_general_stats.txt
cat multiqc_data/multiqc_fastqc.txt > multiqc_fastqc.txt
cat multiqc_data/multiqc_fastq_screen.txt > multiqc_fastq_screen.txt
cat multiqc_data/multiqc_happy_data.json > multiqc_happy_data.json
>>>

runtime {
docker:docker
cluster:cluster_config
systemDisk:"cloud_ssd 40"
dataDisk:"cloud_ssd " + disk_size + " /cromwell_root/"
}

output {
File multiqc_html = "multiqc_report.html"
Array[File] multiqc_txt = glob("multiqc_data/*")
File fastqc_qualimap = "multiqc_general_stats.txt"
File fastqc = "multiqc_fastqc.txt"
File fastqscreen = "multiqc_fastq_screen.txt"
File hap = "multiqc_happy_data.json"
}
}

+ 27
- 0
tasks/qualimap.wdl Vedi File

@@ -0,0 +1,27 @@
task qualimap {
File bam
File bai
String bamname = basename(bam,".bam")
String docker
String cluster_config
String disk_size

command <<<
set -o pipefail
set -e
nt=$(nproc)
/opt/qualimap/qualimap bamqc -bam ${bam} -outformat PDF:HTML -nt $nt -outdir ${bamname} --java-mem-size=60G
tar -zcvf ${bamname}_qualimap.zip ${bamname}
>>>

runtime {
docker:docker
cluster:cluster_config
systemDisk:"cloud_ssd 40"
dataDisk:"cloud_ssd " + disk_size + " /cromwell_root/"
}

output {
File zip = "${bamname}_qualimap.zip"
}
}

+ 26
- 0
tasks/quartet_mendelian.wdl Vedi File

@@ -0,0 +1,26 @@
task quartet_mendelian {
Array[File] project_mendelian_summary
String project
String docker
String cluster_config
String disk_size

command <<<
for i in ${sep=" " project_mendelian_summary}
do
cat $i | sed -n '2,3p' >> mendelian.summary
done
sed '1i\Family\tReproducibility_D5_D6\tMendelian_Concordance_Quartet' mendelian.summary > ${project}.mendelian.txt

>>>

runtime {
docker:docker
cluster:cluster_config
systemDisk:"cloud_ssd 40"
dataDisk:"cloud_ssd " + disk_size + " /cromwell_root/"
}
output {
File mendelian_summary="${project}.mendelian.txt"
}
}

+ 41
- 0
tasks/sentieon.wdl Vedi File

@@ -0,0 +1,41 @@
task sentieon {
File aln_metrics
File dedup_metrics
File is_metrics
File deduped_coverage
String sample_name
String docker
String cluster_config
String disk_size

command <<<
set -o pipefail
set -e
cat ${aln_metrics} | sed -n '2,2p' > aln_metrics.header
cat ${aln_metrics} | sed -n '5,5p' > ${sample_name}.aln_metrics
cat ${dedup_metrics} | sed -n '2,2p' > dedup_metrics.header
cat ${dedup_metrics} | sed -n '3,3p' > ${sample_name}.dedup_metrics
cat ${is_metrics} | sed -n '2,2p' > is_metrics.header
cat ${is_metrics} | sed -n '3,3p' > ${sample_name}.is_metrics
cat ${deduped_coverage} | sed -n '1,1p' > deduped_coverage.header
cat ${deduped_coverage} | sed -n '2,2p' > ${sample_name}.deduped_coverage
>>>

runtime {
docker:docker
cluster:cluster_config
systemDisk:"cloud_ssd 40"
dataDisk:"cloud_ssd " + disk_size + " /cromwell_root/"
}

output {
File aln_metrics_header = "aln_metrics.header"
File aln_metrics_data = "${sample_name}.aln_metrics"
File dedup_metrics_header = "dedup_metrics.header"
File dedup_metrics_data = "${sample_name}.dedup_metrics"
File is_metrics_header = "is_metrics.header"
File is_metrics_data = "${sample_name}.is_metrics"
File deduped_coverage_header = "deduped_coverage.header"
File deduped_coverage_data = "${sample_name}.deduped_coverage"
}
}

+ 50
- 0
tasks/split_gvcf_files.wdl Vedi File

@@ -0,0 +1,50 @@
task split_gvcf_files {
File merged_gvcf
String project
String docker
String cluster_config
String disk_size
command <<<

cat ${merged_gvcf} | grep '#CHROM' | sed s'/\t/\n/g' > name

ncol=`cat name | wc -l`

sed -i '1,9d' name

cat ${merged_gvcf} | grep '#' > header
cat ${merged_gvcf} | grep -v '#' > body
cat body | grep -w '^chr1\|^chr2\|^chr3\|^chr4\|^chr5\|^chr6\|^chr7\|^chr8\|^chr9\|^chr10\|^chr11\|^chr12\|^chr13\|^chr14\|^chr15\|^chr16\|^chr17\|^chr18\|^chr19\|^chr20\|^chr21\|^chr22\|^chrX' > body.filtered
cat header body.filtered > ${project}.filtered.g.vcf

for i in $(seq 10 $ncol); do cat ${project}.filtered.g.vcf | cut -f1-9,$i > $i.splited.vcf; done

ls *splited.vcf | sort -n | paste - name > rename

cat rename | while read a b
do
mv $a $b.splited.vcf
sample=$(echo $b | cut -f6 -d_)
rep=$(echo $b | cut -f7 -d_)
echo $sample >> quartet_sample
echo $rep >> quartet_rep
done

python /opt/how_many_samples.py -sample quartet_sample -rep quartet_rep


>>>

runtime {
docker:docker
cluster: cluster_config
systemDisk: "cloud_ssd 40"
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
}
output {
Array[File] splited_vcf = glob("*.splited.vcf")
File sister_tag = "sister_tag"
File quartet_tag = "quartet_tag"
}
}

+ 25
- 0
tasks/vcfstat.wdl Vedi File

@@ -0,0 +1,25 @@
task vcfstat {
File rtg_vcf
File rtg_vcf_index
String sample_name
String docker
String cluster_config
String disk_size

command <<<
set -o pipefail
set -e

/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg vcfstats ${rtg_vcf} > ${sample_name}.stats.txt
>>>

runtime {
docker:docker
cluster:cluster_config
systemDisk:"cloud_ssd 40"
dataDisk:"cloud_ssd " + disk_size + " /cromwell_root/"
}
output {
File vcfnumber="${sample_name}.stats.txt"
}
}

+ 127
- 0
workflow.wdl Vedi File

@@ -0,0 +1,127 @@
import "./tasks/benchmark.wdl" as benchmark
import "./tasks/mendelian.wdl" as mendelian
import "./tasks/merge_mendelian.wdl" as merge_mendelian
import "./tasks/merge_family.wdl" as merge_family


workflow {{ project_name }} {

File LCL5
File LCL6
File LCL7
File LCL8

String BENCHMARKdocker
String MENDELIANdocker
String DIYdocker


String fasta
File ref_dir
File sdf
File contig
File bed

File benchmarking_dir

String project

String disk_size
String BIGcluster_config
String SMALLcluster_config

call benchmark.benchmark as LCL5benchmark {
input:
vcf=LCL5,
benchmarking_dir=benchmarking_dir,
sdf=sdf,
contig=contig,
project=project,
docker=BENCHMARKdocker,
cluster_config=BIGcluster_config,
disk_size=disk_size,
}

call benchmark.benchmark as LCL6benchmark {
input:
vcf=LCL6,
benchmarking_dir=benchmarking_dir,
sdf=sdf,
contig=contig,
project=project,
docker=BENCHMARKdocker,
cluster_config=BIGcluster_config,
disk_size=disk_size,
}

call benchmark.benchmark as LCL7benchmark {
input:
vcf=LCL7,
benchmarking_dir=benchmarking_dir,
sdf=sdf,
contig=contig,
project=project,
docker=BENCHMARKdocker,
cluster_config=BIGcluster_config,
disk_size=disk_size,
}

call benchmark.benchmark as LCL8benchmark {
input:
vcf=LCL8,
benchmarking_dir=benchmarking_dir,
sdf=sdf,
contig=contig,
project=project,
docker=BENCHMARKdocker,
cluster_config=BIGcluster_config,
disk_size=disk_size,
}

call merge_family.merge_family as merge_family {
input:
LCL5_vcf_gz=LCL5benchmark.rtg_vcf,
LCL5_vcf_idx=LCL5benchmark.rtg_vcf_index,
LCL6_vcf_gz=LCL6benchmark.rtg_vcf,
LCL6_vcf_idx=LCL6benchmark.rtg_vcf_index,
LCL7_vcf_gz=LCL7benchmark.rtg_vcf,
LCL7_vcf_idx=LCL7benchmark.rtg_vcf_index,
LCL8_vcf_gz=LCL8benchmark.rtg_vcf,
LCL8_vcf_idx=LCL8benchmark.rtg_vcf,
bed=bed,
project=project,
docker=BENCHMARKdocker,
cluster_config=BIGcluster_config,
disk_size=disk_size,
}

call mendelian.mendelian as mendelian {
input:
family_vcf=merge_family.merged_vcf,
family_vcf_inside=merge_family.inside_vcf,
family_vcf_outside=merge_family.outside_vcf,
ref_dir=ref_dir,
fasta=fasta,
docker=MENDELIANdocker,
cluster_config=BIGcluster_config,
disk_size=disk_size
}

call merge_mendelian.merge_mendelian as merge_mendelian {
input:
D5_trio_vcf=mendelian.D5_trio_vcf,
D6_trio_vcf=mendelian.D6_trio_vcf,
family_vcf=merge_family.merged_vcf,
D5_trio_vcf_inside=mendelian.D5_trio_vcf_inside,
D6_trio_vcf_inside=mendelian.D6_trio_vcf_inside,
family_vcf_inside=merge_family.inside_vcf,
D5_trio_vcf_outside=mendelian.D5_trio_vcf_outside,
D6_trio_vcf_outside=mendelian.D6_trio_vcf_outside,
family_vcf_outside=merge_family.outside_vcf,
docker=DIYdocker,
cluster_config=SMALLcluster_config,
disk_size=disk_size
}
}



Loading…
Annulla
Salva