
extract table

master
LUYAO REN 4 years ago
parent commit 8682e0ca4c
5 changed files with 65 additions and 23 deletions
  1. codescripts/extract_tables.py (+55 -14)
  2. codescripts/merge_two_family_with_genotype.py (+2 -2)
  3. tasks/extract_tables.wdl (+6 -5)
  4. tasks/merge_sentieon_metrics.wdl (+0 -1)
  5. workflow.wdl (+2 -1)

codescripts/extract_tables.py (+55 -14)

import json
import pandas as pd
+from functools import reduce
import sys, argparse, os


parser = argparse.ArgumentParser(description="This script is to get information from multiqc and sentieon, output the raw fastq, bam and variants calling (precision and recall) quality metrics")
parser.add_argument('-fastqscreen', '--fastqscreen', type=str, help='multiqc_fastq_screen.txt', required=True)
parser.add_argument('-hap', '--happy', type=str, help='multiqc_happy_data.json', required=True)


+parser.add_argument('-project', '--project_name', type=str, help='project_name', required=True)

args = parser.parse_args()


# Rename input:
fastqscreen_file = args.fastqscreen
hap_file = args.happy


+project_name = args.project_name

#############################################
# fastqc
fastqc = pd.read_table(fastqc_file)




############################
-dat = pd.read_table(aln_metrics_file)
+dat = pd.read_table(aln_metrics_file,index_col=False)
dat['PCT_ALIGNED_READS'] = dat["PF_READS_ALIGNED"]/dat["TOTAL_READS"]
aln_metrics = dat[["Sample", "PCT_ALIGNED_READS","PF_MISMATCH_RATE"]]
aln_metrics = aln_metrics * 100
+aln_metrics['Sample'] = [x[-1] for x in aln_metrics['Sample'].str.split('/')]


-dat = pd.read_table(is_metrics_file)
+dat = pd.read_table(is_metrics_file,index_col=False)
is_metrics = dat[['Sample', 'MEDIAN_INSERT_SIZE']]
+is_metrics['Sample'] = [x[-1] for x in is_metrics['Sample'].str.split('/')]


-dat = pd.read_table(quality_yield_file)
+dat = pd.read_table(quality_yield_file,index_col=False)
dat['%Q20'] = dat['Q20_BASES']/dat['TOTAL_BASES']
dat['%Q30'] = dat['Q30_BASES']/dat['TOTAL_BASES']
quality_yield = dat[['Sample','%Q20','%Q30']]
quality_yield = quality_yield * 100
+quality_yield['Sample'] = [x[-1] for x in quality_yield['Sample'].str.split('/')]


-dat = pd.read_table(wgs_metrics_file)
+dat = pd.read_table(wgs_metrics_file,index_col=False)
wgs_metrics = dat[['Sample','MEDIAN_COVERAGE','PCT_1X', 'PCT_5X', 'PCT_10X','PCT_30X']]
wgs_metrics['PCT_1X'] = wgs_metrics['PCT_1X'] * 100
wgs_metrics['PCT_5X'] = wgs_metrics['PCT_5X'] * 100
wgs_metrics['PCT_10X'] = wgs_metrics['PCT_10X'] * 100
wgs_metrics['PCT_30X'] = wgs_metrics['PCT_30X'] * 100
+wgs_metrics['Sample'] = [x[-1] for x in wgs_metrics['Sample'].str.split('/')]


data_frames = [aln_metrics, is_metrics, quality_yield, wgs_metrics]
post_alignment_dat = reduce(lambda left,right: pd.merge(left,right,on=['Sample'],how='outer'), data_frames)
+post_alignment_dat.columns = ['Sample', '%Mapping', '%Mismatch Rate', 'Median Insert Size','%Q20', '%Q30', 'Median Coverage', 'PCT_1X', 'PCT_5X', 'PCT_10X','PCT_30X']
+post_alignment_dat = post_alignment_dat.round(2)
+post_alignment_dat.to_csv('post_alignment.txt',sep="\t",index=0)
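The post-alignment table is stitched together by folding pd.merge over the per-metric frames with functools.reduce, using an outer join on Sample so a sample missing from one metrics file still keeps a row. A minimal, self-contained sketch of that pattern (the frames and values below are toy stand-ins, not the real Sentieon tables):

from functools import reduce

import pandas as pd

# Toy per-metric tables keyed on 'Sample'.
aln = pd.DataFrame({'Sample': ['s1', 's2'], '%Mapping': [99.1, 98.7]})
ins = pd.DataFrame({'Sample': ['s1', 's2'], 'Median Insert Size': [412, 398]})
cov = pd.DataFrame({'Sample': ['s1'], 'Median Coverage': [31]})  # s2 missing on purpose

frames = [aln, ins, cov]
# Outer merge keeps every sample seen in any table; absent metrics become NaN.
merged = reduce(lambda left, right: pd.merge(left, right, on=['Sample'], how='outer'), frames)
print(merged)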



-# benchmark

#########################################
+# variants calling
with open(hap_file) as hap_json:
    happy = json.load(hap_json)
dat = pd.DataFrame.from_records(happy)
dat = dat.loc[:, dat.columns.str.endswith('ALL')]
dat_transposed = dat.T
-benchmark = dat_transposed.loc[:,['sample_id','METRIC.Precision','METRIC.Recall']]
-benchmark.columns = ['Sample','Precision','Recall']

-#output
-fastqc_all.to_csv('fastqc.final.result.txt',sep="\t",index=0)
-fastqscreen.to_csv('fastqscreen.final.result.txt',sep="\t",index=0)
-qualimap_stat.to_csv('qualimap.final.result.txt',sep="\t",index=0)
-benchmark.to_csv('benchmark.final.result.txt',sep="\t",index=0)

+dat_transposed = dat_transposed.loc[:,['sample_id','QUERY.TOTAL','METRIC.Precision','METRIC.Recall']]
+indel = dat_transposed[['INDEL' in s for s in dat_transposed.index]]
+snv = dat_transposed[['SNP' in s for s in dat_transposed.index]]
+indel.reset_index(drop=True, inplace=True)
+snv.reset_index(drop=True, inplace=True)
+benchmark = pd.concat([snv, indel], axis=1)
+benchmark = benchmark[["sample_id", 'QUERY.TOTAL', 'METRIC.Precision', 'METRIC.Recall']]
+benchmark.columns = ['Sample','sample_id','SNV number','INDEL number','SNV precision','INDEL precision','SNV recall','INDEL recall']
+benchmark = benchmark[['Sample','SNV number','INDEL number','SNV precision','INDEL precision','SNV recall','INDEL recall']]
+benchmark['SNV precision'] = benchmark['SNV precision'].astype(float)
+benchmark['INDEL precision'] = benchmark['INDEL precision'].astype(float)
+benchmark['SNV recall'] = benchmark['SNV recall'].astype(float)
+benchmark['INDEL recall'] = benchmark['INDEL recall'].astype(float)
+benchmark['SNV precision'] = benchmark['SNV precision'] * 100
+benchmark['INDEL precision'] = benchmark['INDEL precision'] * 100
+benchmark['SNV recall'] = benchmark['SNV recall'] * 100
+benchmark['INDEL recall'] = benchmark['INDEL recall'] * 100
+benchmark = benchmark.round(2)
+benchmark.to_csv('variants.calling.qc.txt',sep="\t",index=0)
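The block above reshapes MultiQC's hap.py JSON: keep only columns whose names end in ALL, transpose so each row is one sample-by-variant-type record, then line the SNP and INDEL rows up side by side before renaming. A hedged sketch on a made-up payload (the field names follow the hap.py keys used above, the values are invented, and plain pd.DataFrame is used for the dict-of-dicts load):

import pandas as pd

# Made-up stand-in for multiqc_happy_data.json: one SNP and one INDEL
# record per sample, keyed the way hap.py/MultiQC name them.
happy = {
    'sample1 SNP ALL': {'sample_id': 'sample1', 'QUERY.TOTAL': 4000000,
                        'METRIC.Precision': 0.9990, 'METRIC.Recall': 0.9985},
    'sample1 INDEL ALL': {'sample_id': 'sample1', 'QUERY.TOTAL': 800000,
                          'METRIC.Precision': 0.9900, 'METRIC.Recall': 0.9850},
}
dat = pd.DataFrame(happy)                        # columns = record names
dat = dat.loc[:, dat.columns.str.endswith('ALL')]
dat_t = dat.T                                    # one row per sample/type record
snv = dat_t[['SNP' in s for s in dat_t.index]].reset_index(drop=True)
indel = dat_t[['INDEL' in s for s in dat_t.index]].reset_index(drop=True)
side_by_side = pd.concat([snv, indel], axis=1)   # duplicated column names: SNV first, then INDEL
print(side_by_side)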

+all_rep = [x.split('_')[6] for x in benchmark['Sample']]
+rep = list(set(all_rep))
+columns = ['Family','Average Precision','Average Recall','Precision SD','Recall SD']
+df_ = pd.DataFrame(columns=columns)
+for i in rep:
+    string = "_" + i + "_"
+    sub_dat = benchmark[benchmark['Sample'].str.contains(string)]
+    mean = list(sub_dat.mean(axis = 0, skipna = True))
+    sd = list(sub_dat.std(axis = 0, skipna = True))
+    family_name = project_name + "." + i + ".SNV"
+    df_ = df_.append({'Family': family_name, 'Average Precision': mean[0], 'Average Recall': mean[2], 'Precision SD': sd[0], 'Recall SD': sd[2] }, ignore_index=True)
+    family_name = project_name + "." + i + ".INDEL"
+    df_ = df_.append({'Family': family_name, 'Average Precision': mean[1], 'Average Recall': mean[3], 'Precision SD': sd[1], 'Recall SD': sd[3] }, ignore_index=True)
+df_ = df_.round(2)
+df_.to_csv('precision.recall.txt',sep="\t",index=0)
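The final loop buckets samples by the replicate token at underscore-field 6 of the sample name, then reports per-bucket means and standard deviations. A small sketch of that grouping step, with invented Quartet-style sample names that put the replicate tag in the same field:

import pandas as pd

# Toy table; the invented sample names place the replicate tag in the
# 7th underscore-separated field, mirroring x.split('_')[6] above.
benchmark = pd.DataFrame({
    'Sample': ['Quartet_DNA_ILM_XTen_LAB1_LCL5_1_20180518',
               'Quartet_DNA_ILM_XTen_LAB1_LCL6_1_20180518',
               'Quartet_DNA_ILM_XTen_LAB1_LCL5_2_20180518',
               'Quartet_DNA_ILM_XTen_LAB1_LCL6_2_20180518'],
    'SNV precision': [99.9, 99.8, 99.7, 99.6],
})
for rep in sorted({s.split('_')[6] for s in benchmark['Sample']}):
    tag = '_' + rep + '_'                                # e.g. '_1_'
    sub = benchmark[benchmark['Sample'].str.contains(tag)]
    print(rep, round(sub['SNV precision'].mean(), 2), round(sub['SNV precision'].std(), 2))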







codescripts/merge_two_family_with_genotype.py (+2 -2)

snv_quartet = snv_family_mendelian/snv_family_all
indel_quartet = indel_family_mendelian/indel_family_all
outcolumn = 'Family\tReproducibility_D5_D6\tMendelian_Concordance_Quartet\n'
-indel_outResult = family + '_INDEL' + '\t' + str(indel_sister) + '\t' + str(indel_quartet) + '\n'
-snv_outResult = family + '_SNV' + '\t' + str(snv_sister) + '\t' + str(snv_quartet) + '\n'
+indel_outResult = family + '.INDEL' + '\t' + str(indel_sister) + '\t' + str(indel_quartet) + '\n'
+snv_outResult = family + '.SNV' + '\t' + str(snv_sister) + '\t' + str(snv_quartet) + '\n'
summary_file.write(outcolumn)
summary_file.write(indel_outResult)
summary_file.write(snv_outResult)
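These rows report each family's D5/D6 reproducibility and quartet Mendelian concordance; the dot-separated suffixes presumably align the row labels with the Family names written to precision.recall.txt above. For reference, a runnable sketch of the write-out with invented counts and an invented output file name (the real values are computed earlier in the script):

# Invented stand-ins: these counts, rates, and the family label are
# computed elsewhere in the real script.
family = 'LCL5LCL6.1'
snv_family_mendelian, snv_family_all = 3900000, 4000000
indel_family_mendelian, indel_family_all = 700000, 800000
snv_sister, indel_sister = 0.98, 0.95

snv_quartet = snv_family_mendelian / snv_family_all
indel_quartet = indel_family_mendelian / indel_family_all

with open('mendelian.summary.txt', 'w') as summary_file:  # hypothetical file name
    summary_file.write('Family\tReproducibility_D5_D6\tMendelian_Concordance_Quartet\n')
    summary_file.write(family + '.INDEL\t' + str(indel_sister) + '\t' + str(indel_quartet) + '\n')
    summary_file.write(family + '.SNV\t' + str(snv_sister) + '\t' + str(snv_quartet) + '\n')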

tasks/extract_tables.wdl (+6 -5)

File fastqscreen
File hap


+String project
String docker
String cluster_config
String disk_size


command <<<


+python /opt/extract_tables.py -quality ${quality_yield_summary} -depth ${wgs_metrics_summary} -aln ${aln_metrics_summary} -is ${is_metrics_summary} -fastqc ${fastqc} -fastqscreen ${fastqscreen} -hap ${hap} -project ${project}


>>>


}


output {
-File fastqc_result = "fastqc.final.result.txt"
-File fastqscreen_result = "fastqscreen.final.result.txt"
-File qualimap_result = "qualimap.final.result.txt"
-File hap_result = "benchmark.final.result.txt"
+File pre_alignment = "pre_alignment.txt"
+File post_alignment = "post_alignment.txt"
+File variant_calling = "variants.calling.qc.txt"
+File precision_recall = "precision.recall.txt"
}
}
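The task shells out to extract_tables.py with eight flags. A sketch of the matching argparse interface, inferring the flags not shown in the Python diff above from this command line (destination names and help text are guesses, not the script's real interface):

import argparse

# Flag set inferred from the WDL command line above.
parser = argparse.ArgumentParser(description='collect fastq/bam/variant-calling QC tables')
for flag, dest in [('-quality', 'quality_yield_summary'),
                   ('-depth', 'wgs_metrics_summary'),
                   ('-aln', 'aln_metrics_summary'),
                   ('-is', 'is_metrics_summary'),
                   ('-fastqc', 'fastqc'),
                   ('-fastqscreen', 'fastqscreen'),
                   ('-hap', 'happy'),
                   ('-project', 'project_name')]:
    parser.add_argument(flag, dest=dest, type=str, required=True)

args = parser.parse_args()
print(args.project_name)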

tasks/merge_sentieon_metrics.wdl (+0 -1)

cat ${sep=" " aln_metrics_header} | sed -n '1,1p' | cat - ${sep=" " aln_metrics_data} > aln_metrics_all
ls ${sep=" " aln_metrics_data} | cut -d '.' -f1 | cat sample_column - | paste - aln_metrics_all > ${project}.aln_metrics.txt



cat ${sep=" " is_metrics_header} | sed -n '1,1p' | cat - ${sep=" " is_metrics_data} > is_metrics_all
ls ${sep=" " is_metrics_data} | cut -d '.' -f1 | cat sample_column - | paste - is_metrics_all > ${project}.is_metrics.txt
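This shell fragment rebuilds one table per metric: keep the first header line, append every per-sample data row, then paste in a Sample column cut from the data file names. A rough pandas equivalent of the aln_metrics branch, with invented file names and assuming one metrics row per data file (as the paste pipeline does):

import pandas as pd

# Invented local file names standing in for the WDL-localized inputs.
header_file = 'header.aln_metrics.txt'   # its first line holds the column names
data_files = ['sample1.aln_metrics.txt', 'sample2.aln_metrics.txt']

with open(header_file) as fh:
    columns = fh.readline().rstrip('\n').split('\t')
# One row per file; concatenation mirrors `cat - data_files`.
parts = [pd.read_table(f, names=columns) for f in data_files]
merged = pd.concat(parts, ignore_index=True)
# Sample column from the file-name stem, like `cut -d '.' -f1`.
merged.insert(0, 'Sample', [f.split('.')[0] for f in data_files])
merged.to_csv('project.aln_metrics.txt', sep='\t', index=False)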



workflow.wdl (+2 -1)

import "./tasks/benchmark.wdl" as benchmark import "./tasks/benchmark.wdl" as benchmark
import "./tasks/multiqc.wdl" as multiqc import "./tasks/multiqc.wdl" as multiqc
import "./tasks/merge_sentieon_metrics.wdl" as merge_sentieon_metrics import "./tasks/merge_sentieon_metrics.wdl" as merge_sentieon_metrics
import "./tasks/extract_tables.wdl" as extract_tables
import "./tasks/mendelian.wdl" as mendelian import "./tasks/mendelian.wdl" as mendelian
import "./tasks/merge_mendelian.wdl" as merge_mendelian import "./tasks/merge_mendelian.wdl" as merge_mendelian
import "./tasks/quartet_mendelian.wdl" as quartet_mendelian import "./tasks/quartet_mendelian.wdl" as quartet_mendelian
import "./tasks/fastqc.wdl" as fastqc import "./tasks/fastqc.wdl" as fastqc
import "./tasks/fastqscreen.wdl" as fastqscreen import "./tasks/fastqscreen.wdl" as fastqscreen
import "./tasks/extract_tables.wdl" as extract_tables
import "./tasks/D5_D6.wdl" as D5_D6 import "./tasks/D5_D6.wdl" as D5_D6
import "./tasks/merge_family.wdl" as merge_family import "./tasks/merge_family.wdl" as merge_family


fastqc=multiqc.fastqc,
fastqscreen=multiqc.fastqscreen,
hap=multiqc.hap,
+project=project,
docker=MULTIQCdocker,
cluster_config=SMALLcluster_config,
disk_size=disk_size
