|
- import pandas as pd
- from functools import reduce
- import sys, argparse, os
- import glob
-
- parser = argparse.ArgumentParser(description="This script is to get information from multiqc and sentieon, output the raw fastq, bam and variants calling (precision and recall) quality metrics")
-
-
- parser.add_argument('-pre', '--pre_alignment_path', type=str, help='pre-alignment directory')
- parser.add_argument('-post', '--post_alignment_path', type=str, help='post-alignment directory')
- parser.add_argument('-vcf', '--benchmark_path', type=str, help='benchmark directory')
- parser.add_argument('-men', '--mendelian_path', type=str, help='mendelian directory')
-
-
- args = parser.parse_args()
-
-
- # Rename input:
- pre_alignment_path = args.pre_alignment_path
- post_alignment_path = args.post_alignment_path
- benchmark_path = args.benchmark_path
- mendelian_path = args.mendelian_path
-
-
- ###pre-alignment
- pre_alignment_df = pd.concat(map(pd.read_table,glob.glob(os.path.join(pre_alignment_path,'*.txt'))))
-
- pre_alignment_df.to_csv('pre-alignment-all.txt',sep="\t",index=0)
-
- ###post-alignment
-
- post_alignment_df = pd.concat(map(pd.read_table,glob.glob(os.path.join(post_alignment_path,'*.txt'))))
-
- post_alignment_df.to_csv('post-alignment-all.txt',sep="\t",index=0)
-
- ###variant-calling qc
-
- ## detail
- variant_calling_df = pd.concat(map(pd.read_table,glob.glob(os.path.join(benchmark_path,'*.txt'))))
-
- variant_calling_df.to_csv('variant-calling-all.txt',sep="\t",index=0)
-
- ## mean + sd
-
- ####snv
- a = variant_calling_df["SNV precision"].mean().round(2)
- b = variant_calling_df["SNV precision"].std().round(2)
-
- c = variant_calling_df["SNV recall"].mean().round(2)
- d = variant_calling_df["SNV precision"].std().round(2)
-
- variant_calling_df["SNV F1 score"] = 2 * variant_calling_df["SNV precision"] * variant_calling_df["SNV recall"] / (variant_calling_df["SNV precision"] + variant_calling_df["SNV recall"])
-
- e = variant_calling_df["SNV F1 score"].mean().round(2)
- f = variant_calling_df["SNV F1 score"].std().round(2)
-
- #### indel
-
- a2 = variant_calling_df["INDEL precision"].mean().round(2)
- b2 = variant_calling_df["INDEL precision"].std().round(2)
-
- c2 = variant_calling_df["INDEL recall"].mean().round(2)
- d2 = variant_calling_df["INDEL precision"].std().round(2)
-
- variant_calling_df["INDEL F1 score"] = 2 * variant_calling_df["INDEL precision"] * variant_calling_df["INDEL recall"] / (variant_calling_df["INDEL precision"] + variant_calling_df["INDEL recall"])
-
- e2 = variant_calling_df["INDEL F1 score"].mean().round(2)
- f2 = variant_calling_df["INDEL F1 score"].std().round(2)
-
- data = {'precision':[str(a),str(a2)], 'precision_sd':[str(b),str(b2)], 'recall':[str(c),str(c2)], 'recall_sd':[str(d),str(d2)], 'F1-score':[str(e),str(e2)], 'F1-score_sd':[str(f),str(f2)] }
-
- df = pd.DataFrame(data, index =['SNV', 'INDEL'])
-
- df.to_csv('benchmark_summary.txt',sep="\t")
-
-
- ### Mendelian
-
- mendelian_df = pd.concat(map(pd.read_table,glob.glob(os.path.join(mendelian_path,'*.txt'))))
-
- mendelian_df.to_csv('mendelian-all.txt',sep="\t",index=0)
-
- ### snv
- snv = mendelian_df[mendelian_df['Family'].str.contains("SNV")]
-
- indel = mendelian_df[mendelian_df['Family'].str.contains("INDEL")]
-
- g = snv["Mendelian_Concordance_Rate"].mean().round(2)
- h = snv["Mendelian_Concordance_Rate"].std().round(2)
-
- g2 = indel["Mendelian_Concordance_Rate"].mean().round(2)
- h2 = indel["Mendelian_Concordance_Rate"].std().round(2)
-
-
- data = {'MCR':[str(g),str(g2)], 'MCR_sd':[str(h),str(h2)]}
-
- df = pd.DataFrame(data, index =['SNV', 'INDEL'])
-
- df.to_csv('mendelian_summary.txt',sep="\t")
-
-
|