# March 2016
# April 2016: modified Do.HTD: normalization methods added
# July 2016: added PRC computed by precrec package
# July 2016: added Do.HTD.holdout

#*****************************************************************************************#
# libraries and source files to be loaded:
library(PerfMeas)                   ## compute AUROC and precision at fixed recall rate
library(precrec)                    ## compute AUPRC
library(preprocessCore)             ## Qnorm
source("flat.score.norm.R")         ## Maxnorm
source("graph.utils.R")             ## graph utility functions
source("F-hier.R")                  ## compute Kiritchenko-like multi-label F-scores
source("Do.flat.normalization.R")   ## high level function to compute Maxnorm and Qnorm
#*****************************************************************************************#

# High level function to compute hierarchical correction according to HTD algorithm.
# INPUT:
# norm: boolean value: 1. TRUE means that the flat scores matrix has already been normalized according to a normalization method (def);
#                      2. FALSE means that the flat scores matrix has NOT been normalized yet.
# norm.type: this variable can assume three values: MaxNorm, Qnorm, NONE. There are two cases with respect to norm:
#            1. if norm==FALSE, two kinds of normalization are possible: 1. MaxNorm: each score is divided by the max of each class
#                                                                        2. Qnorm: quantile normalization is applied. PreprocessCore library is used.
#            2. if norm==TRUE, set norm.type="NONE" (def);
# flat.file: name of flat scores matrix already normalized or to be normalized according to norm.type (without rda extension);
# ann.file: name of the target labels (without rda extension).
#           It must be an .rda file containing the label matrix of the examples (def: ann.file)
# dag.file: name of the graph that represents the hierarchy of the classes (def dag.file)
# flat.dir: relative path to folder where flat scores matrix (already normalized or to be normalized) is stored (def flat.dir)
# ann.dir: relative path to folder where annotation matrix is stored
# dag.dir: relative path to folder where graph is stored
# flat.norm.dir: 1. if norm=FALSE, relative path where flat normalized scores matrix is stored;
#                2. if norm=TRUE, the flat scores matrix is already normalized, then it is set to NULL (def)
# n.round: number of rounding digits to be applied to the hierarchical scores matrix (def. 3).
#          It's used for choosing the best threshold on the basis of the best F.measure (see f.criterion parameter).
# f.criterion: character. Type of F-measure to be used to select the best F.measure. There are 2 possibilities:
#              1. "F" (default) corresponds to the harmonic mean between the average precision and recall;
#              2. "avF" corresponds to the per-example F-score averaged across all the examples.
# hierScore.dir: relative path to folder where the matrix with the scores of the classes corrected according to HTD algorithm is stored
# macro.dir: relative path to folder where the class-centric measures (i.e. AUC and PxR across classes) are stored
# Fmeas.dir: relative path to folder where example-centric measures (i.e. Precision, Recall, Specificity, F-measure, Accuracy across examples) are stored
# OUTPUT:
# 5 rda files stored in the respective output directory:
#   - Matrix with examples on rows and classes on columns representing the hierarchical scores of the classes computed with HTD algorithm.
#     Stored in hierScore.dir folder
#   - Example-centric measures computed through find.best.f from F-hier.R file. Stored in Fmeas.dir folder
#   - AUC (average and per classes) computed through AUC.single.over.classes from package PerfMeas.
#     Stored in macro.dir
#   - Precision at fixed recall levels (average and per classes) computed through precision.at.multiple.recall.level.over.classes from package PerfMeas.
#     Stored in macro.dir folder
#   - PRC (average and per.class) computed by precrec package. Stored in macro.dir
# NOTES:
#   - flat.file, ann.file, dag.file, flat.dir, ann.dir and dag.dir have self-referential defaults (e.g. flat.file=flat.file),
#     hence they must always be supplied by the caller.
#   - every .rda file is read inside a private environment, so the objects it stores can never overwrite
#     the arguments or the local variables of this function.
#   - the column of the root node is removed (only when it is present) from both the flat scores matrix and the
#     annotation matrix, whatever the value of norm; matrices are never collapsed to vectors (drop=FALSE).
#   - the function is called for its side effects (the rda files written on disk) and returns NULL invisibly.
#     The objects are stored with the names S.htd, AUC.flat, AUC.htd, PXR.flat, PXR.htd, FMM.flat, FMM.htd, PRC.flat, PRC.htd.
Do.HTD <- function( norm=TRUE, norm.type="NONE", flat.file=flat.file, ann.file=ann.file, dag.file=dag.file,
                    flat.dir=flat.dir, ann.dir=ann.dir, dag.dir=dag.dir, flat.norm.dir=NULL, n.round=3,
                    f.criterion="F", hierScore.dir="hierScore.dir/", macro.dir="macro.dir/", Fmeas.dir="Fmeas.dir/" ){

    ## Local helpers ############
    ## returns the object stored in an .rda file; the file is loaded in a private environment
    load.rda <- function(path){
        box <- new.env(parent=emptyenv());
        stored <- load(path, envir=box);
        if(length(stored) == 0){
            stop("no object stored in file: ", path, call.=FALSE);
        }
        if(length(stored) > 1){
            warning("more than one object stored in file: ", path, ". Using '", stored[1], "'", call.=FALSE);
        }
        get(stored[1], envir=box, inherits=FALSE);
    }

    ## removes the column named 'node' from matrix m, if such a column exists.
    ## N.B.: m[,-which(...)] without the guard selects ZERO columns when the node is missing
    drop.column <- function(m, node){
        node.id <- which(colnames(m) == node);
        if(length(node.id) > 0){
            m <- m[, -node.id, drop=FALSE];
        }
        m;
    }

    ## PRC (average and per.class) computed by precrec package, returned in the same format of package PerfMeas
    ## N.B.: if there are some terms with NO annotations, join_labels returns a *stop* message
    prc.over.classes <- function(target, scores){
        labels <- join_labels(target);
        joined.scores <- join_scores(scores);
        res <- evalmod(scores=joined.scores, labels=labels, dsids=seq_len(ncol(scores)), modnames=colnames(scores));
        meas <- auc(res);
        prc <- subset(meas, curvetypes == "PRC");
        per.class <- prc$aucs;
        names(per.class) <- prc$modnames;
        list(average=mean(prc$aucs), per.class=per.class);
    }

    ## same truth value of the comparison norm==TRUE, but a missing value is read as FALSE instead of raising an error
    already.norm <- isTRUE(norm == TRUE);

    ## Loading Data ############
    ## loading hpo dag
    hpo <- load.rda(paste0(dag.dir, dag.file, ".rda"));

    ## root node
    root <- root.node(hpo);

    ## loading flat scores matrix relative to a specific subontology
    if(already.norm){
        S.flat <- load.rda(paste0(flat.dir, flat.file, ".rda"));
        gc();   ## in order to save ram memory..
    }else{
        ## the normalized matrix is written by Do.FLAT.scores.normalization and then read back from flat.norm.dir
        Do.FLAT.scores.normalization(
            norm.type=norm.type, flat.file=flat.file, ann.file=ann.file, dag.file=dag.file,
            flat.dir=flat.dir, ann.dir=ann.dir, dag.dir=dag.dir, flat.norm.dir=flat.norm.dir
        );
        S.flat <- load.rda(paste0(flat.norm.dir, norm.type, ".", flat.file, ".rda"));
    }

    ## removing root node from flat scores matrix if it exists (in both cases, normalized or just normalized)
    S.flat <- drop.column(S.flat, root);

    ## loading annotation matrix
    hpo.ann <- load.rda(paste0(ann.dir, ann.file, ".rda"));
    gc();

    ## removing root node from annotation table if it exists
    ann.no.root <- drop.column(hpo.ann, root);

    ## Computing FLAT Performances
    ## FLAT AUC computed by PerfMeas package
    AUC.flat <- AUC.single.over.classes(ann.no.root, S.flat, hpo, root=root)[c("average","per.class")];

    ## FLAT PxRs computed by PerfMeas package
    PXR.flat <- precision.at.multiple.recall.level.over.classes(ann.no.root, S.flat);

    ## F.measure: Computing Flat Examples-Measures
    FMM.flat <- find.best.f(ann.no.root, S.flat, n.round=n.round, f.criterion=f.criterion, verbose=FALSE);

    ## FLAT PRC computed by precrec package
    PRC.flat <- prc.over.classes(ann.no.root, S.flat);

    ## Hierarchical Top Down Correction ####################
    S.htd <- htd(S.flat, hpo, root);

    ## Computing Hier Performances
    ## Hierarchical AUC (average and per.class) computed by PerfMeas package
    AUC.htd <- AUC.single.over.classes(ann.no.root, S.htd, hpo, root=root)[c("average","per.class")];

    ## Hierarchical PxR at fixed recall levels (average and per.class) computed by PerfMeas package
    PXR.htd <- precision.at.multiple.recall.level.over.classes(ann.no.root, S.htd);

    ## Computing Hierarchical Examples-Measures
    FMM.htd <- find.best.f(ann.no.root, S.htd, n.round=n.round, f.criterion=f.criterion, verbose=FALSE);

    ## Hierarchical PRC (average and per.class) computed by precrec package
    PRC.htd <- prc.over.classes(ann.no.root, S.htd);

    ## Storing Results #########
    ## the file names carry the prefix "<norm.type>." only when the normalization has been done by this function
    out.tag <- if(already.norm) flat.file else paste0(norm.type, ".", flat.file);
    save(S.htd, file=paste0(hierScore.dir, out.tag, ".hierScores.htd.rda"), compress=TRUE);
    save(AUC.flat, AUC.htd, file=paste0(macro.dir, "AUC.", out.tag, ".hierScores.htd.rda"), compress=TRUE);
    save(PXR.flat, PXR.htd, file=paste0(macro.dir, "PXR.", out.tag, ".hierScores.htd.rda"), compress=TRUE);
    save(FMM.flat, FMM.htd, file=paste0(Fmeas.dir, "PCM.", out.tag, ".hierScores.htd.rda"), compress=TRUE);
    save(PRC.flat, PRC.htd, file=paste0(macro.dir, "PRC.", out.tag, ".hierScores.htd.rda"), compress=TRUE);
    invisible(NULL);
}

# High level function to correct the scores with a hierarchy according to HTD algorithm performing a classical holdout procedure
# All input parameters are the same of the function above, except the following:
# ind.test.set: name of the file (without rda extension) storing the vector of integer indices of the examples to be used in the test set.
# ind.dir: relative path to folder where ind.test.set is stored
# NOTES:
#   - ind.test.set is used as a file name: the indices of the test set are read from the file <ind.dir><ind.test.set>.rda
#   - flat.file, ann.file, dag.file, ind.test.set, ind.dir, flat.dir, ann.dir and dag.dir have self-referential defaults
#     (e.g. flat.file=flat.file), hence they must always be supplied by the caller.
#   - every .rda file is read inside a private environment, so the objects it stores can never overwrite
#     the arguments or the local variables of this function.
#   - the column of the root node is removed (only when it is present) from both the flat scores matrix and the
#     annotation matrix, whatever the value of norm.
#   - the flat scores matrix and the annotation matrix are restricted to the rows of the test set keeping the
#     matrix structure (drop=FALSE), so that a test set made of a single example is not collapsed to a vector.
#   - the function is called for its side effects (the rda files written on disk) and returns NULL invisibly.
#     The objects are stored with the names S.htd, AUC.flat, AUC.htd, PXR.flat, PXR.htd, FMM.flat, FMM.htd, PRC.flat, PRC.htd.
Do.HTD.holdout <- function( norm=TRUE, norm.type="NONE", flat.file=flat.file, ann.file=ann.file, dag.file=dag.file,
                            ind.test.set=ind.test.set, ind.dir=ind.dir, flat.dir=flat.dir, ann.dir=ann.dir, dag.dir=dag.dir,
                            flat.norm.dir=NULL, n.round=3, f.criterion="F", hierScore.dir="hierScore.dir/",
                            macro.dir="macro.dir/", Fmeas.dir="Fmeas.dir/" ){

    ## Local helpers ############
    ## returns the object stored in an .rda file; the file is loaded in a private environment
    load.rda <- function(path){
        box <- new.env(parent=emptyenv());
        stored <- load(path, envir=box);
        if(length(stored) == 0){
            stop("no object stored in file: ", path, call.=FALSE);
        }
        if(length(stored) > 1){
            warning("more than one object stored in file: ", path, ". Using '", stored[1], "'", call.=FALSE);
        }
        get(stored[1], envir=box, inherits=FALSE);
    }

    ## removes the column named 'node' from matrix m, if such a column exists.
    ## N.B.: m[,-which(...)] without the guard selects ZERO columns when the node is missing
    drop.column <- function(m, node){
        node.id <- which(colnames(m) == node);
        if(length(node.id) > 0){
            m <- m[, -node.id, drop=FALSE];
        }
        m;
    }

    ## PRC (average and per.class) computed by precrec package, returned in the same format of package PerfMeas
    ## N.B.: if there are some terms with NO annotations, join_labels returns a *stop* message
    prc.over.classes <- function(target, scores){
        labels <- join_labels(target);
        joined.scores <- join_scores(scores);
        res <- evalmod(scores=joined.scores, labels=labels, dsids=seq_len(ncol(scores)), modnames=colnames(scores));
        meas <- auc(res);
        prc <- subset(meas, curvetypes == "PRC");
        per.class <- prc$aucs;
        names(per.class) <- prc$modnames;
        list(average=mean(prc$aucs), per.class=per.class);
    }

    ## same truth value of the comparison norm==TRUE, but a missing value is read as FALSE instead of raising an error
    already.norm <- isTRUE(norm == TRUE);

    ## Loading Data ############
    ## loading examples indices of the test set
    ind.test <- load.rda(paste0(ind.dir, ind.test.set, ".rda"));

    ## loading hpo dag
    hpo <- load.rda(paste0(dag.dir, dag.file, ".rda"));

    ## root node
    root <- root.node(hpo);

    ## loading flat scores matrix relative to a specific subontology
    if(already.norm){
        S.flat <- load.rda(paste0(flat.dir, flat.file, ".rda"));
        gc();   ## in order to save ram memory..
    }else{
        ## the normalized matrix is written by Do.FLAT.scores.normalization and then read back from flat.norm.dir
        Do.FLAT.scores.normalization(
            norm.type=norm.type, flat.file=flat.file, ann.file=ann.file, dag.file=dag.file,
            flat.dir=flat.dir, ann.dir=ann.dir, dag.dir=dag.dir, flat.norm.dir=flat.norm.dir
        );
        S.flat <- load.rda(paste0(flat.norm.dir, norm.type, ".", flat.file, ".rda"));
    }

    ## removing root node from flat scores matrix if it exists (in both cases, normalized or just normalized)
    S.flat <- drop.column(S.flat, root);

    ## shrinking the size of S.flat to the examples of test set
    S.flat <- S.flat[ind.test, , drop=FALSE];

    ## loading annotation matrix
    hpo.ann <- load.rda(paste0(ann.dir, ann.file, ".rda"));
    gc();

    ## shrinking the size of annotation table to the examples of test set and removing root node if it exists
    ann.no.root <- drop.column(hpo.ann[ind.test, , drop=FALSE], root);

    ## Computing FLAT Performances
    ## FLAT AUC computed by PerfMeas package
    AUC.flat <- AUC.single.over.classes(ann.no.root, S.flat, hpo, root=root)[c("average","per.class")];

    ## FLAT PxRs computed by PerfMeas package
    PXR.flat <- precision.at.multiple.recall.level.over.classes(ann.no.root, S.flat);

    ## F.measure: Computing Flat Examples-Measures
    FMM.flat <- find.best.f(ann.no.root, S.flat, n.round=n.round, f.criterion=f.criterion, verbose=FALSE);

    ## FLAT PRC computed by precrec package
    PRC.flat <- prc.over.classes(ann.no.root, S.flat);

    ## Hierarchical Top Down Correction ####################
    S.htd <- htd(S.flat, hpo, root);

    ## Computing Hier Performances
    ## Hierarchical AUC (average and per.class) computed by PerfMeas package
    AUC.htd <- AUC.single.over.classes(ann.no.root, S.htd, hpo, root=root)[c("average","per.class")];

    ## Hierarchical PxR at fixed recall levels (average and per.class) computed by PerfMeas package
    PXR.htd <- precision.at.multiple.recall.level.over.classes(ann.no.root, S.htd);

    ## Computing Hierarchical Examples-Measures
    FMM.htd <- find.best.f(ann.no.root, S.htd, n.round=n.round, f.criterion=f.criterion, verbose=FALSE);

    ## Hierarchical PRC (average and per.class) computed by precrec package
    PRC.htd <- prc.over.classes(ann.no.root, S.htd);

    ## Storing Results #########
    ## the file names carry the prefix "<norm.type>." only when the normalization has been done by this function
    out.tag <- if(already.norm) flat.file else paste0(norm.type, ".", flat.file);
    save(S.htd, file=paste0(hierScore.dir, out.tag, ".hierScores.htd.rda"), compress=TRUE);
    save(AUC.flat, AUC.htd, file=paste0(macro.dir, "AUC.", out.tag, ".hierScores.htd.rda"), compress=TRUE);
    save(PXR.flat, PXR.htd, file=paste0(macro.dir, "PXR.", out.tag, ".hierScores.htd.rda"), compress=TRUE);
    save(FMM.flat, FMM.htd, file=paste0(Fmeas.dir, "PCM.", out.tag, ".hierScores.htd.rda"), compress=TRUE);
    save(PRC.flat, PRC.htd, file=paste0(macro.dir, "PRC.", out.tag, ".hierScores.htd.rda"), compress=TRUE);
    invisible(NULL);
}