##########################################################################
# Functions to compute Kiritchenko-like multi-label F-scores
# March 2016
##########################################################################

# Generic function for computing precision, recall, specificity and F-measure
# for multiclass multilabel classification.
setGeneric("F.measure.multilabel",
  function(target, predicted, b.per.example = FALSE) standardGeneric("F.measure.multilabel"))

# Method that computes precision, recall, specificity and F-measure for multiclass multilabel classification.
# Both the target and predicted matrices have a number of rows equal to the number of examples
# and a number of columns equal to the number of classes.
# Input:
#   target        : matrix with the target multilabels. 1 entries correspond to positives, 0 to negatives
#   predicted     : matrix with the predicted multilabels. 1 entries correspond to positives, 0 to negatives
#   b.per.example : if TRUE (def: FALSE) the per-example measures are returned together with the averages,
#                   otherwise only the averages are returned.
# Output:
#   if b.per.example == FALSE the function returns a list with a single element:
#     average     : a named vector with the average precision (P), recall (R), specificity (S),
#                   F-measure (F), average F-measure (avF) and accuracy (A) across examples.
#                   F is the harmonic mean between the average precision and the average recall;
#                   avF is the per-example F-measure averaged across examples.
#   otherwise it returns a list with two elements:
#     average     : the same named vector described above
#     per.example : a named matrix with one row per example and columns P (precision),
#                   R (recall / sensitivity), S (specificity), F (F-measure) and A (accuracy).
# Errors:
#   stops if the dimensions of target and predicted differ, if there are no examples (zero rows),
#   or if either matrix contains NA values.
# Note:
#   a ratio whose denominator is 0 (e.g. precision of an example with no predicted positives) is set to 0.
setMethod("F.measure.multilabel", signature(target = "matrix", predicted = "matrix"),
  function(target, predicted, b.per.example = FALSE) {
    n.examples <- nrow(target)
    n.classes <- ncol(target)
    if ((n.examples != nrow(predicted)) || (n.classes != ncol(predicted)))
      stop("F.measure.multilabel: number of rows or columns do not match between target and predicted classes")
    # Without these guards the averages become NaN/NA and the function fails later
    # with an uninformative "missing value where TRUE/FALSE needed" error.
    if (n.examples == 0)
      stop("F.measure.multilabel: no examples (target and predicted have zero rows)")
    if (anyNA(target) || anyNA(predicted))
      stop("F.measure.multilabel: NA values are not allowed in target or predicted")

    # Per-example confusion counts. With 0/1 entries:
    #   target + predicted == 2 -> true positive,  == 0 -> true negative
    #   predicted - target == 1 -> false positive, == -1 -> false negative
    agreement <- target + predicted
    TP <- rowSums(agreement == 2)
    TN <- rowSums(agreement == 0)
    difference <- predicted - target
    FP <- rowSums(difference == 1)
    FN <- rowSums(difference == -1)

    # Sanity check: every cell must fall in exactly one of the four categories.
    # It fails when the matrices hold values other than 0 and 1.
    n <- sum(TP) + sum(TN) + sum(FN) + sum(FP)
    if (n != (n.examples * n.classes)) {
      cat("n = ", n, "\n n.examples = ", n.examples, "\n n.classes = ", n.classes, "\n")
      cat(" sum(TP) = ", sum(TP), "\n sum(TN) = ", sum(TN), "\n sum(FN) = ", sum(FN), "\n sum(FP) = ", sum(FP), "\n")
      warning("F.measure.multilabel: Something went wrong in F-measure (entries other than 0/1 in target or predicted?)")
    }

    # Division in which a zero denominator is replaced by 1; the numerator is
    # always 0 in that case, so the ratio is 0 instead of NaN.
    ratio <- function(num, den) {
      den[den == 0] <- 1
      num / den
    }

    precision <- ratio(TP, TP + FP)
    recall <- ratio(TP, TP + FN)
    specificity <- ratio(TN, TN + FP)
    f.measure <- ratio(2 * precision * recall, precision + recall)
    accuracy <- (TP + TN) / n.classes

    av.precision <- sum(precision) / n.examples
    av.recall <- sum(recall) / n.examples
    av.specificity <- sum(specificity) / n.examples
    # "F": harmonic mean of the averaged precision and recall
    overall.av.f.measure <- ratio(2 * av.precision * av.recall, av.precision + av.recall)
    # "avF": per-example F-measure averaged across examples
    av.f.measure <- sum(f.measure) / n.examples
    av.accuracy <- sum(accuracy) / n.examples

    average <- c(av.precision, av.recall, av.specificity, overall.av.f.measure, av.f.measure, av.accuracy)
    names(average) <- c("P", "R", "S", "F", "avF", "A")

    if (b.per.example) {
      per.example <- cbind(precision, recall, specificity, f.measure, accuracy)
      colnames(per.example) <- c("P", "R", "S", "F", "A")
      return(list(average = average, per.example = per.example))
    }
    list(average = average)
  }
)

# Function to select the best hierarchical F-score by choosing an appropriate threshold in the scores
# N.B. All the examples having no positive annotations are discarded
# Arguments:
#   target      : matrix with the target multilabels. 1 stands for positive, 0 for negative
#   pred        : matrix with the predicted scores. Values are assumed to be positive
#   n.round     : number of rounding digits to be applied to pred (default=3)
#   f.criterion : character. Type of F-measure to be used to select the best F. There are 2 possibilities:
#                 1. "F" (default) corresponds to the harmonic mean between the average precision and recall;
#                 2. "avF" corresponds to the per-example F-score averaged across all the examples.
#   verbose     : boolean. If TRUE (def) the number of iterations are printed on stdout
# The pred matrix is rounded according to n.round and all the values of pred are divided by max(pred).
# Then all the thresholds corresponding to all the different values included in pred are attempted, and the threshold
# leading to the maximum f.measure is selected.
#   b.per.example : if TRUE (def: FALSE) precision, recall, F-measure, specificity and accuracy are returned for each example,
#                 otherwise only the averages are returned.
# Output:
#   if b.per.example == FALSE (def.)
#     the function returns a vector with 7 elements relative to the best result in terms of the f.measure:
#     precision, recall, specificity, f.measure, av.f.measure, accuracy, thresh, where thresh is the selected best threshold,
#     av.f.measure is the f.measure averaged across examples and f.measure is the f-score computed as the harmonic mean
#     between the average precision and the average recall
#   otherwise (if b.per.example == TRUE) the function returns a list with two elements:
#     0) average: the same vector with 7 elements aforementioned
#     1) per.example: a named matrix with the precision, recall, specificity and F-measure for each example.
#        Named rows correspond to examples, named columns correspond respectively to precision,
#        recall (sensitivity), specificity and F-measure.
find.best.f <- function(target, pred, n.round = 3, f.criterion = "F", verbose = TRUE, b.per.example = FALSE) {
  # Threshold search: every distinct value of the normalised, rounded score
  # matrix is tried as a cut-off (score >= threshold means predicted positive)
  # and the one maximising res$average[f.criterion] is kept.
  #   target, pred  : 0/1 label matrix and score matrix with the same dimensions
  #   n.round       : rounding digits applied to the normalised scores
  #   f.criterion   : name of the element of the "average" vector to maximise ("F" or "avF")
  #   verbose       : print a progress line every 100 thresholds
  #   b.per.example : also return the per-example measures of the best threshold
  # Returns the named vector P, R, S, F, avF, A, T (T = best threshold), or a
  # list(average = <that vector>, per.example = <matrix>) when b.per.example is TRUE.
  # Stops if no example has a positive label, if pred contains NA, or if
  # f.criterion does not name a measure.

  # Only examples with at least one positive annotation are evaluated.
  selected <- which(rowSums(target) > 0)
  if (length(selected) == 0)
    stop("find.best.f: no example has at least one positive annotation")
  # drop = FALSE keeps a one-row matrix when a single example is selected;
  # without it the subset collapses to a vector and nrow() returns NULL.
  target <- target[selected, , drop = FALSE]
  pred <- pred[selected, , drop = FALSE]
  if (anyNA(pred))
    stop("find.best.f: NA values are not allowed in pred")

  # Normalise to [0, 1]. When the maximum is not a positive finite number the
  # division would yield NaN/flipped scores, so the scores are left unscaled.
  top <- max(pred)
  if (is.finite(top) && top > 0)
    pred <- pred / top
  pred <- round(pred, n.round)

  n.examples <- nrow(pred)
  n.classes <- ncol(pred)
  thresh <- sort(unique(as.numeric(pred)))

  best <- 0
  best.thresh <- 0
  best.res <- NULL
  i <- 0
  for (t in thresh) {
    pred.labels <- matrix(0, nrow = n.examples, ncol = n.classes)
    pred.labels[pred >= t] <- 1
    res <- F.measure.multilabel(target, pred.labels, b.per.example)
    score <- res$average[f.criterion]
    if (length(score) != 1 || is.na(score))
      stop("find.best.f: f.criterion '", f.criterion, "' is not one of: ",
           paste(names(res$average), collapse = ", "))
    if (score > best) {
      best <- score
      best.res <- res
      best.thresh <- t
    }
    i <- i + 1
    if (i %% 100 == 0 && verbose)
      cat("iteration ", i, "\n")
  }
  if (is.null(best.res))
    stop("find.best.f: no threshold achieved a positive value of '", f.criterion, "'")

  best.average <- c(best.res$average, best.thresh)
  names(best.average)[7] <- "T"
  if (b.per.example) {
    best.res$average <- best.average
    return(best.res)
  }
  best.average
}

# Function to select the best hierarchical F-score by choosing an appropriate threshold in the scores
# N.B.
# All the examples having no positive annotations are discarded
# Arguments:
#   target      : matrix with the target multilabels. 1 stands for positive, 0 for negative
#   pred        : matrix with the predicted scores. Values are assumed to be positive
#   n.round     : number of rounding digits to be applied to pred (default=3)
#   f.criterion : character. Type of F-measure to be used to select the best F. There are 2 possibilities:
#                 1. "F" (default) corresponds to the harmonic mean between the average precision and recall;
#                 2. "avF" corresponds to the per-example F-score averaged across all the examples.
#   verbose     : boolean. If TRUE (def) the number of iterations are printed on stdout
# The pred matrix is rounded according to n.round and all the values of pred are divided by max(pred).
# Then all the thresholds corresponding to all the different values included in pred are attempted, and the threshold
# leading to the maximum f.measure is selected.
# Output:
#   a vector with 7 elements relative to the best result in terms of the f.measure:
#   precision, recall, specificity, f.measure, av.f.measure, accuracy, thresh
#   Note: thresh is the selected best threshold
#         av.f.measure is the f.measure averaged across examples
#         f.measure is the f-score computed as the harmonic mean between the average precision and the average recall
find.best.f.OLD <- function(target, pred, n.round = 3, f.criterion = "F", verbose = TRUE) {
  # Older variant of the threshold search that only returns the averaged
  # measures (named P, R, S, F, avF, A) followed by the best threshold (T).
  # Stops if no example has a positive label, if pred contains NA, or if
  # f.criterion does not name a measure.

  # Only examples with at least one positive annotation are evaluated.
  selected <- which(rowSums(target) > 0)
  if (length(selected) == 0)
    stop("find.best.f.OLD: no example has at least one positive annotation")
  # drop = FALSE keeps a one-row matrix when a single example is selected.
  target <- target[selected, , drop = FALSE]
  pred <- pred[selected, , drop = FALSE]
  if (anyNA(pred))
    stop("find.best.f.OLD: NA values are not allowed in pred")

  # Normalise to [0, 1]; skipped when the maximum is not a positive finite
  # number, because the division would produce NaN or flip the score order.
  top <- max(pred)
  if (is.finite(top) && top > 0)
    pred <- pred / top
  pred <- round(pred, n.round)

  n.examples <- nrow(pred)
  n.classes <- ncol(pred)
  thresh <- sort(unique(as.numeric(pred)))

  best <- 0
  best.thresh <- 0
  best.res <- NULL
  i <- 0
  for (t in thresh) {
    pred.labels <- matrix(0, nrow = n.examples, ncol = n.classes)
    pred.labels[pred >= t] <- 1
    res <- F.measure.multilabel(target, pred.labels, b.per.example = FALSE)
    score <- res$average[f.criterion]
    if (length(score) != 1 || is.na(score))
      stop("find.best.f.OLD: f.criterion '", f.criterion, "' is not one of: ",
           paste(names(res$average), collapse = ", "))
    if (score > best) {
      best <- score
      best.res <- res$average
      best.thresh <- t
    }
    i <- i + 1
    if (i %% 100 == 0 && verbose)
      cat("iteration ", i, "\n")
  }
  if (is.null(best.res))
    stop("find.best.f.OLD: no threshold achieved a positive value of '", f.criterion, "'")

  best.res <- c(best.res, best.thresh)
  names(best.res)[7] <- "T"
  best.res
}

# Function to select the best hierarchical F-score by choosing an appropriate threshold in the scores.
# This is the old version.
# Arguments:
#   target   : matrix with the target multilabels
#   pred     : matrix with the predicted scores
#   attempts : number of different thresholds to be attempted to find the best F-score.
#              If attempts=0 all the scores in pred are used as possible threshold
# Output:
#   the named vector of averaged measures (P, R, S, F, avF, A) of the threshold with the best F
#   (harmonic mean of average precision and average recall), or the scalar 0 when no threshold
#   reaches an F greater than 0.
# NOTE(review): the threshold-subsampling loop of this function was garbled in the stored source
# (text between "<" and ">" was lost). It has been reconstructed as an evenly spaced scan over the
# sorted distinct scores; confirm against the original implementation if exact parity matters.
find.best.f.old <- function(target, pred, attempts = 500.0) {
  if (attempts < 0)
    stop("find.best.f.old: attempts must be non-negative")

  thresh <- sort(unique(as.numeric(pred)))
  len.values <- length(thresh)

  # attempts == 0 means "try every distinct score"; otherwise, when there are
  # more distinct scores than attempts, keep an evenly spaced subset of them.
  if (attempts != 0 && len.values > attempts) {
    keep <- unique(round(seq(1, len.values, length.out = attempts)))
    thresh <- thresh[keep]
  }

  best.res <- best <- 0
  i <- 0
  for (thresh.value in thresh) {
    # This old version uses a strict comparison: score > threshold is positive.
    pred.labels <- matrix(0, nrow = nrow(pred), ncol = ncol(pred))
    pred.labels[pred > thresh.value] <- 1
    res <- F.measure.multilabel(target, pred.labels, b.per.example = FALSE)
    # element 4 of the average vector is "F"
    if (res$average[4] > best) {
      best <- res$average[4]
      best.res <- res$average
    }
    i <- i + 1
    if (i %% 10 == 0)
      cat("attempt ", i, "\n")
  }
  best.res
}

# Usage example:
# Here target is a matrix with 0/1 entries of the target multilabels, and pred a numeric matrix with the
# predicted scores. Attempts is the number of different thresholds tested: the larger the number, the
# more accurate the result.
# res <- find.best.f(target, pred, n.round=3);