##########################################################################
# Functions to compute Kiritchenko-like multi-label F-scores
# March 2016
##########################################################################

# Generic function for computing precision, recall, specificity, F-measure and accuracy for multiclass multilabel classification
# S4 generic for the multilabel evaluation measures. It only declares the
# interface (target, predicted, b.per.example) and dispatches to the method
# registered for the classes of the arguments.
# The body is deliberately a bare standardGeneric() call (no braces), so that
# a standard generic function is created.
setGeneric(
  name = "F.measure.multilabel",
  def = function(target, predicted, b.per.example=FALSE) standardGeneric("F.measure.multilabel")
)

# Method that computes precision, recall, specificity and F-measure for multiclass multilabel classification
# Both the target and predicted matrices have a number of rows equal to the number of examples
# and a number of columns equal to the number of the classes.
# Input:
# target : matrix with the target multilabels. 1 entries correspond to positives, 0 to negatives
# predicted : matrix with the predicted multilabels. 1 entries correspond to positives, 0 to negatives
# b.per.example : if TRUE (def: FALSE) precision recall and F-measure are returned for each example, otherwise only the averages are returned.
# Output :
# if b.per.example == FALSE the function returns a list with a single element average: a named vector with the average precision (P), recall (R), specificity (S), F-measure (F), average F-measure (avF) and accuracy (A) across examples;
# otherwise it returns a list with two elements:
# average :  a named vector with the average precision (P), recall (R), specificity (S), F-measure (F), average F-measure (avF) and accuracy (A) across examples.
#            P, R and S correspond respectively to the average precision, recall and specificity;
#            F is the F-measure computed as the harmonic mean between the average precision and the average recall;
#            avF is the F-measure computed for each example and then averaged across examples;
#            A is the accuracy computed for each example and then averaged across examples.
# per.example : a named matrix with the precision, recall, specificity, F-measure and accuracy for each example.
#                       Named rows correspond to examples,
#                       named columns (P, R, S, F, A) correspond respectively to precision, recall (sensitivity), specificity, F-measure and accuracy.
setMethod("F.measure.multilabel", signature(target="matrix", predicted="matrix"),
  function(target, predicted, b.per.example=FALSE) {
    # Computes per-example precision, recall, specificity, F-measure and
    # accuracy for multilabel predictions, together with their averages
    # across examples.
    # Input:
    # target : matrix of target multilabels (rows = examples, columns = classes);
    #          1 entries are positives, 0 entries are negatives
    # predicted : matrix of predicted multilabels with the same dimensions
    #             and the same 0/1 coding as target
    # b.per.example : if TRUE the per-example measures are returned as well
    # Output:
    # a list with the element average, a named vector (P, R, S, F, avF, A):
    #   P, R, S : per-example precision, recall, specificity averaged across examples
    #   F       : harmonic mean of the average precision and the average recall
    #   avF     : per-example F-measure averaged across examples
    #   A       : per-example accuracy averaged across examples
    # and, only if b.per.example is TRUE, the element per.example: a matrix
    # with one row per example and the columns P, R, S, F, A.
    # Errors:
    # stops if the dimensions of target and predicted differ, if there are
    # no examples, or if missing values are found.
    n.examples <- nrow(target)
    n.classes <- ncol(target)
    if ((n.examples != nrow(predicted)) || (n.classes != ncol(predicted)))
      stop("F.measure.multilabel: number of rows or columns do not match between target and predicted classes")
    # With zero examples every average would be 0/0; fail with a clear message.
    if (n.examples == 0)
      stop("F.measure.multilabel: target and predicted have no rows (no examples to evaluate)")

    # With 0/1 entries: sum == 2 -> true positive, sum == 0 -> true negative,
    # (predicted - target) == 1 -> false positive, == -1 -> false negative.
    label.sum <- target + predicted
    label.diff <- predicted - target
    # Missing values would propagate to every count and make the checks
    # below fail with an uninformative error.
    if (anyNA(label.sum) || anyNA(label.diff))
      stop("F.measure.multilabel: missing values (NA/NaN) found while comparing target and predicted")

    # Vectorized per-example (row-wise) counts.
    TP <- rowSums(label.sum == 2)
    TN <- rowSums(label.sum == 0)
    FP <- rowSums(label.diff == 1)
    FN <- rowSums(label.diff == -1)
    rm(label.sum, label.diff)

    # Sanity check: every cell must fall in exactly one of the four counts.
    n <- sum(TP) + sum(TN) + sum(FN) + sum(FP)
    if (n != (n.examples * n.classes)) {
      warning("F.measure.multilabel: Something went wrong in F-measure: ",
              "n = ", n, ", n.examples = ", n.examples, ", n.classes = ", n.classes,
              ", sum(TP) = ", sum(TP), ", sum(TN) = ", sum(TN),
              ", sum(FN) = ", sum(FN), ", sum(FP) = ", sum(FP),
              " (target and predicted are expected to contain only 0/1 entries)")
    }

    # A zero denominator always comes with a zero numerator here, so replacing
    # it with 1 yields a ratio of 0 instead of NaN.
    non.zero <- function(d) {
      d[d == 0] <- 1
      d
    }

    precision <- TP / non.zero(TP + FP)
    recall <- TP / non.zero(TP + FN)
    specificity <- TN / non.zero(TN + FP)
    f.measure <- (2 * precision * recall) / non.zero(precision + recall)
    accuracy <- (TP + TN) / n.classes

    av.precision <- sum(precision) / n.examples
    av.recall <- sum(recall) / n.examples
    av.specificity <- sum(specificity) / n.examples
    # F computed from the averaged precision and recall ...
    overall.av.f.measure <- (2 * av.precision * av.recall) / non.zero(av.precision + av.recall)
    # ... as opposed to the average of the per-example F-measures.
    av.f.measure <- sum(f.measure) / n.examples
    av.accuracy <- sum(accuracy) / n.examples

    average <- c(P = av.precision, R = av.recall, S = av.specificity,
                 F = overall.av.f.measure, avF = av.f.measure, A = av.accuracy)

    # as.logical keeps values such as 1 working as before.
    if (isTRUE(as.logical(b.per.example))) {
      per.example <- cbind(precision, recall, specificity, f.measure, accuracy)
      colnames(per.example) <- c("P", "R", "S", "F", "A")
      return(list(average = average, per.example = per.example))
    }
    list(average = average)
  }
)


# Function to select the best hierarchical F-score by choosing an appropriate threshold in the scores
# N.B. All the examples having no positive annotations are discarded
#  Arguments:
# target : matrix with the target multilabels. 1 stands for positive, 0 for negative
# pred : matrix with the predicted scores. Values are assumed to be positive 
# n.round : number of rounding digits to be applied to pred (default=3)
# f.criterion : character. Type of F-measure to be used to select the best F. There are 2 possibilities: "F" (default) corresponds to the harmonic mean between the average precision and recall; "avF" corresponds to the per-example F-score averaged across all the examples.
# verbose : boolean. If TRUE (def) the number of iterations are printed on stdout
# All the values of pred are divided by max(pred) and then the pred matrix is rounded according to n.round. Then all the thresholds corresponding to all the different values included in pred are attempted, and the threshold leading to the maximum F-measure (according to f.criterion) is selected.
# Output:
# a vector with 7 elements relative to the best result in terms of the f.measure:   precision, recall, specificity, f.measure, av.f.measure, accuracy, thresh
# Note: thresh is the selected best threshold   
# av.f.measure is the f.measure averaged across examples
# f.measure is the f-score computed as the harmonic mean between the average precision and the average recall   
find.best.f <- function(target, pred, n.round=3, f.criterion ="F", verbose=TRUE)  {
  # Selects the score threshold leading to the best multilabel F-measure.
  # Arguments:
  # target : matrix with the target multilabels (1 positive, 0 negative)
  # pred : matrix with the predicted scores, same dimensions as target
  # n.round : number of digits used to round the normalized scores
  # f.criterion : name of the element of the average vector returned by
  #               F.measure.multilabel that is maximized ("F" or "avF")
  # verbose : if TRUE a progress line is printed every 100 thresholds
  # Output:
  # the average vector of F.measure.multilabel obtained at the best threshold
  # with the selected threshold appended as the element "T". With equal
  # scores the lowest threshold is kept.
  # Errors:
  # stops if no example has a positive label, if no threshold is available
  # or if f.criterion does not select a valid measure.

  # Examples without any positive annotation are discarded.
  selected <- which(rowSums(target) > 0)
  if (length(selected) == 0)
    stop("find.best.f: no example with at least one positive label in target")

  # drop=FALSE keeps the matrix structure when a single example is selected.
  target <- target[selected, , drop=FALSE]
  pred <- pred[selected, , drop=FALSE]

  # Scores are scaled by their maximum and then rounded, so that the number
  # of distinct candidate thresholds is bounded by the rounding precision.
  pred <- round(pred / max(pred), n.round)
  n.examples <- nrow(pred)
  n.classes <- ncol(pred)

  thresh <- sort(unique(as.numeric(pred)))

  # Start from -Inf so that a result is recorded even when the best score is 0.
  best <- -Inf
  best.res <- NULL
  best.thresh <- 0
  no.labels <- matrix(0, nrow=n.examples, ncol=n.classes)

  for (i in seq_along(thresh)) {
    # An example is predicted positive for a class when its score reaches the threshold.
    pred.labels <- no.labels
    pred.labels[pred >= thresh[i]] <- 1
    res <- F.measure.multilabel(target, pred.labels, b.per.example=FALSE)
    score <- res$average[f.criterion]
    if (is.na(score))
      stop("find.best.f: f.criterion does not select a valid measure of F.measure.multilabel")
    if (score > best) {
      best <- score
      best.res <- res$average
      best.thresh <- thresh[i]
    }
    if (verbose && i %% 100 == 0)
      cat("iteration ", i,  "\n")
  }

  if (is.null(best.res))
    stop("find.best.f: no threshold could be evaluated (pred has no valid scores)")

  c(best.res, T = best.thresh)
}


# Function to select the best hierarchical F-score by choosing an appropriate threshold in the scores. This is the old version.
#  Arguments:
# target : matrix with the target multilabels
# pred : matrix with the predicted scores
# attempts : number of different thresholds to be attempted to find the best F-score. If attempts=0 all the scores in pred are used as possible threshold
# Output:
# a vector with 6 elements relative to the best results: precision, recall, specificity, f.measure, av.f.measure, accuracy
find.best.f.old <- function(target, pred, attempts=500.0)  {
  # Old version of the threshold selection: tries a set of thresholds taken
  # from the sorted distinct values of pred and keeps the one with the best
  # F-measure (element "F" of the average vector of F.measure.multilabel).
  # Arguments:
  # target : matrix with the target multilabels
  # pred : matrix with the predicted scores
  # attempts : number of thresholds to be attempted; if 0 every distinct
  #            score in pred is used as threshold
  # Output:
  # the average vector of F.measure.multilabel obtained at the best threshold
  # (with equal F the lowest threshold is kept); 0 if no threshold could be
  # evaluated.
  # A progress line is printed on stdout every 10 attempts.

  thresh <- sort(unique(as.numeric(pred)))
  len.values <- length(thresh)

  if (attempts == 0) {
    candidates <- thresh
  } else {
    # Thresholds are sampled at regular steps among the sorted distinct scores.
    step <- if (len.values > attempts) len.values / attempts else 1
    idx <- floor(seq_len(max(0, ceiling(attempts))) * step)
    # When there are fewer distinct scores than attempts the positions beyond
    # the last score do not correspond to any threshold: they are dropped
    # instead of being evaluated as missing thresholds.
    candidates <- thresh[idx[idx >= 1 & idx <= len.values]]
  }

  # Start from -Inf so that a full result vector is returned even when the
  # best F-measure is 0.
  best <- -Inf
  best.res <- 0
  no.labels <- matrix(0, nrow=nrow(pred), ncol=ncol(pred))

  for (i in seq_along(candidates)) {
    # Here a label is predicted only when the score is strictly above the threshold.
    pred.labels <- no.labels
    pred.labels[pred > candidates[i]] <- 1
    res <- F.measure.multilabel(target, pred.labels, b.per.example=FALSE)
    if (res$average["F"] > best) {
      best <- res$average["F"]
      best.res <- res$average
    }
    if (i %% 10 == 0)
      cat("attempt ", i,  "\n")
  }
  return(best.res)
}


# Usage example:
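# Here target is a matrix with 0/1 entries of the target multilabels, and pred a numeric matrix with the predicted scores. n.round is the number of digits used to round the scores: the larger it is, the more distinct thresholds are tested. (In find.best.f.old, attempts is the number of different thresholds tested: the larger the number, the more accurate the result.)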
#res <- find.best.f(target, pred, n.round=3);