# Random forest library for processing genetic data
# May 2016
# October 2016
# March 2017
# Last updated 2021

library(randomForest);

# random forest cross-validation
# Input:
# data : a data frame or matrix with the  data
# y : a factor with the labels. 0:negative class, 1: positive class.
# kk : number of folds (def: 5)
# ntree : number of trees of the rf (def: 500)
# mtry : number of the features to be randomly selected by the rf, If 0 (def) it is the square root  of the number of the  features.
# seed : initialization seed for the random generator ( if set to 0(def.) no inizialization is performed)
# model.file : name of the file where the cross-validated random forest models will be saved. If file=="" (def.) no model is saved.
# verbose : logical. If TRUE messages about the status of the cross validation are displayed (def: FALSE);
# Output:
# a vector with the cross-validated random forest probabilities.

rf.cv <- function (data, y, kk=5, ntree=500, mtry=round(sqrt(ncol(data))), seed = 0, model.file="", verbose=FALSE)  {

  n.data <- nrow(data);
  n.col <- ncol(data);
  indices.positives <- which(y == 1) ;
  prob <- numeric(n.data);
  names(prob) <- rownames(data);
  rf.list <- vector(mode="list", kk);
  
  folds <- do.stratified.cv.data(1:n.data, indices.positives, k=kk, seed=seed);

  for (i in 1:kk)  {     
	 # preparation of the training and test data
	 ind.test <- c(folds$fold.positives[[i]], folds$fold.non.positives[[i]]);
     ind.pool.pos <- integer(0);
	 ind.pool.neg <- integer(0);
     for (j in 1:kk)
	   if (j!=i)  {
 	     ind.pool.pos <- c(ind.pool.pos, folds$fold.positives[[j]]);
 	     ind.pool.neg <- c(ind.pool.neg, folds$fold.non.positives[[j]]);
 	   }
	 data.train <- data[c(ind.pool.pos, ind.pool.neg),];
	 y.train <- as.factor(c(rep(1, length(ind.pool.pos)), rep (0, length(ind.pool.neg))));
	 mtry.curr=mtry;
	 if ((mtry == 0) || (mtry > n.col)) {
	   mtry.curr <- round(sqrt(n.col));
	   if (mtry.curr<1)
	     mtry.curr <- 1;
	   if (verbose)
	     cat("Automatic selection of mtry = ", mtry.curr, "\n");
	 } 
	 # training	 
	 if (verbose)
	   cat("Starting training on Fold ", i, "...\n");	 
	 rf <- randomForest(data.train, y.train, ntree = ntree, mtry = mtry.curr);	 
	 rm(data.train); gc();
	 # test
	 data.test <- data[ind.test,];
	 if (verbose)
	   cat("Starting test on Fold ", i, "...\n");
	 prob[ind.test] <- predict(rf, data.test, type="prob")[,"1"]; 
	 if (verbose)
	   cat("End test on Fold ", i, "\n");
	 rm(data.test); 
	 if (model.file=="")
	    rm(rf)
	 else
	    rf.list[[i]] <- rf;	 
	 gc(); 
	 if (verbose)
	    cat("Fold ", i, " done -----\n");
  }
  if (model.file != "")
    save(rf.list, file=model.file);
  return(prob);
}

# random forest cross-validation with univariate correlation-based feature selection
# Input:
# data : a data frame or matrix with the  data
# y : a factor with the labels. 0:negative class, 1: positive class.
# kk : number of folds (def: 5)
# ntree : number of trees of the rf (def: 500)
# mtry : number of the features to be randomly selected by the rf, If 0 (def) it is the square root  of the number of the selected features mutliplied by eta
# eta : multiplicative factor for the default number of tandomly selected features.
# seed : initialization seed for the random generator ( if set to 0 (def.) no inizialization is performed)
# n.feature : number of the features to be selected in the training set. If 0 (def) the number of features is determined by the corr parameter
# corr : absolute value of the correlation to seelct the features. Only features having absolute correlation equal or larger than corr
#        are selected. Default value is 0.2.
# model.file : name of the file where the cross-validated random forest models will be saved. If file=="" (def.) no model is saved.
# save.features : logical. If TRUE (default is FALSE), the features selected at each fold of the CV are saved.
# verbose : logical. If TRUE messages about the status of the cross validation are displayed (def: FALSE);
# Output:
# a list with three components:
#   prob: a vector with the cross-validated random forest probabilities.
#   label : predicted classes (1 for positives 0 for negatives)
#   features : a list with kk components. Each component is a numeric vector storing the indices of the features selected at each step of the CV
rf.corr.cv <- function (data, y, kk=10, ntree=500, mtry=0, eta=1, n.feature=0, corr=0.2, seed = 0, model.file="", 
                        save.features=FALSE, verbose=FALSE)  {

  n.data <- nrow(data);
  indices.positives <- which(y == 1) ;
  prob <- numeric(n.data);
  pred.label <- factor(character(n.data), levels=c("1","0"));
  names(prob) <- names(pred.label) <- rownames(data);
  rf.list <- vector(mode="list", kk);
  selected.features <- vector("list", kk);  # list storing the indices of the features selected at each step of the CV
  
  folds <- do.stratified.cv.data(1:n.data, indices.positives, k=kk, seed=seed);
  
  # find features with sd=0 and remove them from the data
  xx <- apply(data,2,sd);
  which0 <- which(xx==0);
  if (length(which0)>0)  
    data <- data[,-which0];
  rm(xx);

  for (i in 1:kk)  {     
	 # preparation of the training and test data
	 ind.test <- c(folds$fold.positives[[i]], folds$fold.non.positives[[i]]);
     ind.pool.pos <- integer(0);
	 ind.pool.neg <- integer(0);
     for (j in 1:kk)
	   if (j!=i)  {
 	     ind.pool.pos <- c(ind.pool.pos, folds$fold.positives[[j]]);
 	     ind.pool.neg <- c(ind.pool.neg, folds$fold.non.positives[[j]]);
 	 }
	 data.train <- data[c(ind.pool.pos, ind.pool.neg),];
	 y.train <- as.factor(c(rep(1, length(ind.pool.pos)), rep (0, length(ind.pool.neg))));
	 # selection of the best features
	 # check whether some features have 0 standard deviation
	 #if (length(which0)>0)
	 #  data.train <- data.train[,-which0];	 
	 # computing correlation
	 yy <- ifelse(y.train==1,1,0);
	 yy <-  matrix (yy,ncol=1);
	 res <- cor(yy,data.train);
     SNP.names <- colnames(data.train);
     res <- as.numeric(res);
     names(res) <- SNP.names;
     if (n.feature > 0) 
	    ind.selected <- order(abs(res), decreasing=TRUE)[1:n.feature]
	 else 
	    ind.selected <- c(which(res>=corr), which(res <= -corr));
	 if (save.features)
	   selected.features[[i]] <- ind.selected; 
	 # training	 
	 if (verbose)
	   cat("Starting training on Fold ", i, "...\n");	
	 n.selected.features <- length(ind.selected); 
	 mtry.curr=mtry;
	 if ((mtry == 0) || (mtry > n.selected.features)) {
	   mtry.curr <- round(eta*sqrt(n.selected.features));
	   if (mtry.curr<1)
	     mtry.curr <- 1;
	   if (verbose)
	     cat("Automatical selection of mtry = ", mtry.curr, "\n");
	 } 
	 rf <- randomForest(data.train[, ind.selected], y.train, ntree = ntree, mtry = mtry.curr);	 
	 rm(data.train); gc();
	 # test
	 data.test <- data[ind.test,];
	 #if (length(which0)>0)  # removing features with sd=0 in the training set
	 #  data.test <- data.test[,-which0];
	 if (verbose)
	   cat("Starting test on Fold ", i, "...\n");
	 prob[ind.test] <- predict(rf, data.test[, ind.selected], type="prob")[,"1"]; 
	 pred.label[ind.test] <- predict(rf, data.test[, ind.selected], type="response"); 

     # computing AUROC and AUPRC on the current fold
	 if (verbose) {
       digits=4;
       lab=ifelse(y==1,1,0);
       lab <- lab[ind.test];
       sscurves <- evalmod(scores = prob[ind.test], labels = lab);
       m<-attr(sscurves,"auc",exact=FALSE);
       AUPRC <- round(m[2,"aucs"],digits);  
       AUROC <- round(m[1,"aucs"],digits);  
       cat ("AUROC = ", AUROC, "\n");
       cat ("AUPRC = ", AUPRC, "\n");
	 }	  
	 if (verbose)
	   cat("End test on Fold ", i, "\n");
	 rm(data.test); 
	 if (model.file=="")
	    rm(rf)
	 else
	    rf.list[[i]] <- rf;	 
	 gc(); 
	 if (verbose)
	    cat("Fold ", i, " done -----\n");
  }
  if (model.file != "")
    save(rf.list, file=model.file);
  return(list(prob=prob, label=pred.label, features=selected.features));
}


# random forest cross-validation with univariate t-test-based feature selection
# Input:
# data : a data frame or matrix with the  data
# y : a factor with the labels. 0:negative class, 1: positive class.
# kk : number of folds (def: 5)
# ntree : number of trees of the rf (def: 500)
# mtry : number of the features to be randomly selected by the rf, If 0 (def) it is the square root of the number of the selected features
# seed : initialization seed for the random generator ( if set to 0 (def.) no inizialization is performed)
# n.feature : number of the features to be selected in the training set. If 0 (def) the number of features is determned by the p.value parameter
# p.value.thresh : Only features having p.value from the t-test lower than p.value.thresh are selected. Default value is 0.001.
# model.file : name of the file where the cross-validated random forest models will be saved. If file=="" (def.) no model is saved.
# save.features : logical. If TRUE (default is FALSE), hte features selected at each fold of the CV are saved.
# verbose : logical. If TRUE messages about the status of the cross validation are displayed (def: FALSE);
# Output:
# a vector with the cross-validated random forest probabilities.
rf.ttest.cv <- function (data, y, kk=10, ntree=500, mtry=0, n.feature=0, p.value=0.001, seed = 0, model.file="", 
                        save.features=FALSE, verbose=FALSE)  {

  MIN.SELECTED.FEATURES=5;
  n.data <- nrow(data);
  indices.positives <- which(y == 1) ;
  prob <- numeric(n.data);
  names(prob) <- rownames(data);
  rf.list <- vector(mode="list", kk);
  
  folds <- do.stratified.cv.data(1:n.data, indices.positives, k=kk, seed=seed);

  for (i in 1:kk)  {     
	 # preparation of the training and test data
	 ind.test <- c(folds$fold.positives[[i]], folds$fold.non.positives[[i]]);
     ind.pool.pos <- integer(0);
	 ind.pool.neg <- integer(0);
     for (j in 1:kk)
	   if (j!=i)  {
 	     ind.pool.pos <- c(ind.pool.pos, folds$fold.positives[[j]]);
 	     ind.pool.neg <- c(ind.pool.neg, folds$fold.non.positives[[j]]);
 	 }
	 data.train <- data[c(ind.pool.pos, ind.pool.neg),];
	 y.train <- as.factor(c(rep(1, length(ind.pool.pos)), rep (0, length(ind.pool.neg))));
	 # selection of the best features
	 # check whether some features have 0 standard deviation
	 xx <- apply(data.train,2,sd);
	 which0 <- which(xx==0);
	 if (length(which0)>0)
	   data.train <- data.train[,-which0];
	 # computing t.test
	 yy <- ifelse(y.train==1,1,0);
	 n.feat <- ncol(data.train);
	 res <- numeric(n.feat);
	 names(res) <- colnames(data.train);
	 res <- apply(data.train, 2, function(x) { t.test(x[yy==1], x[yy==0])$p.value });
	 #for (j in 1:n.feat) 
	 #   res[j] <- t.test(data.train[,j][yy==1], data.train[,j][yy==0])$p.value;
     if (n.feature > 0) 
	    ind.selected <- order(res, decreasing=FALSE)[1:n.feature]
	 else 
	    ind.selected <- which(res<p.value);
	 n.selected.features <- length(ind.selected);
	 if (n.selected.features < MIN.SELECTED.FEATURES)
	   n.selected.features=MIN.SELECTED.FEATURES; 
	 if (verbose)
	   cat("Selected ", n.selected.features, " features \n");
	 # training	 
	 if (verbose)
	   cat("Starting training on Fold ", i, "...\n");		 
	 mtry.curr=mtry;
	 if ((mtry == 0) || (mtry > n.selected.features)) {
	   mtry.curr <- round(sqrt(n.selected.features));
	   if (mtry.curr<1)
	     mtry.curr <- 1;
	   if (verbose)
	     cat("Automatical selection of mtry = ", mtry.curr, "\n");
	 } 
	 rf <- randomForest(data.train[, ind.selected], y.train, ntree = ntree, mtry = mtry.curr);	 
	 rm(data.train); gc();
	 # test
	 data.test <- data[ind.test,];
	 if (length(which0)>0)  # removing features with sd=0 in the training set
	   data.test <- data.test[,-which0];
	 if (verbose)
	   cat("Starting test on Fold ", i, "...\n");
	 p <- predict(rf, data.test[, ind.selected], type="prob")[,"1"]; 

     # computing AUROC and AUPRC on the current fold
     digits=4;
     lab=ifelse(y==1,1,0);
     lab <- lab[ind.test];
     sscurves <- evalmod(scores = p, labels = lab);
     m<-attr(sscurves,"auc",exact=FALSE);
     AUPRC <- round(m[2,"aucs"],digits);  
     AUROC <- round(m[1,"aucs"],digits);  
     cat ("AUROC = ", AUROC, "\n");
     cat ("AUPRC = ", AUPRC, "\n");
	 
	 prob[ind.test] <- p;
	 if (verbose)
	   cat("End test on Fold ", i, "\n");
	 rm(data.test); 
	 if (model.file=="")
	    rm(rf)
	 else
	    rf.list[[i]] <- rf;	 
	 gc(); 
	 if (verbose)
	    cat("Fold ", i, " done -----\n");
  }
  if (model.file != "")
    save(rf.list, file=model.file);
  return(prob);
}


# random forest leave-one-out cross-validation with univariate correlation-based feature selection
# Input:
# data : a data frame or matrix with the  data
# y : a factor with the labels. 0:negative class, 1: positive class.
# ntree : number of trees of the rf (def: 500)
# mtry : number of the features to be randomly selected by the rf, If 0 (def) it is the square root of the number of the selected features
# n.feature : number of the features to be selected in the training set. If 0 (def) the number of features is determned by the corr parameter
# corr : absolute value of the correlation to select the features. Only features having absolute correlation equal or larger than corr
#        are selected. Default value is 0.2.
# model.file : name of the file where the cross-validated random forest models will be saved. If file=="" (def.) no model is saved.
# save.features : logical. If TRUE (default is FALSE), the features selected at each fold of the CV are saved (not implemented)
# verbose : logical. If TRUE messages about the status of the cross validation are displayed (def: FALSE);
# Output:
# a list with two components:
#   prob: a vector with the cross-validated random forest probabilities.
#   pred : predicted classes (1 for positives 0 for negatives)
rf.corr.loo <- function (data, y, ntree=500, mtry=0, n.feature=0, corr=0.2,  model.file="", 
                        save.features=FALSE, verbose=FALSE)  {
  
  n.data <- nrow(data);
  prob <- numeric(n.data);
  pred.label <- factor(character(n.data), levels=c("1","0"));
  names(prob) <- rownames(data);
  rf.list <- vector(mode="list", n.data);

  for (i in 1:n.data)  {     
	 # preparation of the training and test data
	 data.test <- data[i,];
	 data.test <- matrix(data.test, nrow=1);  
	 data.train <- data[-i,];
	 y.train <- y[-i];
	 # selection of the best features
	 # check whether some features have 0 standard deviation
	 xx <- apply(data.train,2,sd);
	 which0 <- which(xx==0);
	 if (length(which0)>0)  {
	   data.train <- data.train[,-which0];
	   data.test<-matrix(data.test[, -which0],nrow=1);
	 }
	 # computing correlation
	 yy <- ifelse(y.train==1,1,0);
	 yy <-  matrix (yy,ncol=1);
	 res <- cor(yy,data.train);
     SNP.names <- colnames(data.train);
     res <- as.numeric(res);
     names(res) <- SNP.names;
     if (n.feature > 0)
	    ind.selected <- order(abs(res), decreasing=TRUE)[1:n.feature]
	 else 
	    ind.selected <- c(which(res>=corr), which(res <= -corr));
	 # training	 
	 if (verbose)
	   cat("Starting training on Fold ", i, "...\n");	
	 n.selected.features <- length(ind.selected); 
	 mtry.curr=mtry;
	 if ((mtry == 0) || (mtry > n.selected.features)) {
	   mtry.curr <- round(sqrt(n.selected.features));
	   if (mtry.curr<1)
	     mtry.curr <- 1;
	   if (verbose)
	     cat("Automatical selection of mtry = ", mtry.curr, "\n");
	 } 
	 rf <- randomForest(data.train[, ind.selected], y.train, ntree = ntree, mtry = mtry.curr);	 
	 rm(data.train); gc();
	 # test
	 if (verbose)
	   cat("Starting test on Fold ", i, "...\n");
	 prob[i] <- predict(rf, data.test[, ind.selected], type="prob")[,"1"]; 
	 pred.label[i] <- predict(rf, data.test[, ind.selected], type="response"); 

     # computing AUROC and AUPRC on the current fold
     #digits=4;
     #lab=ifelse(y==1,1,0);
     #lab <- lab[ind.test];
     #sscurves <- evalmod(scores = p, labels = lab);
     #m<-attr(sscurves,"auc",exact=FALSE);
     #AUPRC <- round(m[2,"aucs"],digits);  
     #AUROC <- round(m[1,"aucs"],digits);  
     #cat ("AUROC = ", AUROC, "\n");
     #cat ("AUPRC = ", AUPRC, "\n");
	 
	  
	 if (verbose)  {
	   cat("Example ", i, " : label = ", 2-as.numeric(y[i]), ", label pred = ",  2-as.numeric(pred.label[i]), ", score = ", prob[i], "\n");
	   cat("End test on Fold ", i, "\n");
	 }
	 rm(data.test); 
	 if (model.file=="")
	    rm(rf)
	 else
	    rf.list[[i]] <- rf;	 
	 gc(); 
	 if (verbose)
	    cat("Fold ", i, " done -----\n");
  }
  if (model.file != "")
    save(rf.list, file=model.file);
  return(list(prob=prob, label=pred.label));
}


###############################################################################################à
###############################################################################################

# random forest cross-validation with embedded variable importance feature selection.
# Selection of the features is performed by internal oob (oob on the training data).
# Input:
# data : a data frame or matrix with the  data
# y : a factor with the labels. 0:negative class, 1: positive class.
# kk : number of folds (def: 10)
# ntree : number of trees of the rf (def: 500)
# mtry : number of the features to be randomly selected by the rf, If 0 (def) it is the square root  of the number of the selected features mutliplied by eta
# eta : multiplicative factor for the default number of randomly selected features.
# sel.type : integer. If 1 the mean decrease in accuracy is used,  if 2 (def) the mean decrease in node impurity (Gini index) is used instead.
# (1=mean decrease in accuracy, 2=mean decrease in node impurity)
# n.feature : number of the features to be selected in the training set through the variable importance measure (def.= 1000)
# seed : initialization seed for the random generator ( if set to 0 (def.) no inizialization is performed)
# model.file : name of the file where the cross-validated random forest models will be saved. If file=="" (def.) no model is saved.
# save.features : logical. If TRUE (default is FALSE), the features selected at each fold of the CV are saved.
# verbose : logical. If TRUE messages about the status of the cross validation are displayed (def: FALSE);
# Output:
# a list with three components:
#   prob: a vector with the cross-validated random forest probabilities.
#   label : predicted classes (1 for positives 0 for negatives)
#   features : a list with kk components. Each component is a numeric vector storing the indices of the features selected at each step of the CV
rf.vi.oob.cv <- function (data, y, kk=10, ntree=500, mtry=0, eta=1, sel.type=2, n.feature=1000, seed = 0, model.file="", 
                        save.features=FALSE, verbose=FALSE)  {

  n.data <- nrow(data);
  indices.positives <- which(y == 1) ;
  prob <- numeric(n.data);
  pred.label <- factor(character(n.data), levels=c("1","0"));
  names(prob) <- names(pred.label) <- rownames(data);
  rf.list <- vector(mode="list", kk);
  selected.features <- vector("list", kk);  # list storing the indices of the features selected at each step of the CV
  
  folds <- do.stratified.cv.data(1:n.data, indices.positives, k=kk, seed=seed);

  for (i in 1:kk)  {     
	 # preparation of the training and test data
	 ind.test <- c(folds$fold.positives[[i]], folds$fold.non.positives[[i]]);
     ind.pool.pos <- integer(0);
	 ind.pool.neg <- integer(0);
     for (j in 1:kk)
	   if (j!=i)  {
 	     ind.pool.pos <- c(ind.pool.pos, folds$fold.positives[[j]]);
 	     ind.pool.neg <- c(ind.pool.neg, folds$fold.non.positives[[j]]);
 	 }
	 data.train <- data[c(ind.pool.pos, ind.pool.neg),];
	 y.train <- as.factor(c(rep(1, length(ind.pool.pos)), rep (0, length(ind.pool.neg))));
	 tot.feature <- ncol(data.train);
	 # selection of the best features using variable importance measures
	 # mtry setting for oob on the training set
	 if (verbose)
	   cat("Starting oob estimation on Fold ", i, "...\n");
	 mtry.curr=mtry;
	 if ((mtry == 0) || (mtry > tot.feature)) {
	   mtry.curr <- round(eta*sqrt(tot.feature));
	   if (mtry.curr<1)
	     mtry.curr <- 1;
	   if (verbose)
	     cat("Automatical selection of mtry for oob selection = ", mtry.curr, "\n");
	 } 
	 rf.oob <- randomForest(data.train, y.train, ntree = ntree, mtry = mtry.curr, importance=T);
     # computing variable importance and selecting the top variables
	 if (verbose) {
	   cat("OOB error estimate = ", rf.oob$err[ntree,1], "\n\n");
	   cat("Computing variable importance on Fold ", i, "...\n");	   
	 }
	 var.imp <- importance(rf.oob, type=sel.type);
     var.imp <- as.numeric(var.imp);
	 sorted.indices <- order(var.imp,decreasing = T);
     indices.selected <- sorted.indices[1:n.feature];
     data.train.sel <- data.train[, indices.selected]; 
	 rm(data.train, var.imp); gc();	 
	 if (save.features)
	   selected.features[[i]] <- indices.selected;
	 # training	 
	 if (verbose)
	   cat("Starting training on Fold ", i, "...\n");	
	 n.selected.features <- length(indices.selected); 
	 # setting of mtry for training
	 mtry.curr=mtry;
	 if ((mtry == 0) || (mtry > n.selected.features)) {
	   mtry.curr <- round(eta*sqrt(n.selected.features));
	   if (mtry.curr<1)
	     mtry.curr <- 1;
	   if (verbose)
	     cat("Automatical selection of mtry = ", mtry.curr, "\n");
	 } 
	 rf <- randomForest(data.train.sel, y.train, ntree = ntree, mtry = mtry.curr);	
	 # test
	 data.test <- data[ind.test, indices.selected];
	 #if (length(which0)>0)  # removing features with sd=0 in the training set
	 #  data.test <- data.test[,-which0];
	 if (verbose)
	   cat("Starting test on Fold ", i, "...\n");
	 prob[ind.test] <- predict(rf, data.test, type="prob")[,"1"]; 
	 pred.label[ind.test] <- predict(rf, data.test, type="response"); 

     # computing AUROC and AUPRC on the current fold
	 if (verbose) {
       digits=4;
       lab=ifelse(y==1,1,0);
       lab <- lab[ind.test];
       sscurves <- evalmod(scores = prob[ind.test], labels = lab);
       m<-attr(sscurves,"auc",exact=FALSE);
       AUPRC <- round(m[2,"aucs"],digits);  
       AUROC <- round(m[1,"aucs"],digits);  
       cat ("AUROC = ", AUROC, "\n");
       cat ("AUPRC = ", AUPRC, "\n");
	 }	  
	 if (verbose)
	   cat("End test on Fold ", i, "\n");
	 rm(data.test);
	 if (model.file=="")
	    rm(rf)
	 else
	    rf.list[[i]] <- rf;	 
	 gc(); 
	 if (verbose)
	    cat("Fold ", i, " done -----\n");
  }
  if (model.file != "")
    save(rf.list, file=model.file);
  return(list(prob=prob, label=pred.label, features=selected.features));
}


#######################################################################################
####   Stable feature selection methods                                              ##
#######################################################################################


# Stable selection of features with correlation based univariate method
# The most selected features at each cross-validation are selected. 
# For the selection both training ad test data can be used.
# If training data are used, at each iteration only 1 fold is left out; if test data are used only the left out fold is used at each iteration
# Input:
# data : a data frame or matrix with the  data: rows are examples and columns the features
# y : a factor with the labels. 0:negative class, 1: positive class.
# kk : number of folds (def: 10)
# rep : number of repetitions of the cross validation
# top.feature : value indicating the number of top ranked features to be selected
# train : boolean. If  TRUE (def.) trainnig data are used for feature selection, otherwise test data (that is only the left out fold)
# min.frequency : minimum frequency a feature needs to be selected (a value between 0 and 1, def=1)
# verbose : logical. If TRUE messages about the status of the cross validation are displayed (def: FALSE);
# seed : initialization seed for the random generator ( if set to 0 (def.) no inizialization is performed)
# Output:
# a list woth two componenents:
# ind.selected: a vector with the indices of the selected features 
# scores : a vector with the corresponiding scores
stable.fs.corr <- function(data, y, kk=10, rep=1, top.feature=1000, min.frequency=1, seed = 0, train=TRUE, verbose=FALSE) {

  n.data <- nrow(data);
  n.features <- ncol(data);
  indices.positives <- which(y == 1) ;
  freq <- numeric(n.features);  # vector collecting the number of times each feature is selected
  names(freq) <- colnames(data);
  
  for (r in 1:rep)  {
     if (verbose)
	     cat("Starting feature selection repetition ", r, "\n");
     folds <- do.stratified.cv.data(1:n.data, indices.positives, k=kk, seed=seed+r-1);
     for (i in 1:kk)  {     
	   # preparation of the training data
	   ind.out <- c(folds$fold.positives[[i]], folds$fold.non.positives[[i]]);
       if (train) {
	     data.train <- data[- ind.out,];
	     y.train <- y[-ind.out];
       } else {
         data.train <- data[ind.out,];
	     y.train <- y[ind.out];      
       }
	   # selection of the best features  through correlation
	   if (verbose)
	     cat("Starting feature selection on Fold ", i, "\n");
	   yy <- ifelse(y.train==1,1,0);
	   yy <-  matrix (yy,ncol=1);
	   res <- cor(yy,data.train);
	   ind.selected <- order(abs(res), decreasing=TRUE)[1:top.feature];
	   freq[ind.selected] <- freq[ind.selected] + 1;
     }
   }
   ind.selected <- which(freq >= (min.frequency*kk*rep));
   names(ind.selected) <- names(freq)[ind.selected];
   if (length(ind.selected) < 4)
       warning("Too few features selected \n");
   if (verbose)
      cat ("Selected ", length(ind.selected), " features \n");
   scores <- freq[ind.selected]/(kk*rep);
   names(scores) <- names(ind.selected);
   return(list(ind.selected=ind.selected, scores=scores));
}

# random forest cross-validation with  feature selection performed through "stable" selection of features with correlation based univariate methods
# Input:
# data : a data frame or matrix with the  data
# y : a factor with the labels. 0:negative class, 1: positive class.
# kk : number of folds (def: 5)
# rep : number of repetitions of the cross validation for the feature selection
# ntree : number of trees of the rf (def: 500)
# mtry : number of the features to be randomly selected by the rf, If 0 (def) it is the square root  of the number of the selected features mutliplied by eta
# eta : multiplicative factor for the default number of randomly selected features.
# seed : initialization seed for the random generator ( if set to 0 (def.) no inizialization is performed)
# top.feature : value indicating the number of top ranked features to be selected
# min.frequency : minimum frequency that a features needs to be selected (a value betwee 0 and 1, def.=1)
# model.file : name of the file where the cross-validated random forest models will be saved. If file=="" (def.) no model is saved.
# train : boolean. If  TRUE (def.) trainnig data are used for feature selection, otherwise test data (that is only the left out fold)
# verbose : logical. If TRUE messages about the status of the cross validation are displayed (def: FALSE);
# Output:
# a list with three components:
#   prob: a vector with the cross-validated random forest probabilities.
#   label : predicted classes (1 for positives 0 for negatives)
#   features : a vector with the indices of the selected features
rf.stable.fs.corr.cv <- function (data, y, kk=10, rep=1, ntree=500, mtry=0, eta=1, top.feature=1000, min.frequency=1, seed = 0, model.file="", train=TRUE, verbose=FALSE)  {

  n.data <- nrow(data);
  indices.positives <- which(y == 1) ;
  prob <- numeric(n.data);
  pred.label <- factor(character(n.data), levels=c("1","0"));
  names(prob) <- names(pred.label) <- rownames(data);
  rf.list <- vector(mode="list", kk);
  
  # stable feature selection
  ind.selected <- stable.fs.corr(data, y, kk=kk, rep=rep, top.feature=top.feature, min.frequency=min.frequency, seed = seed, train=train, verbose=verbose);
  
  # cross-validation with the selected features
  folds <- do.stratified.cv.data(1:n.data, indices.positives, k=kk, seed=seed);
  for (i in 1:kk)  {     
	 # preparation of the training and test data
	 ind.test <- c(folds$fold.positives[[i]], folds$fold.non.positives[[i]]);
     ind.pool.pos <- integer(0);
	 ind.pool.neg <- integer(0);
     for (j in 1:kk)
	   if (j!=i)  {
 	     ind.pool.pos <- c(ind.pool.pos, folds$fold.positives[[j]]);
 	     ind.pool.neg <- c(ind.pool.neg, folds$fold.non.positives[[j]]);
 	 }
	 data.train <- data[c(ind.pool.pos, ind.pool.neg),];
	 y.train <- as.factor(c(rep(1, length(ind.pool.pos)), rep (0, length(ind.pool.neg))));
	 # training	 
	 if (verbose)
	   cat("Starting training on Fold ", i, "...\n");	
	 n.selected.features <- length(ind.selected); 
	 mtry.curr=mtry;
	 if ((mtry == 0) || (mtry > n.selected.features)) {
	   mtry.curr <- round(eta*sqrt(n.selected.features));
	   if (mtry.curr<1)
	     mtry.curr <- 1;
	   if (verbose)
	     cat("Automatic selection of mtry = ", mtry.curr, "\n");
	 } 
	 rf <- randomForest(data.train[, ind.selected], y.train, ntree = ntree, mtry = mtry.curr);	 
	 rm(data.train); gc();
	 # test
	 data.test <- data[ind.test,];
	 #if (length(which0)>0)  # removing features with sd=0 in the training set
	 #  data.test <- data.test[,-which0];
	 if (verbose)
	   cat("Starting test on Fold ", i, "...\n");
	 prob[ind.test] <- predict(rf, data.test[, ind.selected], type="prob")[,"1"]; 
	 pred.label[ind.test] <- predict(rf, data.test[, ind.selected], type="response"); 

     # computing AUROC and AUPRC on the current fold
	 if (verbose) {
       digits=4;
       lab=ifelse(y==1,1,0);
       lab <- lab[ind.test];
       sscurves <- evalmod(scores = prob[ind.test], labels = lab);
       m<-attr(sscurves,"auc",exact=FALSE);
       AUPRC <- round(m[2,"aucs"],digits);  
       AUROC <- round(m[1,"aucs"],digits);  
       cat ("AUROC = ", AUROC, "\n");
       cat ("AUPRC = ", AUPRC, "\n");
	 }	  
	 if (verbose)
	   cat("End test on Fold ", i, "\n");
	 rm(data.test); 
	 if (model.file=="")
	    rm(rf)
	 else
	    rf.list[[i]] <- rf;	 
	 gc(); 
	 if (verbose)
	    cat("Fold ", i, " done -----\n");
  }
  if (model.file != "")
    save(rf.list, file=model.file);
  return(list(prob=prob, label=pred.label, features=ind.selected));
}


#######################################################################################
####   Permutation tests                                                             ##
#######################################################################################


# Non parametric permutation test to asses the significance of the prediction
# This function evaluates the statistical significance of the predictions through a non parameteric permutation of the labels.
# Labels are permuted n times and each time the cross-validated accuracy and F-score performance are compared with the perfromance achieved with the true labels.
# A p-value is computed  as the frequency for which a prediction obtained with a random permutation is equal or better than the prediction obtained with the tru labels
# Input:
# data : a data frame or matrix with the  data
# y : a factor with the labels. 0:negative class, 1: positive class.
# kk : number of folds (def: 5)
# nperm : number of permutations
# ntree : number of trees of the rf (def: 500)
# mtry : number of the features to be randomly selected by the rf. If 0 (def) is the square root of the number of features
# seed : initialization seed for the random generator ( if set to 0(def.) no inizialization is performed)
# verbose : logical. If TRUE messages about the status of the cross validation are displayed (def: FALSE);
# Output:
# a vector with two elements: the p.value computed with respect to the F-score and the p-value computed with respect to the accuracy.

permutation.test.rf.cv <- function (data, y, kk=5, nperm=100, ntree=500, mtry=0, seed = 0,  verbose=FALSE)  {
  
  # computing the accuracy and F-score with the true labels
  true.prob <- rf.cv (data=data, y=y, kk=kk, ntree=ntree, mtry=mtry, seed = seed, verbose=FALSE); 

  pred=ifelse(true.prob>=0.5,1,0);
  lab=ifelse(y==1,1,0);
  res <- F.measure.single(pred,lab);
  true.F <- res["F"];
  true.A <- res["A"];
  if (verbose) 
	  cat("True prediction  : ", res, "\n");
  
  # computing the accuracy and F-score with the permuted labels
  
  perm.F <- perm.A <- numeric(nperm);
  for (i in 1:nperm) {
    y.perm <- sample(y);
	perm.prob <- rf.cv (data=data, y=y.perm, kk=kk, ntree=ntree, mtry=mtry, seed = seed, verbose=FALSE);
	pred=ifelse(perm.prob>=0.5,1,0);
    lab=ifelse(y.perm==1,1,0);
	res <- F.measure.single(pred,lab);
	perm.F[i] <- res["F"];
	perm.A[i] <- res["A"];
	if (verbose) 
	  cat("Permuted prediction ", i, " : ", res, "\n");
  }
  p.value <- numeric(2);
  names(p.value) <- c("F", "A");
  p.value["F"] <- sum(perm.F > true.F)/nperm;
  p.value["A"] <- sum(perm.A > true.A)/nperm;
  return(p.value);
}
  
  
# Non parametric permutation test to assess the significance of a given signature
# This function evaluates the statistical significance of a given signature (subset of SNPs) 
# Given a signature S with cardinality n=|S|, a cross-validation is performed other m times using each time a set of SNPs S' with |S'| = n randomly extracted from the full set of the signature. 
# Then the cross-validated accuracy and F-score performance obtained with the random signatures are compared with the perfromance achieved with the true signature.
# A p-value is computed  as the frequency for which a prediction obtained with a random signature is equal or better than the prediction obtained with the true signature
# Input:
# data : a data frame or matrix with the  data
# y : a factor with the labels. 0:negative class, 1: positive class.
# kk : number of folds (def: 5)
# ind.signature: vector of the indices of the features (columns of data) corresponding to the true signature.
# nrep : number of repetitions of the random signature
# ntree : number of trees of the rf (def: 500)
# mtry : number of the features to be randomly selected by the rf (def: square root of the number of features)
# seed : initialization seed for the random generator ( if set to 0(def.) no inizialization is performed)
# verbose : logical. If TRUE messages about the status of the cross validation are displayed (def: FALSE);
# Output:
# a vector with two elements: the p.value computed with respect to the F-score and the p-value computed with respect to the accuracy.

signature.permutation.test.rf.cv <- function (data, y, kk=5, ind.signature, nrep=100, ntree=500, mtry=0, seed = 0,  verbose=FALSE)  {

  # computing the accuracy and F-score with the true signature
  data.true <- data[, ind.signature];
  true.prob <- rf.cv (data=data.true, y=y, kk=kk, ntree=ntree, mtry=mtry, seed = seed, verbose=FALSE); 

  pred=ifelse(true.prob>=0.5,1,0);
  lab=ifelse(y==1,1,0);
  res <- F.measure.single(pred,lab);
  true.F <- res["F"];
  true.A <- res["A"];
  if (verbose) 
	  cat("True signature prediction  : ", res, "\n");
	  
  # computing the accuracy and F-score with the random signatures
  n.feat <- ncol(data);
  n.feat.sign <- length(ind.signature);  
  perm.F <- perm.A <- numeric(nrep);
  for (i in 1:nrep) {
    rand.sign <- sample(1:n.feat, n.feat.sign);
	rand.prob <- rf.cv (data=data[,rand.sign], y=y, kk=kk, ntree=ntree, mtry=mtry, seed = seed, verbose=FALSE);	
	pred=ifelse(rand.prob>=0.5,1,0);   
	res <- F.measure.single(pred,lab);
	perm.F[i] <- res["F"];
	perm.A[i] <- res["A"];
	if (verbose) 
	  cat("Random signature prediction ", i, " : ", res, "\n");
  }
  p.value <- numeric(2);
  names(p.value) <- c("F", "A");
  p.value["F"] <- sum(perm.F > true.F)/nrep;
  p.value["A"] <- sum(perm.A > true.A)/nrep;
  return(p.value);
}


#################################
#  Utility functions
#################################


######################################################################
# Function to generate data for the stratified cross-validation.
# Input:
# examples : indices of the examples (a vector of integer)
# positives: vector of integer. Indices of the positive examples. The indices refer to the indices of examples
# k : number of folds (def = 5)
# seed : seed of the random generator (def=0). If is set to 0 no initiazitation is performed
# Ouptut:
# a list with 2 two components
#   - fold.non.positives : a list with k components. Each component is a vector with the indices of the non positive elements of the fold
#   - fold.positives : a list with k components. Each component is a vector with the indices of the positive elements of the fold
# N.B.: in both elements indices refer to the values of the examples vector	 	 
do.stratified.cv.data <- function(examples, positives, k=5, seed=NULL) {
  
  if (seed!=0)
     set.seed(seed);
  fold.non.positives <- fold.positives <- list();
  for (i in 1:k) {
    fold.non.positives[[i]] <- integer(0);
    fold.positives[[i]] <- integer(0);
  }
  # examples <- 1:n;
  non.positives <- setdiff(examples,positives);
  # non.positives <- examples[-positives];
  non.positives <- sample(non.positives);
  positives <- sample(positives);
  n.positives <- length(positives);
  resto.positives <- n.positives%%k;
  n.pos.per.fold <- (n.positives - resto.positives)/k;
  n.non.positives <- length(non.positives);
  resto.non.positives <- n.non.positives%%k;
  n.non.pos.per.fold <- (n.non.positives - resto.non.positives)/k;
  j=1; 
  if (n.non.pos.per.fold > 0)
    for (i in 1:k) {
      fold.non.positives[[i]] <- non.positives[j:(j+n.non.pos.per.fold-1)];
      j <- j + n.non.pos.per.fold;
    }
  j.pos=1;  
  if (n.pos.per.fold > 0)
    for (i in 1:k) {
      fold.positives[[i]] <- positives[j.pos:(j.pos+n.pos.per.fold-1)];
      j.pos <- j.pos + n.pos.per.fold;
    }
  
  if (resto.non.positives > 0)
    for (i in k:(k-resto.non.positives+1)) {
      fold.non.positives[[i]] <- c(fold.non.positives[[i]], non.positives[j]);
      j <- j + 1;
    }
  
  if (resto.positives > 0) 
    for (i in 1:resto.positives) {
      fold.positives[[i]] <- c(fold.positives[[i]], positives[j.pos]);
      j.pos <- j.pos + 1;
    }
  
  return(list(fold.non.positives=fold.non.positives, fold.positives=fold.positives));
}


# Function to compute the F-measure for a single class
# Input
# pred : factor or vector of the predicted labels (1 stands for positve, 0 for negative)
# labels : factor or vector of the true labels (1 stands for positve, 0 for negative)
# Note that 0 level stands for negative and 1 for positive.
# In general the first level is negative and the second positive
# Output:
# a named vector with five elements:
# P: precision
# R : recall (sensitivity)
# S : specificity
# F : F measure
# A : 0/1 accuracy
F.measure.single <- function(pred,labels) {
      
      if (length(pred)!=length(labels))
         stop("F.measure: lengths of true and predicted labels do not match.");
      res <- numeric(5);
      names(res) <- c("P", "R", "S", "F", "A");
      pred.0 <- which(pred==0);
      pred.1 <- which(pred==1);
      labels.0<- which(labels==0);
      labels.1<- which(labels==1);
      TN <- length(intersect(pred.0, labels.0));
      TP <- length(intersect(pred.1, labels.1));
      FN <- length(intersect(pred.0, labels.1));
      FP <- length(intersect(pred.1, labels.0));
           
      acc <- (TP+TN)/length(labels);
      if ((TP+FP) == 0)
        precision <- 0
      else 
        precision <- TP/(TP+FP);
      if ((TP+FN) == 0)
        recall <- 0
      else
        recall <- TP/(TP+FN);
      if ((TN+FP) == 0)
        specificity <- 0
      else
        specificity <- TN/(TN+FP);
      if ((precision+recall) == 0)
         F <- 0
      else
         F = 2 *(precision*recall) / (precision+recall); 
      res <- c(precision,recall,specificity,F,acc);
      names(res) <- c("P", "R", "S", "F", "A");
      return (res);
}


# Function to compute the best F score for a single class.
# Input:
# scores : vector of the predicted scores 
# lab : vector of the true labels: 1 positive, 0 negative.
# Output:
# a named vector with six elements:
# P: precision
# R : recall (sensitivity)
# S : specificity
# F : F measure
# A : 0/1 accuracy
# T : best threshold
Fmax <- function (scores, lab) {
   max.F=0;
   best=0;
   unique.scores <- unique(scores); 
   for (t in unique.scores) {
      pred <- ifelse(scores>=t,1,0);
      res <- F.measure.single(pred,lab);
      if (res["F"] > max.F) {
        max.F <- res["F"];
        best <- c(res,t);     
      }  
   }
   names(best)[length(best)] <- "T";
   return(best);
}

# Function to compute the best Accruacy for a single class.
# Input:
# scores : vector of the predicted scores 
# lab : vector of the true labels: 1 positive, 0 negative.
# Output:
# a named vector with six elements:
# P: precision
# R : recall (sensitivity)
# S : specificity
# F : F measure
# A : 0/1 accuracy
# T : best threshold
Accmax <- function (scores, lab) {
   max.Acc=0;
   best=0;
   unique.scores <- unique(scores); 
   for (t in unique.scores) {
      pred <- ifelse(scores>=t,1,0);
      res <- F.measure.single(pred,lab);
      if (res["A"] > max.Acc) {
        max.Acc <- res["A"];
        best <- c(res,t);     
      }  
   }
   names(best)[length(best)] <- "T";
   return(best);
}