# Random forest library for processing genetic data # May 2016 # October 2016 # March 2017 # Last updated 2021 library(randomForest); # random forest cross-validation # Input: # data : a data frame or matrix with the data # y : a factor with the labels. 0:negative class, 1: positive class. # kk : number of folds (def: 5) # ntree : number of trees of the rf (def: 500) # mtry : number of the features to be randomly selected by the rf, If 0 (def) it is the square root of the number of the features. # seed : initialization seed for the random generator ( if set to 0(def.) no inizialization is performed) # model.file : name of the file where the cross-validated random forest models will be saved. If file=="" (def.) no model is saved. # verbose : logical. If TRUE messages about the status of the cross validation are displayed (def: FALSE); # Output: # a vector with the cross-validated random forest probabilities. rf.cv <- function (data, y, kk=5, ntree=500, mtry=round(sqrt(ncol(data))), seed = 0, model.file="", verbose=FALSE) { n.data <- nrow(data); n.col <- ncol(data); indices.positives <- which(y == 1) ; prob <- numeric(n.data); names(prob) <- rownames(data); rf.list <- vector(mode="list", kk); folds <- do.stratified.cv.data(1:n.data, indices.positives, k=kk, seed=seed); for (i in 1:kk) { # preparation of the training and test data ind.test <- c(folds$fold.positives[[i]], folds$fold.non.positives[[i]]); ind.pool.pos <- integer(0); ind.pool.neg <- integer(0); for (j in 1:kk) if (j!=i) { ind.pool.pos <- c(ind.pool.pos, folds$fold.positives[[j]]); ind.pool.neg <- c(ind.pool.neg, folds$fold.non.positives[[j]]); } data.train <- data[c(ind.pool.pos, ind.pool.neg),]; y.train <- as.factor(c(rep(1, length(ind.pool.pos)), rep (0, length(ind.pool.neg)))); mtry.curr=mtry; if ((mtry == 0) || (mtry > n.col)) { mtry.curr <- round(sqrt(n.col)); if (mtry.curr<1) mtry.curr <- 1; if (verbose) cat("Automatic selection of mtry = ", mtry.curr, "\n"); } # training if (verbose) cat("Starting training on Fold ", i, "...\n"); rf <- randomForest(data.train, y.train, ntree = ntree, mtry = mtry.curr); rm(data.train); gc(); # test data.test <- data[ind.test,]; if (verbose) cat("Starting test on Fold ", i, "...\n"); prob[ind.test] <- predict(rf, data.test, type="prob")[,"1"]; if (verbose) cat("End test on Fold ", i, "\n"); rm(data.test); if (model.file=="") rm(rf) else rf.list[[i]] <- rf; gc(); if (verbose) cat("Fold ", i, " done -----\n"); } if (model.file != "") save(rf.list, file=model.file); return(prob); } # random forest cross-validation with univariate correlation-based feature selection # Input: # data : a data frame or matrix with the data # y : a factor with the labels. 0:negative class, 1: positive class. # kk : number of folds (def: 5) # ntree : number of trees of the rf (def: 500) # mtry : number of the features to be randomly selected by the rf, If 0 (def) it is the square root of the number of the selected features mutliplied by eta # eta : multiplicative factor for the default number of tandomly selected features. # seed : initialization seed for the random generator ( if set to 0 (def.) no inizialization is performed) # n.feature : number of the features to be selected in the training set. If 0 (def) the number of features is determined by the corr parameter # corr : absolute value of the correlation to seelct the features. Only features having absolute correlation equal or larger than corr # are selected. Default value is 0.2. # model.file : name of the file where the cross-validated random forest models will be saved. If file=="" (def.) no model is saved. # save.features : logical. If TRUE (default is FALSE), the features selected at each fold of the CV are saved. # verbose : logical. If TRUE messages about the status of the cross validation are displayed (def: FALSE); # Output: # a list with three components: # prob: a vector with the cross-validated random forest probabilities. # label : predicted classes (1 for positives 0 for negatives) # features : a list with kk components. Each component is a numeric vector storing the indices of the features selected at each step of the CV rf.corr.cv <- function (data, y, kk=10, ntree=500, mtry=0, eta=1, n.feature=0, corr=0.2, seed = 0, model.file="", save.features=FALSE, verbose=FALSE) { n.data <- nrow(data); indices.positives <- which(y == 1) ; prob <- numeric(n.data); pred.label <- factor(character(n.data), levels=c("1","0")); names(prob) <- names(pred.label) <- rownames(data); rf.list <- vector(mode="list", kk); selected.features <- vector("list", kk); # list storing the indices of the features selected at each step of the CV folds <- do.stratified.cv.data(1:n.data, indices.positives, k=kk, seed=seed); # find features with sd=0 and remove them from the data xx <- apply(data,2,sd); which0 <- which(xx==0); if (length(which0)>0) data <- data[,-which0]; rm(xx); for (i in 1:kk) { # preparation of the training and test data ind.test <- c(folds$fold.positives[[i]], folds$fold.non.positives[[i]]); ind.pool.pos <- integer(0); ind.pool.neg <- integer(0); for (j in 1:kk) if (j!=i) { ind.pool.pos <- c(ind.pool.pos, folds$fold.positives[[j]]); ind.pool.neg <- c(ind.pool.neg, folds$fold.non.positives[[j]]); } data.train <- data[c(ind.pool.pos, ind.pool.neg),]; y.train <- as.factor(c(rep(1, length(ind.pool.pos)), rep (0, length(ind.pool.neg)))); # selection of the best features # check whether some features have 0 standard deviation #if (length(which0)>0) # data.train <- data.train[,-which0]; # computing correlation yy <- ifelse(y.train==1,1,0); yy <- matrix (yy,ncol=1); res <- cor(yy,data.train); SNP.names <- colnames(data.train); res <- as.numeric(res); names(res) <- SNP.names; if (n.feature > 0) ind.selected <- order(abs(res), decreasing=TRUE)[1:n.feature] else ind.selected <- c(which(res>=corr), which(res <= -corr)); if (save.features) selected.features[[i]] <- ind.selected; # training if (verbose) cat("Starting training on Fold ", i, "...\n"); n.selected.features <- length(ind.selected); mtry.curr=mtry; if ((mtry == 0) || (mtry > n.selected.features)) { mtry.curr <- round(eta*sqrt(n.selected.features)); if (mtry.curr<1) mtry.curr <- 1; if (verbose) cat("Automatical selection of mtry = ", mtry.curr, "\n"); } rf <- randomForest(data.train[, ind.selected], y.train, ntree = ntree, mtry = mtry.curr); rm(data.train); gc(); # test data.test <- data[ind.test,]; #if (length(which0)>0) # removing features with sd=0 in the training set # data.test <- data.test[,-which0]; if (verbose) cat("Starting test on Fold ", i, "...\n"); prob[ind.test] <- predict(rf, data.test[, ind.selected], type="prob")[,"1"]; pred.label[ind.test] <- predict(rf, data.test[, ind.selected], type="response"); # computing AUROC and AUPRC on the current fold if (verbose) { digits=4; lab=ifelse(y==1,1,0); lab <- lab[ind.test]; sscurves <- evalmod(scores = prob[ind.test], labels = lab); m<-attr(sscurves,"auc",exact=FALSE); AUPRC <- round(m[2,"aucs"],digits); AUROC <- round(m[1,"aucs"],digits); cat ("AUROC = ", AUROC, "\n"); cat ("AUPRC = ", AUPRC, "\n"); } if (verbose) cat("End test on Fold ", i, "\n"); rm(data.test); if (model.file=="") rm(rf) else rf.list[[i]] <- rf; gc(); if (verbose) cat("Fold ", i, " done -----\n"); } if (model.file != "") save(rf.list, file=model.file); return(list(prob=prob, label=pred.label, features=selected.features)); } # random forest cross-validation with univariate t-test-based feature selection # Input: # data : a data frame or matrix with the data # y : a factor with the labels. 0:negative class, 1: positive class. # kk : number of folds (def: 5) # ntree : number of trees of the rf (def: 500) # mtry : number of the features to be randomly selected by the rf, If 0 (def) it is the square root of the number of the selected features # seed : initialization seed for the random generator ( if set to 0 (def.) no inizialization is performed) # n.feature : number of the features to be selected in the training set. If 0 (def) the number of features is determned by the p.value parameter # p.value.thresh : Only features having p.value from the t-test lower than p.value.thresh are selected. Default value is 0.001. # model.file : name of the file where the cross-validated random forest models will be saved. If file=="" (def.) no model is saved. # save.features : logical. If TRUE (default is FALSE), hte features selected at each fold of the CV are saved. # verbose : logical. If TRUE messages about the status of the cross validation are displayed (def: FALSE); # Output: # a vector with the cross-validated random forest probabilities. rf.ttest.cv <- function (data, y, kk=10, ntree=500, mtry=0, n.feature=0, p.value=0.001, seed = 0, model.file="", save.features=FALSE, verbose=FALSE) { MIN.SELECTED.FEATURES=5; n.data <- nrow(data); indices.positives <- which(y == 1) ; prob <- numeric(n.data); names(prob) <- rownames(data); rf.list <- vector(mode="list", kk); folds <- do.stratified.cv.data(1:n.data, indices.positives, k=kk, seed=seed); for (i in 1:kk) { # preparation of the training and test data ind.test <- c(folds$fold.positives[[i]], folds$fold.non.positives[[i]]); ind.pool.pos <- integer(0); ind.pool.neg <- integer(0); for (j in 1:kk) if (j!=i) { ind.pool.pos <- c(ind.pool.pos, folds$fold.positives[[j]]); ind.pool.neg <- c(ind.pool.neg, folds$fold.non.positives[[j]]); } data.train <- data[c(ind.pool.pos, ind.pool.neg),]; y.train <- as.factor(c(rep(1, length(ind.pool.pos)), rep (0, length(ind.pool.neg)))); # selection of the best features # check whether some features have 0 standard deviation xx <- apply(data.train,2,sd); which0 <- which(xx==0); if (length(which0)>0) data.train <- data.train[,-which0]; # computing t.test yy <- ifelse(y.train==1,1,0); n.feat <- ncol(data.train); res <- numeric(n.feat); names(res) <- colnames(data.train); res <- apply(data.train, 2, function(x) { t.test(x[yy==1], x[yy==0])$p.value }); #for (j in 1:n.feat) # res[j] <- t.test(data.train[,j][yy==1], data.train[,j][yy==0])$p.value; if (n.feature > 0) ind.selected <- order(res, decreasing=FALSE)[1:n.feature] else ind.selected <- which(res n.selected.features)) { mtry.curr <- round(sqrt(n.selected.features)); if (mtry.curr<1) mtry.curr <- 1; if (verbose) cat("Automatical selection of mtry = ", mtry.curr, "\n"); } rf <- randomForest(data.train[, ind.selected], y.train, ntree = ntree, mtry = mtry.curr); rm(data.train); gc(); # test data.test <- data[ind.test,]; if (length(which0)>0) # removing features with sd=0 in the training set data.test <- data.test[,-which0]; if (verbose) cat("Starting test on Fold ", i, "...\n"); p <- predict(rf, data.test[, ind.selected], type="prob")[,"1"]; # computing AUROC and AUPRC on the current fold digits=4; lab=ifelse(y==1,1,0); lab <- lab[ind.test]; sscurves <- evalmod(scores = p, labels = lab); m<-attr(sscurves,"auc",exact=FALSE); AUPRC <- round(m[2,"aucs"],digits); AUROC <- round(m[1,"aucs"],digits); cat ("AUROC = ", AUROC, "\n"); cat ("AUPRC = ", AUPRC, "\n"); prob[ind.test] <- p; if (verbose) cat("End test on Fold ", i, "\n"); rm(data.test); if (model.file=="") rm(rf) else rf.list[[i]] <- rf; gc(); if (verbose) cat("Fold ", i, " done -----\n"); } if (model.file != "") save(rf.list, file=model.file); return(prob); } # random forest leave-one-out cross-validation with univariate correlation-based feature selection # Input: # data : a data frame or matrix with the data # y : a factor with the labels. 0:negative class, 1: positive class. # ntree : number of trees of the rf (def: 500) # mtry : number of the features to be randomly selected by the rf, If 0 (def) it is the square root of the number of the selected features # n.feature : number of the features to be selected in the training set. If 0 (def) the number of features is determned by the corr parameter # corr : absolute value of the correlation to select the features. Only features having absolute correlation equal or larger than corr # are selected. Default value is 0.2. # model.file : name of the file where the cross-validated random forest models will be saved. If file=="" (def.) no model is saved. # save.features : logical. If TRUE (default is FALSE), the features selected at each fold of the CV are saved (not implemented) # verbose : logical. If TRUE messages about the status of the cross validation are displayed (def: FALSE); # Output: # a list with two components: # prob: a vector with the cross-validated random forest probabilities. # pred : predicted classes (1 for positives 0 for negatives) rf.corr.loo <- function (data, y, ntree=500, mtry=0, n.feature=0, corr=0.2, model.file="", save.features=FALSE, verbose=FALSE) { n.data <- nrow(data); prob <- numeric(n.data); pred.label <- factor(character(n.data), levels=c("1","0")); names(prob) <- rownames(data); rf.list <- vector(mode="list", n.data); for (i in 1:n.data) { # preparation of the training and test data data.test <- data[i,]; data.test <- matrix(data.test, nrow=1); data.train <- data[-i,]; y.train <- y[-i]; # selection of the best features # check whether some features have 0 standard deviation xx <- apply(data.train,2,sd); which0 <- which(xx==0); if (length(which0)>0) { data.train <- data.train[,-which0]; data.test<-matrix(data.test[, -which0],nrow=1); } # computing correlation yy <- ifelse(y.train==1,1,0); yy <- matrix (yy,ncol=1); res <- cor(yy,data.train); SNP.names <- colnames(data.train); res <- as.numeric(res); names(res) <- SNP.names; if (n.feature > 0) ind.selected <- order(abs(res), decreasing=TRUE)[1:n.feature] else ind.selected <- c(which(res>=corr), which(res <= -corr)); # training if (verbose) cat("Starting training on Fold ", i, "...\n"); n.selected.features <- length(ind.selected); mtry.curr=mtry; if ((mtry == 0) || (mtry > n.selected.features)) { mtry.curr <- round(sqrt(n.selected.features)); if (mtry.curr<1) mtry.curr <- 1; if (verbose) cat("Automatical selection of mtry = ", mtry.curr, "\n"); } rf <- randomForest(data.train[, ind.selected], y.train, ntree = ntree, mtry = mtry.curr); rm(data.train); gc(); # test if (verbose) cat("Starting test on Fold ", i, "...\n"); prob[i] <- predict(rf, data.test[, ind.selected], type="prob")[,"1"]; pred.label[i] <- predict(rf, data.test[, ind.selected], type="response"); # computing AUROC and AUPRC on the current fold #digits=4; #lab=ifelse(y==1,1,0); #lab <- lab[ind.test]; #sscurves <- evalmod(scores = p, labels = lab); #m<-attr(sscurves,"auc",exact=FALSE); #AUPRC <- round(m[2,"aucs"],digits); #AUROC <- round(m[1,"aucs"],digits); #cat ("AUROC = ", AUROC, "\n"); #cat ("AUPRC = ", AUPRC, "\n"); if (verbose) { cat("Example ", i, " : label = ", 2-as.numeric(y[i]), ", label pred = ", 2-as.numeric(pred.label[i]), ", score = ", prob[i], "\n"); cat("End test on Fold ", i, "\n"); } rm(data.test); if (model.file=="") rm(rf) else rf.list[[i]] <- rf; gc(); if (verbose) cat("Fold ", i, " done -----\n"); } if (model.file != "") save(rf.list, file=model.file); return(list(prob=prob, label=pred.label)); } ###############################################################################################à ############################################################################################### # random forest cross-validation with embedded variable importance feature selection. # Selection of the features is performed by internal oob (oob on the training data). # Input: # data : a data frame or matrix with the data # y : a factor with the labels. 0:negative class, 1: positive class. # kk : number of folds (def: 10) # ntree : number of trees of the rf (def: 500) # mtry : number of the features to be randomly selected by the rf, If 0 (def) it is the square root of the number of the selected features mutliplied by eta # eta : multiplicative factor for the default number of randomly selected features. # sel.type : integer. If 1 the mean decrease in accuracy is used, if 2 (def) the mean decrease in node impurity (Gini index) is used instead. # (1=mean decrease in accuracy, 2=mean decrease in node impurity) # n.feature : number of the features to be selected in the training set through the variable importance measure (def.= 1000) # seed : initialization seed for the random generator ( if set to 0 (def.) no inizialization is performed) # model.file : name of the file where the cross-validated random forest models will be saved. If file=="" (def.) no model is saved. # save.features : logical. If TRUE (default is FALSE), the features selected at each fold of the CV are saved. # verbose : logical. If TRUE messages about the status of the cross validation are displayed (def: FALSE); # Output: # a list with three components: # prob: a vector with the cross-validated random forest probabilities. # label : predicted classes (1 for positives 0 for negatives) # features : a list with kk components. Each component is a numeric vector storing the indices of the features selected at each step of the CV rf.vi.oob.cv <- function (data, y, kk=10, ntree=500, mtry=0, eta=1, sel.type=2, n.feature=1000, seed = 0, model.file="", save.features=FALSE, verbose=FALSE) { n.data <- nrow(data); indices.positives <- which(y == 1) ; prob <- numeric(n.data); pred.label <- factor(character(n.data), levels=c("1","0")); names(prob) <- names(pred.label) <- rownames(data); rf.list <- vector(mode="list", kk); selected.features <- vector("list", kk); # list storing the indices of the features selected at each step of the CV folds <- do.stratified.cv.data(1:n.data, indices.positives, k=kk, seed=seed); for (i in 1:kk) { # preparation of the training and test data ind.test <- c(folds$fold.positives[[i]], folds$fold.non.positives[[i]]); ind.pool.pos <- integer(0); ind.pool.neg <- integer(0); for (j in 1:kk) if (j!=i) { ind.pool.pos <- c(ind.pool.pos, folds$fold.positives[[j]]); ind.pool.neg <- c(ind.pool.neg, folds$fold.non.positives[[j]]); } data.train <- data[c(ind.pool.pos, ind.pool.neg),]; y.train <- as.factor(c(rep(1, length(ind.pool.pos)), rep (0, length(ind.pool.neg)))); tot.feature <- ncol(data.train); # selection of the best features using variable importance measures # mtry setting for oob on the training set if (verbose) cat("Starting oob estimation on Fold ", i, "...\n"); mtry.curr=mtry; if ((mtry == 0) || (mtry > tot.feature)) { mtry.curr <- round(eta*sqrt(tot.feature)); if (mtry.curr<1) mtry.curr <- 1; if (verbose) cat("Automatical selection of mtry for oob selection = ", mtry.curr, "\n"); } rf.oob <- randomForest(data.train, y.train, ntree = ntree, mtry = mtry.curr, importance=T); # computing variable importance and selecting the top variables if (verbose) { cat("OOB error estimate = ", rf.oob$err[ntree,1], "\n\n"); cat("Computing variable importance on Fold ", i, "...\n"); } var.imp <- importance(rf.oob, type=sel.type); var.imp <- as.numeric(var.imp); sorted.indices <- order(var.imp,decreasing = T); indices.selected <- sorted.indices[1:n.feature]; data.train.sel <- data.train[, indices.selected]; rm(data.train, var.imp); gc(); if (save.features) selected.features[[i]] <- indices.selected; # training if (verbose) cat("Starting training on Fold ", i, "...\n"); n.selected.features <- length(indices.selected); # setting of mtry for training mtry.curr=mtry; if ((mtry == 0) || (mtry > n.selected.features)) { mtry.curr <- round(eta*sqrt(n.selected.features)); if (mtry.curr<1) mtry.curr <- 1; if (verbose) cat("Automatical selection of mtry = ", mtry.curr, "\n"); } rf <- randomForest(data.train.sel, y.train, ntree = ntree, mtry = mtry.curr); # test data.test <- data[ind.test, indices.selected]; #if (length(which0)>0) # removing features with sd=0 in the training set # data.test <- data.test[,-which0]; if (verbose) cat("Starting test on Fold ", i, "...\n"); prob[ind.test] <- predict(rf, data.test, type="prob")[,"1"]; pred.label[ind.test] <- predict(rf, data.test, type="response"); # computing AUROC and AUPRC on the current fold if (verbose) { digits=4; lab=ifelse(y==1,1,0); lab <- lab[ind.test]; sscurves <- evalmod(scores = prob[ind.test], labels = lab); m<-attr(sscurves,"auc",exact=FALSE); AUPRC <- round(m[2,"aucs"],digits); AUROC <- round(m[1,"aucs"],digits); cat ("AUROC = ", AUROC, "\n"); cat ("AUPRC = ", AUPRC, "\n"); } if (verbose) cat("End test on Fold ", i, "\n"); rm(data.test); if (model.file=="") rm(rf) else rf.list[[i]] <- rf; gc(); if (verbose) cat("Fold ", i, " done -----\n"); } if (model.file != "") save(rf.list, file=model.file); return(list(prob=prob, label=pred.label, features=selected.features)); } ####################################################################################### #### Stable feature selection methods ## ####################################################################################### # Stable selection of features with correlation based univariate method # The most selected features at each cross-validation are selected. # For the selection both training ad test data can be used. # If training data are used, at each iteration only 1 fold is left out; if test data are used only the left out fold is used at each iteration # Input: # data : a data frame or matrix with the data: rows are examples and columns the features # y : a factor with the labels. 0:negative class, 1: positive class. # kk : number of folds (def: 10) # rep : number of repetitions of the cross validation # top.feature : value indicating the number of top ranked features to be selected # train : boolean. If TRUE (def.) trainnig data are used for feature selection, otherwise test data (that is only the left out fold) # min.frequency : minimum frequency a feature needs to be selected (a value between 0 and 1, def=1) # verbose : logical. If TRUE messages about the status of the cross validation are displayed (def: FALSE); # seed : initialization seed for the random generator ( if set to 0 (def.) no inizialization is performed) # Output: # a list woth two componenents: # ind.selected: a vector with the indices of the selected features # scores : a vector with the corresponiding scores stable.fs.corr <- function(data, y, kk=10, rep=1, top.feature=1000, min.frequency=1, seed = 0, train=TRUE, verbose=FALSE) { n.data <- nrow(data); n.features <- ncol(data); indices.positives <- which(y == 1) ; freq <- numeric(n.features); # vector collecting the number of times each feature is selected names(freq) <- colnames(data); for (r in 1:rep) { if (verbose) cat("Starting feature selection repetition ", r, "\n"); folds <- do.stratified.cv.data(1:n.data, indices.positives, k=kk, seed=seed+r-1); for (i in 1:kk) { # preparation of the training data ind.out <- c(folds$fold.positives[[i]], folds$fold.non.positives[[i]]); if (train) { data.train <- data[- ind.out,]; y.train <- y[-ind.out]; } else { data.train <- data[ind.out,]; y.train <- y[ind.out]; } # selection of the best features through correlation if (verbose) cat("Starting feature selection on Fold ", i, "\n"); yy <- ifelse(y.train==1,1,0); yy <- matrix (yy,ncol=1); res <- cor(yy,data.train); ind.selected <- order(abs(res), decreasing=TRUE)[1:top.feature]; freq[ind.selected] <- freq[ind.selected] + 1; } } ind.selected <- which(freq >= (min.frequency*kk*rep)); names(ind.selected) <- names(freq)[ind.selected]; if (length(ind.selected) < 4) warning("Too few features selected \n"); if (verbose) cat ("Selected ", length(ind.selected), " features \n"); scores <- freq[ind.selected]/(kk*rep); names(scores) <- names(ind.selected); return(list(ind.selected=ind.selected, scores=scores)); } # random forest cross-validation with feature selection performed through "stable" selection of features with correlation based univariate methods # Input: # data : a data frame or matrix with the data # y : a factor with the labels. 0:negative class, 1: positive class. # kk : number of folds (def: 5) # rep : number of repetitions of the cross validation for the feature selection # ntree : number of trees of the rf (def: 500) # mtry : number of the features to be randomly selected by the rf, If 0 (def) it is the square root of the number of the selected features mutliplied by eta # eta : multiplicative factor for the default number of randomly selected features. # seed : initialization seed for the random generator ( if set to 0 (def.) no inizialization is performed) # top.feature : value indicating the number of top ranked features to be selected # min.frequency : minimum frequency that a features needs to be selected (a value betwee 0 and 1, def.=1) # model.file : name of the file where the cross-validated random forest models will be saved. If file=="" (def.) no model is saved. # train : boolean. If TRUE (def.) trainnig data are used for feature selection, otherwise test data (that is only the left out fold) # verbose : logical. If TRUE messages about the status of the cross validation are displayed (def: FALSE); # Output: # a list with three components: # prob: a vector with the cross-validated random forest probabilities. # label : predicted classes (1 for positives 0 for negatives) # features : a vector with the indices of the selected features rf.stable.fs.corr.cv <- function (data, y, kk=10, rep=1, ntree=500, mtry=0, eta=1, top.feature=1000, min.frequency=1, seed = 0, model.file="", train=TRUE, verbose=FALSE) { n.data <- nrow(data); indices.positives <- which(y == 1) ; prob <- numeric(n.data); pred.label <- factor(character(n.data), levels=c("1","0")); names(prob) <- names(pred.label) <- rownames(data); rf.list <- vector(mode="list", kk); # stable feature selection ind.selected <- stable.fs.corr(data, y, kk=kk, rep=rep, top.feature=top.feature, min.frequency=min.frequency, seed = seed, train=train, verbose=verbose); # cross-validation with the selected features folds <- do.stratified.cv.data(1:n.data, indices.positives, k=kk, seed=seed); for (i in 1:kk) { # preparation of the training and test data ind.test <- c(folds$fold.positives[[i]], folds$fold.non.positives[[i]]); ind.pool.pos <- integer(0); ind.pool.neg <- integer(0); for (j in 1:kk) if (j!=i) { ind.pool.pos <- c(ind.pool.pos, folds$fold.positives[[j]]); ind.pool.neg <- c(ind.pool.neg, folds$fold.non.positives[[j]]); } data.train <- data[c(ind.pool.pos, ind.pool.neg),]; y.train <- as.factor(c(rep(1, length(ind.pool.pos)), rep (0, length(ind.pool.neg)))); # training if (verbose) cat("Starting training on Fold ", i, "...\n"); n.selected.features <- length(ind.selected); mtry.curr=mtry; if ((mtry == 0) || (mtry > n.selected.features)) { mtry.curr <- round(eta*sqrt(n.selected.features)); if (mtry.curr<1) mtry.curr <- 1; if (verbose) cat("Automatic selection of mtry = ", mtry.curr, "\n"); } rf <- randomForest(data.train[, ind.selected], y.train, ntree = ntree, mtry = mtry.curr); rm(data.train); gc(); # test data.test <- data[ind.test,]; #if (length(which0)>0) # removing features with sd=0 in the training set # data.test <- data.test[,-which0]; if (verbose) cat("Starting test on Fold ", i, "...\n"); prob[ind.test] <- predict(rf, data.test[, ind.selected], type="prob")[,"1"]; pred.label[ind.test] <- predict(rf, data.test[, ind.selected], type="response"); # computing AUROC and AUPRC on the current fold if (verbose) { digits=4; lab=ifelse(y==1,1,0); lab <- lab[ind.test]; sscurves <- evalmod(scores = prob[ind.test], labels = lab); m<-attr(sscurves,"auc",exact=FALSE); AUPRC <- round(m[2,"aucs"],digits); AUROC <- round(m[1,"aucs"],digits); cat ("AUROC = ", AUROC, "\n"); cat ("AUPRC = ", AUPRC, "\n"); } if (verbose) cat("End test on Fold ", i, "\n"); rm(data.test); if (model.file=="") rm(rf) else rf.list[[i]] <- rf; gc(); if (verbose) cat("Fold ", i, " done -----\n"); } if (model.file != "") save(rf.list, file=model.file); return(list(prob=prob, label=pred.label, features=ind.selected)); } ####################################################################################### #### Permutation tests ## ####################################################################################### # Non parametric permutation test to asses the significance of the prediction # This function evaluates the statistical significance of the predictions through a non parameteric permutation of the labels. # Labels are permuted n times and each time the cross-validated accuracy and F-score performance are compared with the perfromance achieved with the true labels. # A p-value is computed as the frequency for which a prediction obtained with a random permutation is equal or better than the prediction obtained with the tru labels # Input: # data : a data frame or matrix with the data # y : a factor with the labels. 0:negative class, 1: positive class. # kk : number of folds (def: 5) # nperm : number of permutations # ntree : number of trees of the rf (def: 500) # mtry : number of the features to be randomly selected by the rf. If 0 (def) is the square root of the number of features # seed : initialization seed for the random generator ( if set to 0(def.) no inizialization is performed) # verbose : logical. If TRUE messages about the status of the cross validation are displayed (def: FALSE); # Output: # a vector with two elements: the p.value computed with respect to the F-score and the p-value computed with respect to the accuracy. permutation.test.rf.cv <- function (data, y, kk=5, nperm=100, ntree=500, mtry=0, seed = 0, verbose=FALSE) { # computing the accuracy and F-score with the true labels true.prob <- rf.cv (data=data, y=y, kk=kk, ntree=ntree, mtry=mtry, seed = seed, verbose=FALSE); pred=ifelse(true.prob>=0.5,1,0); lab=ifelse(y==1,1,0); res <- F.measure.single(pred,lab); true.F <- res["F"]; true.A <- res["A"]; if (verbose) cat("True prediction : ", res, "\n"); # computing the accuracy and F-score with the permuted labels perm.F <- perm.A <- numeric(nperm); for (i in 1:nperm) { y.perm <- sample(y); perm.prob <- rf.cv (data=data, y=y.perm, kk=kk, ntree=ntree, mtry=mtry, seed = seed, verbose=FALSE); pred=ifelse(perm.prob>=0.5,1,0); lab=ifelse(y.perm==1,1,0); res <- F.measure.single(pred,lab); perm.F[i] <- res["F"]; perm.A[i] <- res["A"]; if (verbose) cat("Permuted prediction ", i, " : ", res, "\n"); } p.value <- numeric(2); names(p.value) <- c("F", "A"); p.value["F"] <- sum(perm.F > true.F)/nperm; p.value["A"] <- sum(perm.A > true.A)/nperm; return(p.value); } # Non parametric permutation test to assess the significance of a given signature # This function evaluates the statistical significance of a given signature (subset of SNPs) # Given a signature S with cardinality n=|S|, a cross-validation is performed other m times using each time a set of SNPs S' with |S'| = n randomly extracted from the full set of the signature. # Then the cross-validated accuracy and F-score performance obtained with the random signatures are compared with the perfromance achieved with the true signature. # A p-value is computed as the frequency for which a prediction obtained with a random signature is equal or better than the prediction obtained with the true signature # Input: # data : a data frame or matrix with the data # y : a factor with the labels. 0:negative class, 1: positive class. # kk : number of folds (def: 5) # ind.signature: vector of the indices of the features (columns of data) corresponding to the true signature. # nrep : number of repetitions of the random signature # ntree : number of trees of the rf (def: 500) # mtry : number of the features to be randomly selected by the rf (def: square root of the number of features) # seed : initialization seed for the random generator ( if set to 0(def.) no inizialization is performed) # verbose : logical. If TRUE messages about the status of the cross validation are displayed (def: FALSE); # Output: # a vector with two elements: the p.value computed with respect to the F-score and the p-value computed with respect to the accuracy. signature.permutation.test.rf.cv <- function (data, y, kk=5, ind.signature, nrep=100, ntree=500, mtry=0, seed = 0, verbose=FALSE) { # computing the accuracy and F-score with the true signature data.true <- data[, ind.signature]; true.prob <- rf.cv (data=data.true, y=y, kk=kk, ntree=ntree, mtry=mtry, seed = seed, verbose=FALSE); pred=ifelse(true.prob>=0.5,1,0); lab=ifelse(y==1,1,0); res <- F.measure.single(pred,lab); true.F <- res["F"]; true.A <- res["A"]; if (verbose) cat("True signature prediction : ", res, "\n"); # computing the accuracy and F-score with the random signatures n.feat <- ncol(data); n.feat.sign <- length(ind.signature); perm.F <- perm.A <- numeric(nrep); for (i in 1:nrep) { rand.sign <- sample(1:n.feat, n.feat.sign); rand.prob <- rf.cv (data=data[,rand.sign], y=y, kk=kk, ntree=ntree, mtry=mtry, seed = seed, verbose=FALSE); pred=ifelse(rand.prob>=0.5,1,0); res <- F.measure.single(pred,lab); perm.F[i] <- res["F"]; perm.A[i] <- res["A"]; if (verbose) cat("Random signature prediction ", i, " : ", res, "\n"); } p.value <- numeric(2); names(p.value) <- c("F", "A"); p.value["F"] <- sum(perm.F > true.F)/nrep; p.value["A"] <- sum(perm.A > true.A)/nrep; return(p.value); } ################################# # Utility functions ################################# ###################################################################### # Function to generate data for the stratified cross-validation. # Input: # examples : indices of the examples (a vector of integer) # positives: vector of integer. Indices of the positive examples. The indices refer to the indices of examples # k : number of folds (def = 5) # seed : seed of the random generator (def=0). If is set to 0 no initiazitation is performed # Ouptut: # a list with 2 two components # - fold.non.positives : a list with k components. Each component is a vector with the indices of the non positive elements of the fold # - fold.positives : a list with k components. Each component is a vector with the indices of the positive elements of the fold # N.B.: in both elements indices refer to the values of the examples vector do.stratified.cv.data <- function(examples, positives, k=5, seed=NULL) { if (seed!=0) set.seed(seed); fold.non.positives <- fold.positives <- list(); for (i in 1:k) { fold.non.positives[[i]] <- integer(0); fold.positives[[i]] <- integer(0); } # examples <- 1:n; non.positives <- setdiff(examples,positives); # non.positives <- examples[-positives]; non.positives <- sample(non.positives); positives <- sample(positives); n.positives <- length(positives); resto.positives <- n.positives%%k; n.pos.per.fold <- (n.positives - resto.positives)/k; n.non.positives <- length(non.positives); resto.non.positives <- n.non.positives%%k; n.non.pos.per.fold <- (n.non.positives - resto.non.positives)/k; j=1; if (n.non.pos.per.fold > 0) for (i in 1:k) { fold.non.positives[[i]] <- non.positives[j:(j+n.non.pos.per.fold-1)]; j <- j + n.non.pos.per.fold; } j.pos=1; if (n.pos.per.fold > 0) for (i in 1:k) { fold.positives[[i]] <- positives[j.pos:(j.pos+n.pos.per.fold-1)]; j.pos <- j.pos + n.pos.per.fold; } if (resto.non.positives > 0) for (i in k:(k-resto.non.positives+1)) { fold.non.positives[[i]] <- c(fold.non.positives[[i]], non.positives[j]); j <- j + 1; } if (resto.positives > 0) for (i in 1:resto.positives) { fold.positives[[i]] <- c(fold.positives[[i]], positives[j.pos]); j.pos <- j.pos + 1; } return(list(fold.non.positives=fold.non.positives, fold.positives=fold.positives)); } # Function to compute the F-measure for a single class # Input # pred : factor or vector of the predicted labels (1 stands for positve, 0 for negative) # labels : factor or vector of the true labels (1 stands for positve, 0 for negative) # Note that 0 level stands for negative and 1 for positive. # In general the first level is negative and the second positive # Output: # a named vector with five elements: # P: precision # R : recall (sensitivity) # S : specificity # F : F measure # A : 0/1 accuracy F.measure.single <- function(pred,labels) { if (length(pred)!=length(labels)) stop("F.measure: lengths of true and predicted labels do not match."); res <- numeric(5); names(res) <- c("P", "R", "S", "F", "A"); pred.0 <- which(pred==0); pred.1 <- which(pred==1); labels.0<- which(labels==0); labels.1<- which(labels==1); TN <- length(intersect(pred.0, labels.0)); TP <- length(intersect(pred.1, labels.1)); FN <- length(intersect(pred.0, labels.1)); FP <- length(intersect(pred.1, labels.0)); acc <- (TP+TN)/length(labels); if ((TP+FP) == 0) precision <- 0 else precision <- TP/(TP+FP); if ((TP+FN) == 0) recall <- 0 else recall <- TP/(TP+FN); if ((TN+FP) == 0) specificity <- 0 else specificity <- TN/(TN+FP); if ((precision+recall) == 0) F <- 0 else F = 2 *(precision*recall) / (precision+recall); res <- c(precision,recall,specificity,F,acc); names(res) <- c("P", "R", "S", "F", "A"); return (res); } # Function to compute the best F score for a single class. # Input: # scores : vector of the predicted scores # lab : vector of the true labels: 1 positive, 0 negative. # Output: # a named vector with six elements: # P: precision # R : recall (sensitivity) # S : specificity # F : F measure # A : 0/1 accuracy # T : best threshold Fmax <- function (scores, lab) { max.F=0; best=0; unique.scores <- unique(scores); for (t in unique.scores) { pred <- ifelse(scores>=t,1,0); res <- F.measure.single(pred,lab); if (res["F"] > max.F) { max.F <- res["F"]; best <- c(res,t); } } names(best)[length(best)] <- "T"; return(best); } # Function to compute the best Accruacy for a single class. # Input: # scores : vector of the predicted scores # lab : vector of the true labels: 1 positive, 0 negative. # Output: # a named vector with six elements: # P: precision # R : recall (sensitivity) # S : specificity # F : F measure # A : 0/1 accuracy # T : best threshold Accmax <- function (scores, lab) { max.Acc=0; best=0; unique.scores <- unique(scores); for (t in unique.scores) { pred <- ifelse(scores>=t,1,0); res <- F.measure.single(pred,lab); if (res["A"] > max.Acc) { max.Acc <- res["A"]; best <- c(res,t); } } names(best)[length(best)] <- "T"; return(best); }