# Random forest library for processing genetic data
# May 2016
# October 2016
# March 2017
# Last updated 2021
library(randomForest);
# random forest cross-validation
# Input:
# data : a data frame or matrix with the data
# y : a factor with the labels. 0:negative class, 1: positive class.
# kk : number of folds (def: 5)
# ntree : number of trees of the rf (def: 500)
# mtry : number of the features to be randomly selected by the rf, If 0 (def) it is the square root of the number of the features.
# seed : initialization seed for the random generator ( if set to 0(def.) no inizialization is performed)
# model.file : name of the file where the cross-validated random forest models will be saved. If file=="" (def.) no model is saved.
# verbose : logical. If TRUE messages about the status of the cross validation are displayed (def: FALSE);
# Output:
# a vector with the cross-validated random forest probabilities.
rf.cv <- function (data, y, kk=5, ntree=500, mtry=round(sqrt(ncol(data))), seed = 0, model.file="", verbose=FALSE) {
n.data <- nrow(data);
n.col <- ncol(data);
indices.positives <- which(y == 1) ;
prob <- numeric(n.data);
names(prob) <- rownames(data);
rf.list <- vector(mode="list", kk);
folds <- do.stratified.cv.data(1:n.data, indices.positives, k=kk, seed=seed);
for (i in 1:kk) {
# preparation of the training and test data
ind.test <- c(folds$fold.positives[[i]], folds$fold.non.positives[[i]]);
ind.pool.pos <- integer(0);
ind.pool.neg <- integer(0);
for (j in 1:kk)
if (j!=i) {
ind.pool.pos <- c(ind.pool.pos, folds$fold.positives[[j]]);
ind.pool.neg <- c(ind.pool.neg, folds$fold.non.positives[[j]]);
}
data.train <- data[c(ind.pool.pos, ind.pool.neg),];
y.train <- as.factor(c(rep(1, length(ind.pool.pos)), rep (0, length(ind.pool.neg))));
mtry.curr=mtry;
if ((mtry == 0) || (mtry > n.col)) {
mtry.curr <- round(sqrt(n.col));
if (mtry.curr<1)
mtry.curr <- 1;
if (verbose)
cat("Automatic selection of mtry = ", mtry.curr, "\n");
}
# training
if (verbose)
cat("Starting training on Fold ", i, "...\n");
rf <- randomForest(data.train, y.train, ntree = ntree, mtry = mtry.curr);
rm(data.train); gc();
# test
data.test <- data[ind.test,];
if (verbose)
cat("Starting test on Fold ", i, "...\n");
prob[ind.test] <- predict(rf, data.test, type="prob")[,"1"];
if (verbose)
cat("End test on Fold ", i, "\n");
rm(data.test);
if (model.file=="")
rm(rf)
else
rf.list[[i]] <- rf;
gc();
if (verbose)
cat("Fold ", i, " done -----\n");
}
if (model.file != "")
save(rf.list, file=model.file);
return(prob);
}
# random forest cross-validation with univariate correlation-based feature selection
# Input:
# data : a data frame or matrix with the data
# y : a factor with the labels. 0:negative class, 1: positive class.
# kk : number of folds (def: 5)
# ntree : number of trees of the rf (def: 500)
# mtry : number of the features to be randomly selected by the rf, If 0 (def) it is the square root of the number of the selected features mutliplied by eta
# eta : multiplicative factor for the default number of tandomly selected features.
# seed : initialization seed for the random generator ( if set to 0 (def.) no inizialization is performed)
# n.feature : number of the features to be selected in the training set. If 0 (def) the number of features is determined by the corr parameter
# corr : absolute value of the correlation to seelct the features. Only features having absolute correlation equal or larger than corr
# are selected. Default value is 0.2.
# model.file : name of the file where the cross-validated random forest models will be saved. If file=="" (def.) no model is saved.
# save.features : logical. If TRUE (default is FALSE), the features selected at each fold of the CV are saved.
# verbose : logical. If TRUE messages about the status of the cross validation are displayed (def: FALSE);
# Output:
# a list with three components:
# prob: a vector with the cross-validated random forest probabilities.
# label : predicted classes (1 for positives 0 for negatives)
# features : a list with kk components. Each component is a numeric vector storing the indices of the features selected at each step of the CV
rf.corr.cv <- function (data, y, kk=10, ntree=500, mtry=0, eta=1, n.feature=0, corr=0.2, seed = 0, model.file="",
save.features=FALSE, verbose=FALSE) {
n.data <- nrow(data);
indices.positives <- which(y == 1) ;
prob <- numeric(n.data);
pred.label <- factor(character(n.data), levels=c("1","0"));
names(prob) <- names(pred.label) <- rownames(data);
rf.list <- vector(mode="list", kk);
selected.features <- vector("list", kk); # list storing the indices of the features selected at each step of the CV
folds <- do.stratified.cv.data(1:n.data, indices.positives, k=kk, seed=seed);
# find features with sd=0 and remove them from the data
xx <- apply(data,2,sd);
which0 <- which(xx==0);
if (length(which0)>0)
data <- data[,-which0];
rm(xx);
for (i in 1:kk) {
# preparation of the training and test data
ind.test <- c(folds$fold.positives[[i]], folds$fold.non.positives[[i]]);
ind.pool.pos <- integer(0);
ind.pool.neg <- integer(0);
for (j in 1:kk)
if (j!=i) {
ind.pool.pos <- c(ind.pool.pos, folds$fold.positives[[j]]);
ind.pool.neg <- c(ind.pool.neg, folds$fold.non.positives[[j]]);
}
data.train <- data[c(ind.pool.pos, ind.pool.neg),];
y.train <- as.factor(c(rep(1, length(ind.pool.pos)), rep (0, length(ind.pool.neg))));
# selection of the best features
# check whether some features have 0 standard deviation
#if (length(which0)>0)
# data.train <- data.train[,-which0];
# computing correlation
yy <- ifelse(y.train==1,1,0);
yy <- matrix (yy,ncol=1);
res <- cor(yy,data.train);
SNP.names <- colnames(data.train);
res <- as.numeric(res);
names(res) <- SNP.names;
if (n.feature > 0)
ind.selected <- order(abs(res), decreasing=TRUE)[1:n.feature]
else
ind.selected <- c(which(res>=corr), which(res <= -corr));
if (save.features)
selected.features[[i]] <- ind.selected;
# training
if (verbose)
cat("Starting training on Fold ", i, "...\n");
n.selected.features <- length(ind.selected);
mtry.curr=mtry;
if ((mtry == 0) || (mtry > n.selected.features)) {
mtry.curr <- round(eta*sqrt(n.selected.features));
if (mtry.curr<1)
mtry.curr <- 1;
if (verbose)
cat("Automatical selection of mtry = ", mtry.curr, "\n");
}
rf <- randomForest(data.train[, ind.selected], y.train, ntree = ntree, mtry = mtry.curr);
rm(data.train); gc();
# test
data.test <- data[ind.test,];
#if (length(which0)>0) # removing features with sd=0 in the training set
# data.test <- data.test[,-which0];
if (verbose)
cat("Starting test on Fold ", i, "...\n");
prob[ind.test] <- predict(rf, data.test[, ind.selected], type="prob")[,"1"];
pred.label[ind.test] <- predict(rf, data.test[, ind.selected], type="response");
# computing AUROC and AUPRC on the current fold
if (verbose) {
digits=4;
lab=ifelse(y==1,1,0);
lab <- lab[ind.test];
sscurves <- evalmod(scores = prob[ind.test], labels = lab);
m<-attr(sscurves,"auc",exact=FALSE);
AUPRC <- round(m[2,"aucs"],digits);
AUROC <- round(m[1,"aucs"],digits);
cat ("AUROC = ", AUROC, "\n");
cat ("AUPRC = ", AUPRC, "\n");
}
if (verbose)
cat("End test on Fold ", i, "\n");
rm(data.test);
if (model.file=="")
rm(rf)
else
rf.list[[i]] <- rf;
gc();
if (verbose)
cat("Fold ", i, " done -----\n");
}
if (model.file != "")
save(rf.list, file=model.file);
return(list(prob=prob, label=pred.label, features=selected.features));
}
# random forest cross-validation with univariate t-test-based feature selection
# Input:
# data : a data frame or matrix with the data
# y : a factor with the labels. 0:negative class, 1: positive class.
# kk : number of folds (def: 5)
# ntree : number of trees of the rf (def: 500)
# mtry : number of the features to be randomly selected by the rf, If 0 (def) it is the square root of the number of the selected features
# seed : initialization seed for the random generator ( if set to 0 (def.) no inizialization is performed)
# n.feature : number of the features to be selected in the training set. If 0 (def) the number of features is determned by the p.value parameter
# p.value.thresh : Only features having p.value from the t-test lower than p.value.thresh are selected. Default value is 0.001.
# model.file : name of the file where the cross-validated random forest models will be saved. If file=="" (def.) no model is saved.
# save.features : logical. If TRUE (default is FALSE), hte features selected at each fold of the CV are saved.
# verbose : logical. If TRUE messages about the status of the cross validation are displayed (def: FALSE);
# Output:
# a vector with the cross-validated random forest probabilities.
rf.ttest.cv <- function (data, y, kk=10, ntree=500, mtry=0, n.feature=0, p.value=0.001, seed = 0, model.file="",
save.features=FALSE, verbose=FALSE) {
MIN.SELECTED.FEATURES=5;
n.data <- nrow(data);
indices.positives <- which(y == 1) ;
prob <- numeric(n.data);
names(prob) <- rownames(data);
rf.list <- vector(mode="list", kk);
folds <- do.stratified.cv.data(1:n.data, indices.positives, k=kk, seed=seed);
for (i in 1:kk) {
# preparation of the training and test data
ind.test <- c(folds$fold.positives[[i]], folds$fold.non.positives[[i]]);
ind.pool.pos <- integer(0);
ind.pool.neg <- integer(0);
for (j in 1:kk)
if (j!=i) {
ind.pool.pos <- c(ind.pool.pos, folds$fold.positives[[j]]);
ind.pool.neg <- c(ind.pool.neg, folds$fold.non.positives[[j]]);
}
data.train <- data[c(ind.pool.pos, ind.pool.neg),];
y.train <- as.factor(c(rep(1, length(ind.pool.pos)), rep (0, length(ind.pool.neg))));
# selection of the best features
# check whether some features have 0 standard deviation
xx <- apply(data.train,2,sd);
which0 <- which(xx==0);
if (length(which0)>0)
data.train <- data.train[,-which0];
# computing t.test
yy <- ifelse(y.train==1,1,0);
n.feat <- ncol(data.train);
res <- numeric(n.feat);
names(res) <- colnames(data.train);
res <- apply(data.train, 2, function(x) { t.test(x[yy==1], x[yy==0])$p.value });
#for (j in 1:n.feat)
# res[j] <- t.test(data.train[,j][yy==1], data.train[,j][yy==0])$p.value;
if (n.feature > 0)
ind.selected <- order(res, decreasing=FALSE)[1:n.feature]
else
ind.selected <- which(res
n.selected.features)) {
mtry.curr <- round(sqrt(n.selected.features));
if (mtry.curr<1)
mtry.curr <- 1;
if (verbose)
cat("Automatical selection of mtry = ", mtry.curr, "\n");
}
rf <- randomForest(data.train[, ind.selected], y.train, ntree = ntree, mtry = mtry.curr);
rm(data.train); gc();
# test
data.test <- data[ind.test,];
if (length(which0)>0) # removing features with sd=0 in the training set
data.test <- data.test[,-which0];
if (verbose)
cat("Starting test on Fold ", i, "...\n");
p <- predict(rf, data.test[, ind.selected], type="prob")[,"1"];
# computing AUROC and AUPRC on the current fold
digits=4;
lab=ifelse(y==1,1,0);
lab <- lab[ind.test];
sscurves <- evalmod(scores = p, labels = lab);
m<-attr(sscurves,"auc",exact=FALSE);
AUPRC <- round(m[2,"aucs"],digits);
AUROC <- round(m[1,"aucs"],digits);
cat ("AUROC = ", AUROC, "\n");
cat ("AUPRC = ", AUPRC, "\n");
prob[ind.test] <- p;
if (verbose)
cat("End test on Fold ", i, "\n");
rm(data.test);
if (model.file=="")
rm(rf)
else
rf.list[[i]] <- rf;
gc();
if (verbose)
cat("Fold ", i, " done -----\n");
}
if (model.file != "")
save(rf.list, file=model.file);
return(prob);
}
# random forest leave-one-out cross-validation with univariate correlation-based feature selection
# Input:
# data : a data frame or matrix with the data
# y : a factor with the labels. 0:negative class, 1: positive class.
# ntree : number of trees of the rf (def: 500)
# mtry : number of the features to be randomly selected by the rf, If 0 (def) it is the square root of the number of the selected features
# n.feature : number of the features to be selected in the training set. If 0 (def) the number of features is determned by the corr parameter
# corr : absolute value of the correlation to select the features. Only features having absolute correlation equal or larger than corr
# are selected. Default value is 0.2.
# model.file : name of the file where the cross-validated random forest models will be saved. If file=="" (def.) no model is saved.
# save.features : logical. If TRUE (default is FALSE), the features selected at each fold of the CV are saved (not implemented)
# verbose : logical. If TRUE messages about the status of the cross validation are displayed (def: FALSE);
# Output:
# a list with two components:
# prob: a vector with the cross-validated random forest probabilities.
# pred : predicted classes (1 for positives 0 for negatives)
rf.corr.loo <- function (data, y, ntree=500, mtry=0, n.feature=0, corr=0.2, model.file="",
save.features=FALSE, verbose=FALSE) {
n.data <- nrow(data);
prob <- numeric(n.data);
pred.label <- factor(character(n.data), levels=c("1","0"));
names(prob) <- rownames(data);
rf.list <- vector(mode="list", n.data);
for (i in 1:n.data) {
# preparation of the training and test data
data.test <- data[i,];
data.test <- matrix(data.test, nrow=1);
data.train <- data[-i,];
y.train <- y[-i];
# selection of the best features
# check whether some features have 0 standard deviation
xx <- apply(data.train,2,sd);
which0 <- which(xx==0);
if (length(which0)>0) {
data.train <- data.train[,-which0];
data.test<-matrix(data.test[, -which0],nrow=1);
}
# computing correlation
yy <- ifelse(y.train==1,1,0);
yy <- matrix (yy,ncol=1);
res <- cor(yy,data.train);
SNP.names <- colnames(data.train);
res <- as.numeric(res);
names(res) <- SNP.names;
if (n.feature > 0)
ind.selected <- order(abs(res), decreasing=TRUE)[1:n.feature]
else
ind.selected <- c(which(res>=corr), which(res <= -corr));
# training
if (verbose)
cat("Starting training on Fold ", i, "...\n");
n.selected.features <- length(ind.selected);
mtry.curr=mtry;
if ((mtry == 0) || (mtry > n.selected.features)) {
mtry.curr <- round(sqrt(n.selected.features));
if (mtry.curr<1)
mtry.curr <- 1;
if (verbose)
cat("Automatical selection of mtry = ", mtry.curr, "\n");
}
rf <- randomForest(data.train[, ind.selected], y.train, ntree = ntree, mtry = mtry.curr);
rm(data.train); gc();
# test
if (verbose)
cat("Starting test on Fold ", i, "...\n");
prob[i] <- predict(rf, data.test[, ind.selected], type="prob")[,"1"];
pred.label[i] <- predict(rf, data.test[, ind.selected], type="response");
# computing AUROC and AUPRC on the current fold
#digits=4;
#lab=ifelse(y==1,1,0);
#lab <- lab[ind.test];
#sscurves <- evalmod(scores = p, labels = lab);
#m<-attr(sscurves,"auc",exact=FALSE);
#AUPRC <- round(m[2,"aucs"],digits);
#AUROC <- round(m[1,"aucs"],digits);
#cat ("AUROC = ", AUROC, "\n");
#cat ("AUPRC = ", AUPRC, "\n");
if (verbose) {
cat("Example ", i, " : label = ", 2-as.numeric(y[i]), ", label pred = ", 2-as.numeric(pred.label[i]), ", score = ", prob[i], "\n");
cat("End test on Fold ", i, "\n");
}
rm(data.test);
if (model.file=="")
rm(rf)
else
rf.list[[i]] <- rf;
gc();
if (verbose)
cat("Fold ", i, " done -----\n");
}
if (model.file != "")
save(rf.list, file=model.file);
return(list(prob=prob, label=pred.label));
}
###############################################################################################à
###############################################################################################
# random forest cross-validation with embedded variable importance feature selection.
# Selection of the features is performed by internal oob (oob on the training data).
# Input:
# data : a data frame or matrix with the data
# y : a factor with the labels. 0:negative class, 1: positive class.
# kk : number of folds (def: 10)
# ntree : number of trees of the rf (def: 500)
# mtry : number of the features to be randomly selected by the rf, If 0 (def) it is the square root of the number of the selected features mutliplied by eta
# eta : multiplicative factor for the default number of randomly selected features.
# sel.type : integer. If 1 the mean decrease in accuracy is used, if 2 (def) the mean decrease in node impurity (Gini index) is used instead.
# (1=mean decrease in accuracy, 2=mean decrease in node impurity)
# n.feature : number of the features to be selected in the training set through the variable importance measure (def.= 1000)
# seed : initialization seed for the random generator ( if set to 0 (def.) no inizialization is performed)
# model.file : name of the file where the cross-validated random forest models will be saved. If file=="" (def.) no model is saved.
# save.features : logical. If TRUE (default is FALSE), the features selected at each fold of the CV are saved.
# verbose : logical. If TRUE messages about the status of the cross validation are displayed (def: FALSE);
# Output:
# a list with three components:
# prob: a vector with the cross-validated random forest probabilities.
# label : predicted classes (1 for positives 0 for negatives)
# features : a list with kk components. Each component is a numeric vector storing the indices of the features selected at each step of the CV
rf.vi.oob.cv <- function (data, y, kk=10, ntree=500, mtry=0, eta=1, sel.type=2, n.feature=1000, seed = 0, model.file="",
save.features=FALSE, verbose=FALSE) {
n.data <- nrow(data);
indices.positives <- which(y == 1) ;
prob <- numeric(n.data);
pred.label <- factor(character(n.data), levels=c("1","0"));
names(prob) <- names(pred.label) <- rownames(data);
rf.list <- vector(mode="list", kk);
selected.features <- vector("list", kk); # list storing the indices of the features selected at each step of the CV
folds <- do.stratified.cv.data(1:n.data, indices.positives, k=kk, seed=seed);
for (i in 1:kk) {
# preparation of the training and test data
ind.test <- c(folds$fold.positives[[i]], folds$fold.non.positives[[i]]);
ind.pool.pos <- integer(0);
ind.pool.neg <- integer(0);
for (j in 1:kk)
if (j!=i) {
ind.pool.pos <- c(ind.pool.pos, folds$fold.positives[[j]]);
ind.pool.neg <- c(ind.pool.neg, folds$fold.non.positives[[j]]);
}
data.train <- data[c(ind.pool.pos, ind.pool.neg),];
y.train <- as.factor(c(rep(1, length(ind.pool.pos)), rep (0, length(ind.pool.neg))));
tot.feature <- ncol(data.train);
# selection of the best features using variable importance measures
# mtry setting for oob on the training set
if (verbose)
cat("Starting oob estimation on Fold ", i, "...\n");
mtry.curr=mtry;
if ((mtry == 0) || (mtry > tot.feature)) {
mtry.curr <- round(eta*sqrt(tot.feature));
if (mtry.curr<1)
mtry.curr <- 1;
if (verbose)
cat("Automatical selection of mtry for oob selection = ", mtry.curr, "\n");
}
rf.oob <- randomForest(data.train, y.train, ntree = ntree, mtry = mtry.curr, importance=T);
# computing variable importance and selecting the top variables
if (verbose) {
cat("OOB error estimate = ", rf.oob$err[ntree,1], "\n\n");
cat("Computing variable importance on Fold ", i, "...\n");
}
var.imp <- importance(rf.oob, type=sel.type);
var.imp <- as.numeric(var.imp);
sorted.indices <- order(var.imp,decreasing = T);
indices.selected <- sorted.indices[1:n.feature];
data.train.sel <- data.train[, indices.selected];
rm(data.train, var.imp); gc();
if (save.features)
selected.features[[i]] <- indices.selected;
# training
if (verbose)
cat("Starting training on Fold ", i, "...\n");
n.selected.features <- length(indices.selected);
# setting of mtry for training
mtry.curr=mtry;
if ((mtry == 0) || (mtry > n.selected.features)) {
mtry.curr <- round(eta*sqrt(n.selected.features));
if (mtry.curr<1)
mtry.curr <- 1;
if (verbose)
cat("Automatical selection of mtry = ", mtry.curr, "\n");
}
rf <- randomForest(data.train.sel, y.train, ntree = ntree, mtry = mtry.curr);
# test
data.test <- data[ind.test, indices.selected];
#if (length(which0)>0) # removing features with sd=0 in the training set
# data.test <- data.test[,-which0];
if (verbose)
cat("Starting test on Fold ", i, "...\n");
prob[ind.test] <- predict(rf, data.test, type="prob")[,"1"];
pred.label[ind.test] <- predict(rf, data.test, type="response");
# computing AUROC and AUPRC on the current fold
if (verbose) {
digits=4;
lab=ifelse(y==1,1,0);
lab <- lab[ind.test];
sscurves <- evalmod(scores = prob[ind.test], labels = lab);
m<-attr(sscurves,"auc",exact=FALSE);
AUPRC <- round(m[2,"aucs"],digits);
AUROC <- round(m[1,"aucs"],digits);
cat ("AUROC = ", AUROC, "\n");
cat ("AUPRC = ", AUPRC, "\n");
}
if (verbose)
cat("End test on Fold ", i, "\n");
rm(data.test);
if (model.file=="")
rm(rf)
else
rf.list[[i]] <- rf;
gc();
if (verbose)
cat("Fold ", i, " done -----\n");
}
if (model.file != "")
save(rf.list, file=model.file);
return(list(prob=prob, label=pred.label, features=selected.features));
}
#######################################################################################
#### Stable feature selection methods ##
#######################################################################################
# Stable selection of features with correlation based univariate method
# The most selected features at each cross-validation are selected.
# For the selection both training ad test data can be used.
# If training data are used, at each iteration only 1 fold is left out; if test data are used only the left out fold is used at each iteration
# Input:
# data : a data frame or matrix with the data: rows are examples and columns the features
# y : a factor with the labels. 0:negative class, 1: positive class.
# kk : number of folds (def: 10)
# rep : number of repetitions of the cross validation
# top.feature : value indicating the number of top ranked features to be selected
# train : boolean. If TRUE (def.) trainnig data are used for feature selection, otherwise test data (that is only the left out fold)
# min.frequency : minimum frequency a feature needs to be selected (a value between 0 and 1, def=1)
# verbose : logical. If TRUE messages about the status of the cross validation are displayed (def: FALSE);
# seed : initialization seed for the random generator ( if set to 0 (def.) no inizialization is performed)
# Output:
# a list woth two componenents:
# ind.selected: a vector with the indices of the selected features
# scores : a vector with the corresponiding scores
stable.fs.corr <- function(data, y, kk=10, rep=1, top.feature=1000, min.frequency=1, seed = 0, train=TRUE, verbose=FALSE) {
n.data <- nrow(data);
n.features <- ncol(data);
indices.positives <- which(y == 1) ;
freq <- numeric(n.features); # vector collecting the number of times each feature is selected
names(freq) <- colnames(data);
for (r in 1:rep) {
if (verbose)
cat("Starting feature selection repetition ", r, "\n");
folds <- do.stratified.cv.data(1:n.data, indices.positives, k=kk, seed=seed+r-1);
for (i in 1:kk) {
# preparation of the training data
ind.out <- c(folds$fold.positives[[i]], folds$fold.non.positives[[i]]);
if (train) {
data.train <- data[- ind.out,];
y.train <- y[-ind.out];
} else {
data.train <- data[ind.out,];
y.train <- y[ind.out];
}
# selection of the best features through correlation
if (verbose)
cat("Starting feature selection on Fold ", i, "\n");
yy <- ifelse(y.train==1,1,0);
yy <- matrix (yy,ncol=1);
res <- cor(yy,data.train);
ind.selected <- order(abs(res), decreasing=TRUE)[1:top.feature];
freq[ind.selected] <- freq[ind.selected] + 1;
}
}
ind.selected <- which(freq >= (min.frequency*kk*rep));
names(ind.selected) <- names(freq)[ind.selected];
if (length(ind.selected) < 4)
warning("Too few features selected \n");
if (verbose)
cat ("Selected ", length(ind.selected), " features \n");
scores <- freq[ind.selected]/(kk*rep);
names(scores) <- names(ind.selected);
return(list(ind.selected=ind.selected, scores=scores));
}
# random forest cross-validation with feature selection performed through "stable" selection of features with correlation based univariate methods
# Input:
# data : a data frame or matrix with the data
# y : a factor with the labels. 0:negative class, 1: positive class.
# kk : number of folds (def: 5)
# rep : number of repetitions of the cross validation for the feature selection
# ntree : number of trees of the rf (def: 500)
# mtry : number of the features to be randomly selected by the rf, If 0 (def) it is the square root of the number of the selected features mutliplied by eta
# eta : multiplicative factor for the default number of randomly selected features.
# seed : initialization seed for the random generator ( if set to 0 (def.) no inizialization is performed)
# top.feature : value indicating the number of top ranked features to be selected
# min.frequency : minimum frequency that a features needs to be selected (a value betwee 0 and 1, def.=1)
# model.file : name of the file where the cross-validated random forest models will be saved. If file=="" (def.) no model is saved.
# train : boolean. If TRUE (def.) trainnig data are used for feature selection, otherwise test data (that is only the left out fold)
# verbose : logical. If TRUE messages about the status of the cross validation are displayed (def: FALSE);
# Output:
# a list with three components:
# prob: a vector with the cross-validated random forest probabilities.
# label : predicted classes (1 for positives 0 for negatives)
# features : a vector with the indices of the selected features
rf.stable.fs.corr.cv <- function (data, y, kk=10, rep=1, ntree=500, mtry=0, eta=1, top.feature=1000, min.frequency=1, seed = 0, model.file="", train=TRUE, verbose=FALSE) {
n.data <- nrow(data);
indices.positives <- which(y == 1) ;
prob <- numeric(n.data);
pred.label <- factor(character(n.data), levels=c("1","0"));
names(prob) <- names(pred.label) <- rownames(data);
rf.list <- vector(mode="list", kk);
# stable feature selection
ind.selected <- stable.fs.corr(data, y, kk=kk, rep=rep, top.feature=top.feature, min.frequency=min.frequency, seed = seed, train=train, verbose=verbose);
# cross-validation with the selected features
folds <- do.stratified.cv.data(1:n.data, indices.positives, k=kk, seed=seed);
for (i in 1:kk) {
# preparation of the training and test data
ind.test <- c(folds$fold.positives[[i]], folds$fold.non.positives[[i]]);
ind.pool.pos <- integer(0);
ind.pool.neg <- integer(0);
for (j in 1:kk)
if (j!=i) {
ind.pool.pos <- c(ind.pool.pos, folds$fold.positives[[j]]);
ind.pool.neg <- c(ind.pool.neg, folds$fold.non.positives[[j]]);
}
data.train <- data[c(ind.pool.pos, ind.pool.neg),];
y.train <- as.factor(c(rep(1, length(ind.pool.pos)), rep (0, length(ind.pool.neg))));
# training
if (verbose)
cat("Starting training on Fold ", i, "...\n");
n.selected.features <- length(ind.selected);
mtry.curr=mtry;
if ((mtry == 0) || (mtry > n.selected.features)) {
mtry.curr <- round(eta*sqrt(n.selected.features));
if (mtry.curr<1)
mtry.curr <- 1;
if (verbose)
cat("Automatic selection of mtry = ", mtry.curr, "\n");
}
rf <- randomForest(data.train[, ind.selected], y.train, ntree = ntree, mtry = mtry.curr);
rm(data.train); gc();
# test
data.test <- data[ind.test,];
#if (length(which0)>0) # removing features with sd=0 in the training set
# data.test <- data.test[,-which0];
if (verbose)
cat("Starting test on Fold ", i, "...\n");
prob[ind.test] <- predict(rf, data.test[, ind.selected], type="prob")[,"1"];
pred.label[ind.test] <- predict(rf, data.test[, ind.selected], type="response");
# computing AUROC and AUPRC on the current fold
if (verbose) {
digits=4;
lab=ifelse(y==1,1,0);
lab <- lab[ind.test];
sscurves <- evalmod(scores = prob[ind.test], labels = lab);
m<-attr(sscurves,"auc",exact=FALSE);
AUPRC <- round(m[2,"aucs"],digits);
AUROC <- round(m[1,"aucs"],digits);
cat ("AUROC = ", AUROC, "\n");
cat ("AUPRC = ", AUPRC, "\n");
}
if (verbose)
cat("End test on Fold ", i, "\n");
rm(data.test);
if (model.file=="")
rm(rf)
else
rf.list[[i]] <- rf;
gc();
if (verbose)
cat("Fold ", i, " done -----\n");
}
if (model.file != "")
save(rf.list, file=model.file);
return(list(prob=prob, label=pred.label, features=ind.selected));
}
#######################################################################################
#### Permutation tests ##
#######################################################################################
# Non parametric permutation test to asses the significance of the prediction
# This function evaluates the statistical significance of the predictions through a non parameteric permutation of the labels.
# Labels are permuted n times and each time the cross-validated accuracy and F-score performance are compared with the perfromance achieved with the true labels.
# A p-value is computed as the frequency for which a prediction obtained with a random permutation is equal or better than the prediction obtained with the tru labels
# Input:
# data : a data frame or matrix with the data
# y : a factor with the labels. 0:negative class, 1: positive class.
# kk : number of folds (def: 5)
# nperm : number of permutations
# ntree : number of trees of the rf (def: 500)
# mtry : number of the features to be randomly selected by the rf. If 0 (def) is the square root of the number of features
# seed : initialization seed for the random generator ( if set to 0(def.) no inizialization is performed)
# verbose : logical. If TRUE messages about the status of the cross validation are displayed (def: FALSE);
# Output:
# a vector with two elements: the p.value computed with respect to the F-score and the p-value computed with respect to the accuracy.
permutation.test.rf.cv <- function (data, y, kk=5, nperm=100, ntree=500, mtry=0, seed = 0, verbose=FALSE) {
# computing the accuracy and F-score with the true labels
true.prob <- rf.cv (data=data, y=y, kk=kk, ntree=ntree, mtry=mtry, seed = seed, verbose=FALSE);
pred=ifelse(true.prob>=0.5,1,0);
lab=ifelse(y==1,1,0);
res <- F.measure.single(pred,lab);
true.F <- res["F"];
true.A <- res["A"];
if (verbose)
cat("True prediction : ", res, "\n");
# computing the accuracy and F-score with the permuted labels
perm.F <- perm.A <- numeric(nperm);
for (i in 1:nperm) {
y.perm <- sample(y);
perm.prob <- rf.cv (data=data, y=y.perm, kk=kk, ntree=ntree, mtry=mtry, seed = seed, verbose=FALSE);
pred=ifelse(perm.prob>=0.5,1,0);
lab=ifelse(y.perm==1,1,0);
res <- F.measure.single(pred,lab);
perm.F[i] <- res["F"];
perm.A[i] <- res["A"];
if (verbose)
cat("Permuted prediction ", i, " : ", res, "\n");
}
p.value <- numeric(2);
names(p.value) <- c("F", "A");
p.value["F"] <- sum(perm.F > true.F)/nperm;
p.value["A"] <- sum(perm.A > true.A)/nperm;
return(p.value);
}
# Non parametric permutation test to assess the significance of a given signature
# This function evaluates the statistical significance of a given signature (subset of SNPs)
# Given a signature S with cardinality n=|S|, a cross-validation is performed other m times using each time a set of SNPs S' with |S'| = n randomly extracted from the full set of the signature.
# Then the cross-validated accuracy and F-score performance obtained with the random signatures are compared with the perfromance achieved with the true signature.
# A p-value is computed as the frequency for which a prediction obtained with a random signature is equal or better than the prediction obtained with the true signature
# Input:
# data : a data frame or matrix with the data
# y : a factor with the labels. 0:negative class, 1: positive class.
# kk : number of folds (def: 5)
# ind.signature: vector of the indices of the features (columns of data) corresponding to the true signature.
# nrep : number of repetitions of the random signature
# ntree : number of trees of the rf (def: 500)
# mtry : number of the features to be randomly selected by the rf (def: square root of the number of features)
# seed : initialization seed for the random generator ( if set to 0(def.) no inizialization is performed)
# verbose : logical. If TRUE messages about the status of the cross validation are displayed (def: FALSE);
# Output:
# a vector with two elements: the p.value computed with respect to the F-score and the p-value computed with respect to the accuracy.
signature.permutation.test.rf.cv <- function (data, y, kk=5, ind.signature, nrep=100, ntree=500, mtry=0, seed = 0, verbose=FALSE) {
# computing the accuracy and F-score with the true signature
data.true <- data[, ind.signature];
true.prob <- rf.cv (data=data.true, y=y, kk=kk, ntree=ntree, mtry=mtry, seed = seed, verbose=FALSE);
pred=ifelse(true.prob>=0.5,1,0);
lab=ifelse(y==1,1,0);
res <- F.measure.single(pred,lab);
true.F <- res["F"];
true.A <- res["A"];
if (verbose)
cat("True signature prediction : ", res, "\n");
# computing the accuracy and F-score with the random signatures
n.feat <- ncol(data);
n.feat.sign <- length(ind.signature);
perm.F <- perm.A <- numeric(nrep);
for (i in 1:nrep) {
rand.sign <- sample(1:n.feat, n.feat.sign);
rand.prob <- rf.cv (data=data[,rand.sign], y=y, kk=kk, ntree=ntree, mtry=mtry, seed = seed, verbose=FALSE);
pred=ifelse(rand.prob>=0.5,1,0);
res <- F.measure.single(pred,lab);
perm.F[i] <- res["F"];
perm.A[i] <- res["A"];
if (verbose)
cat("Random signature prediction ", i, " : ", res, "\n");
}
p.value <- numeric(2);
names(p.value) <- c("F", "A");
p.value["F"] <- sum(perm.F > true.F)/nrep;
p.value["A"] <- sum(perm.A > true.A)/nrep;
return(p.value);
}
#################################
# Utility functions
#################################
######################################################################
# Function to generate data for the stratified cross-validation.
# Input:
# examples : indices of the examples (a vector of integer)
# positives: vector of integer. Indices of the positive examples. The indices refer to the indices of examples
# k : number of folds (def = 5)
# seed : seed of the random generator (def=0). If is set to 0 no initiazitation is performed
# Ouptut:
# a list with 2 two components
# - fold.non.positives : a list with k components. Each component is a vector with the indices of the non positive elements of the fold
# - fold.positives : a list with k components. Each component is a vector with the indices of the positive elements of the fold
# N.B.: in both elements indices refer to the values of the examples vector
do.stratified.cv.data <- function(examples, positives, k=5, seed=NULL) {
if (seed!=0)
set.seed(seed);
fold.non.positives <- fold.positives <- list();
for (i in 1:k) {
fold.non.positives[[i]] <- integer(0);
fold.positives[[i]] <- integer(0);
}
# examples <- 1:n;
non.positives <- setdiff(examples,positives);
# non.positives <- examples[-positives];
non.positives <- sample(non.positives);
positives <- sample(positives);
n.positives <- length(positives);
resto.positives <- n.positives%%k;
n.pos.per.fold <- (n.positives - resto.positives)/k;
n.non.positives <- length(non.positives);
resto.non.positives <- n.non.positives%%k;
n.non.pos.per.fold <- (n.non.positives - resto.non.positives)/k;
j=1;
if (n.non.pos.per.fold > 0)
for (i in 1:k) {
fold.non.positives[[i]] <- non.positives[j:(j+n.non.pos.per.fold-1)];
j <- j + n.non.pos.per.fold;
}
j.pos=1;
if (n.pos.per.fold > 0)
for (i in 1:k) {
fold.positives[[i]] <- positives[j.pos:(j.pos+n.pos.per.fold-1)];
j.pos <- j.pos + n.pos.per.fold;
}
if (resto.non.positives > 0)
for (i in k:(k-resto.non.positives+1)) {
fold.non.positives[[i]] <- c(fold.non.positives[[i]], non.positives[j]);
j <- j + 1;
}
if (resto.positives > 0)
for (i in 1:resto.positives) {
fold.positives[[i]] <- c(fold.positives[[i]], positives[j.pos]);
j.pos <- j.pos + 1;
}
return(list(fold.non.positives=fold.non.positives, fold.positives=fold.positives));
}
# Function to compute the F-measure for a single class
# Input
# pred : factor or vector of the predicted labels (1 stands for positve, 0 for negative)
# labels : factor or vector of the true labels (1 stands for positve, 0 for negative)
# Note that 0 level stands for negative and 1 for positive.
# In general the first level is negative and the second positive
# Output:
# a named vector with five elements:
# P: precision
# R : recall (sensitivity)
# S : specificity
# F : F measure
# A : 0/1 accuracy
F.measure.single <- function(pred,labels) {
if (length(pred)!=length(labels))
stop("F.measure: lengths of true and predicted labels do not match.");
res <- numeric(5);
names(res) <- c("P", "R", "S", "F", "A");
pred.0 <- which(pred==0);
pred.1 <- which(pred==1);
labels.0<- which(labels==0);
labels.1<- which(labels==1);
TN <- length(intersect(pred.0, labels.0));
TP <- length(intersect(pred.1, labels.1));
FN <- length(intersect(pred.0, labels.1));
FP <- length(intersect(pred.1, labels.0));
acc <- (TP+TN)/length(labels);
if ((TP+FP) == 0)
precision <- 0
else
precision <- TP/(TP+FP);
if ((TP+FN) == 0)
recall <- 0
else
recall <- TP/(TP+FN);
if ((TN+FP) == 0)
specificity <- 0
else
specificity <- TN/(TN+FP);
if ((precision+recall) == 0)
F <- 0
else
F = 2 *(precision*recall) / (precision+recall);
res <- c(precision,recall,specificity,F,acc);
names(res) <- c("P", "R", "S", "F", "A");
return (res);
}
# Function to compute the best F score for a single class.
# Input:
# scores : vector of the predicted scores
# lab : vector of the true labels: 1 positive, 0 negative.
# Output:
# a named vector with six elements:
# P: precision
# R : recall (sensitivity)
# S : specificity
# F : F measure
# A : 0/1 accuracy
# T : best threshold
Fmax <- function (scores, lab) {
max.F=0;
best=0;
unique.scores <- unique(scores);
for (t in unique.scores) {
pred <- ifelse(scores>=t,1,0);
res <- F.measure.single(pred,lab);
if (res["F"] > max.F) {
max.F <- res["F"];
best <- c(res,t);
}
}
names(best)[length(best)] <- "T";
return(best);
}
# Function to compute the best Accruacy for a single class.
# Input:
# scores : vector of the predicted scores
# lab : vector of the true labels: 1 positive, 0 negative.
# Output:
# a named vector with six elements:
# P: precision
# R : recall (sensitivity)
# S : specificity
# F : F measure
# A : 0/1 accuracy
# T : best threshold
Accmax <- function (scores, lab) {
max.Acc=0;
best=0;
unique.scores <- unique(scores);
for (t in unique.scores) {
pred <- ifelse(scores>=t,1,0);
res <- F.measure.single(pred,lab);
if (res["A"] > max.Acc) {
max.Acc <- res["A"];
best <- c(res,t);
}
}
names(best)[length(best)] <- "T";
return(best);
}