# Implemntation of Random walk and Random walk with restart # August 2011 #library(graph); #library(Rgraphviz); #source("NetPreProc.R"); # depends NetPreProc (for the method Prob.norm) library(NetPreProc); norm1 <- function(x) { return(sum(abs(x))); } # Function that performs a random Walk on a given graph # Input: # W : adjacency matrix of the graph # ind.positives: indices of the "core" positive examples of the graph. # They represent to the indices of W corresponding to the positive examples # tmax : maximum number of iterations (def: 1000) # eps : maximum allowed difference between the computed probabilities at the steady state (def. 1e-10) # norm : if TRUE (def) the adjacency matrix W of the graph is normalized to M = D^-1 * W, otherwise # it is assumed that the matrix W is just normalized # Output: # a list with three elements: # - p : the probability at the steady state # - ind.positives: indices of the "core" positive examples of the graph (it is equal to the same # input parameter # - n.iter : number of performed iterations random.walk <- function(W, ind.positives, tmax=1000, eps=1e-10, norm=TRUE) { if (norm) M <-Prob.norm(W) # M = D^-1 * W else M <- W; n <- nrow(M); p0 <- p <- numeric(n); names(p) <- names(p0) <- rownames(W); n.positives <- length(ind.positives); if (n.positives == 0) stop("Random.walk: number of core positives is equal to 0!"); p0[ind.positives] <- 1/n.positives; M <- t(M); p <- p0; for (t in 1:tmax) { pold <- p; p <- M %*% pold; if (norm1(p-pold) < eps) break(); } return(list(p=p, ind.positives=ind.positives, n.iter=t)); } # Function that performs a random Walk with restart (RWR) on a given graph # Input. # W : adjacency matrix of the graph # ind.positives: indices of the "core" positive examples of the graph. # They represent to the indices of W corresponding to the positive examples # gamma : restart parameter (def: 0.6) # tmax : maximum number of iterations (def: 1000) # eps : maximum allowed difference between the computed probabilities at the steady state # norm : if TRUE (def) the adjacency matrix W of the graph is normalized to M = D^-1 * W, otherwise # it is assumed that the matrix W is just normalized # Output: # a list with three elements: # - p : the probability at the steady state # - ind.positives: indices of the "core" positive examples of the graph (it is equal to the same # input parameter # - n.iter : number of performed iterations RWR <- function(W, ind.positives, gamma=0.6, tmax=1000, eps=1e-10, norm=TRUE) { if (norm) M <-Prob.norm(W) # M = D^-1 * W else M <- W; n <- nrow(M); p0 <- p <- numeric(n); names(p) <- names(p0) <- rownames(W); n.positives <- length(ind.positives); if (n.positives == 0) stop("Random.walk: number of core positives is equal to 0!"); p0[ind.positives] <- 1/n.positives; M <- t(M); p <- p0; for (t in 1:tmax) { pold <- p; p <- ((1-gamma) * M %*% pold) + gamma * p0; if (norm1(p-pold) < eps) break(); } return(list(p=p, ind.positives=ind.positives, n.iter=t)); } # Function that implements the Label propagation algorithm of Zhu and Ghahramani # Input: # W : adjacency matrix of the graph # ind.positives: indices of the "core" positive examples of the graph. # They represent to the indices of W corresponding to the positive examples # tmax : maximum number of iterations (def: 1000) # eps : maximum allowed difference between the computed probabilities at the steady state (def. 1e-5) # norm : if TRUE (def) the adjacency matrix W of the graph is normalized to M = D^-1 * W, otherwise # it is assumed that the matrix W is just normalized # Output: # a list with three elements: # - p : the probability at the steady state # - ind.positives: indices of the "core" positive examples of the graph (it is equal to the same # input parameter # - n.iter : number of performed iterations label.prop <- function(W, ind.positives, tmax=1000, eps=1e-5, norm=TRUE) { if (norm) M <-Prob.norm(W) # M = D^-1 * W else M <- W; n <- nrow(M); p <- numeric(n); names(p) <- rownames(W); n.positives <- length(ind.positives); if (n.positives == 0) stop("label.prop: number of core positives is equal to 0!"); p[ind.positives] <- 1; M <- t(M); for (t in 1:tmax) { pold <- p; p <- M %*% pold; p[ind.positives] <- 1; if (norm1(p-pold) < eps) break(); } return(list(p=p, ind.positives=ind.positives, n.iter=t)); } ###################################################################### ## Method to perform a simple cross-validation # Function to execute cross-validation with random walk based method # Input: # W : adjacency matrix of the graph # note that if the optional argument norm=TRUE (def.), the W matrix is normalized, otherwise it # is assumed that W is just normalized # ind.positives: indices of the "core" positive examples of the graph. # They represent to the indices of W corresponding to the positive examples # k : number of folds (def: 5) # init.seed : initial seed for the random generator. If 0 (default) no initialization is performed # fun : function. It must be a randow walk method: # - random.walk (default) # - RWR # - label.prop # ... : optional arguments for the function fun: # - gamma : restart parameter (def: 0.6) (meaningful only for RWR) # - tmax : maximum number of iterations (def: 1000) # - eps : maximum allowed difference between the computed probabilities at the steady state (def. 1e-10) # Output # a vector with the the probabilities for each example at the steady state RW.cv <- function(W, ind.pos, k=5, init.seed=0, fun=random.walk, ...) { if (init.seed!=0) set.seed(init.seed); n <- nrow(W); p <- numeric(n); names(p) <- rownames(W); # Realization of the folds fold <- do.stratified.cv.data(n, ind.pos, k=k, seed=init.seed); # computing scores on the k folds for (i in 1:k) { x <- c(fold$fold.positives[[i]], fold$fold.non.positives[[i]]); core.pos <- integer(0); for (j in 1:k) if (j!=i) core.pos <- c(core.pos, fold$fold.positives[[j]]); p[x] <- fun(W,core.pos, ...)$p[x]; } return(p); } ###################################################################### ## Method to perform multiple cross-validation # Function to execute multiple cross-validation with a kernel-based score method. # It computes the scores by averaging across multiple cross validations # Input: # W : adjacency matrix of the graph # note that if the optional argument norm=TRUE (def.), the W matrix is normalized, otherwise it # is assumed that W is just normalized # ind.positives: indices of the "core" positive examples of the graph. # They represent to the indices of W corresponding to the positive examples # k : number of folds (def: 5) # p : number of repeated cross-validations # init.seed : initial seed for the random generator. If 0 (default) no initialization is performed # fun : function. It must be a randow walk method: # - random.walk (default) # - RWR # - label.prop # ... : optional arguments for the function fun: # - gamma : restart parameter (def: 0.6) (meaningful only for RWR) # - tmax : maximum number of iterations (def: 1000) # - eps : maximum allowed difference between the computed probabilities at the steady state (def. 1e-10) # Output # a vector with the the probabilities for each example at the steady state averaged across multiple cross-validations multiple.RW.cv <- function(W, ind.pos, k=5, p=100, init.seed=0, fun=random.walk, ...) { n <- nrow(W); current.p <- average.p <- numeric(n); # vector of average scores for (v in 1:p) { # Realization of the k folds fold <- do.stratified.cv.data(n, ind.pos, k=k, seed=v+init.seed); # computing scores on the k folds for (i in 1:k) { x <- c(fold$fold.positives[[i]], fold$fold.non.positives[[i]]); core.pos <- integer(0); for (j in 1:k) if (j!=i) core.pos <- c(core.pos, fold$fold.positives[[j]]); current.p[x] <- fun(W,core.pos, ...)$p[x]; } average.p <- average.p + current.p; } return(average.p/p); } ###################################################################### # Function to generate data for the stratified cross-validation. # Input: # examples : indices of the examples (a vector of integer) # positives: vector of integer. Indices of the positive examples. The indices refer to row number of # the data matrix. # k : number of folds (def = 5) # seed : seed of the random generator (def=1). If is set to 0 no initiazitation is performed # Ouptut: # a list with 2 two components # - fold.non.positives : a list with k components. Each component is a vector with the indices of the non positive elements of the fold # - fold.positives : a list with k components. Each component is a vector with the indices of the positive elements of the fold # N.B.: in both elements indices refer to row numbers of the data matrix do.stratified.cv.data <- function(examples, positives, k=5, seed=1) { if (seed!=0) set.seed(seed); fold.non.positives <- fold.positives <- list(); for (i in 1:k) { fold.non.positives[[i]] <- integer(0); fold.positives[[i]] <- integer(0); } # examples <- 1:n; non.positives <- setdiff(examples,positives); # non.positives <- examples[-positives]; non.positives <- sample(non.positives); positives <- sample(positives); n.positives <- length(positives); resto.positives <- n.positives%%k; n.pos.per.fold <- (n.positives - resto.positives)/k; n.non.positives <- length(non.positives); resto.non.positives <- n.non.positives%%k; n.non.pos.per.fold <- (n.non.positives - resto.non.positives)/k; j=1; if (n.non.pos.per.fold > 0) for (i in 1:k) { fold.non.positives[[i]] <- non.positives[j:(j+n.non.pos.per.fold-1)]; j <- j + n.non.pos.per.fold; } j.pos=1; if (n.pos.per.fold > 0) for (i in 1:k) { fold.positives[[i]] <- positives[j.pos:(j.pos+n.pos.per.fold-1)]; j.pos <- j.pos + n.pos.per.fold; } if (resto.non.positives > 0) for (i in k:(k-resto.non.positives+1)) { fold.non.positives[[i]] <- c(fold.non.positives[[i]], non.positives[j]); j <- j + 1; } if (resto.positives > 0) for (i in 1:resto.positives) { fold.positives[[i]] <- c(fold.positives[[i]], positives[j.pos]); j.pos <- j.pos + 1; } return(list(fold.non.positives=fold.non.positives, fold.positives=fold.positives)); }