# Implemntation of Random walk and Random walk with restart
# August 2011

#library(graph);
#library(Rgraphviz);

#source("NetPreProc.R");
# depends NetPreProc (for the method Prob.norm)

library(NetPreProc);

norm1 <- function(x) {
 return(sum(abs(x)));
}


# Function that performs a random Walk on a given graph
# Input:
# W : adjacency matrix of the graph
# ind.positives: indices of the "core" positive examples of the graph.
#                They represent to the indices of W corresponding to the positive examples
# tmax : maximum number of iterations (def: 1000)
# eps : maximum allowed difference between the computed probabilities at the steady state (def. 1e-10)
# norm : if TRUE (def) the adjacency matrix W of the graph is normalized to M = D^-1 * W, otherwise
#        it is assumed that the matrix W is just normalized
# Output:
# a list with three elements:
# - p : the probability at the steady state
# - ind.positives: indices of the "core" positive examples of the graph (it is equal to the same
#                  input parameter
# - n.iter : number of performed iterations
random.walk <- function(W, ind.positives, tmax=1000, eps=1e-10, norm=TRUE) { 
   if (norm) 
     M <-Prob.norm(W) # M = D^-1 * W
   else
     M <- W;
   n <- nrow(M);
   p0 <- p <- numeric(n);
   names(p) <- names(p0) <- rownames(W);
   n.positives <- length(ind.positives);
   if (n.positives == 0)
 	  stop("Random.walk: number of core positives is equal to 0!");
   p0[ind.positives] <- 1/n.positives;
   
   M <- t(M);
   p <- p0;
   for (t in 1:tmax) {
 	 pold <- p;
 	 p <- M %*% pold;
 	 if (norm1(p-pold) < eps) break(); 
   }  
   return(list(p=p, ind.positives=ind.positives, n.iter=t));  
}


# Function that performs a random Walk with restart (RWR) on a given graph
# Input.
# W : adjacency matrix of the graph
# ind.positives: indices of the "core" positive examples of the graph.
#                They represent to the indices of W corresponding to the positive examples
# gamma : restart parameter (def: 0.6)
# tmax : maximum number of iterations (def: 1000)
# eps : maximum allowed difference between the computed probabilities at the steady state
# norm : if TRUE (def) the adjacency matrix W of the graph is normalized to M = D^-1 * W, otherwise
#        it is assumed that the matrix W is just normalized
# Output:
# a list with three elements:
# - p : the probability at the steady state
# - ind.positives: indices of the "core" positive examples of the graph (it is equal to the same
#                  input parameter
# - n.iter : number of performed iterations
RWR <- function(W, ind.positives, gamma=0.6, tmax=1000, eps=1e-10, norm=TRUE) {  
   if (norm) 
     M <-Prob.norm(W) # M = D^-1 * W
   else
     M <- W;
   n <- nrow(M);
   p0 <- p <- numeric(n);
   names(p) <- names(p0) <- rownames(W);
   n.positives <- length(ind.positives);
   if (n.positives == 0)
 	  stop("Random.walk: number of core positives is equal to 0!");
   p0[ind.positives] <- 1/n.positives;
   
   M <- t(M);
   p <- p0;
   for (t in 1:tmax) {
 	 pold <- p;
 	 p <- ((1-gamma) * M %*% pold) + gamma * p0;
 	 if (norm1(p-pold) < eps) break(); 
   }  
   return(list(p=p, ind.positives=ind.positives, n.iter=t));  
}


# Function that implements the Label propagation algorithm of Zhu and Ghahramani
# Input:
# W : adjacency matrix of the graph
# ind.positives: indices of the "core" positive examples of the graph.
#                They represent to the indices of W corresponding to the positive examples
# tmax : maximum number of iterations (def: 1000)
# eps : maximum allowed difference between the computed probabilities at the steady state (def. 1e-5)
# norm : if TRUE (def) the adjacency matrix W of the graph is normalized to M = D^-1 * W, otherwise
#        it is assumed that the matrix W is just normalized
# Output:
# a list with three elements:
# - p : the probability at the steady state
# - ind.positives: indices of the "core" positive examples of the graph (it is equal to the same
#                  input parameter
# - n.iter : number of performed iterations
label.prop <- function(W, ind.positives, tmax=1000, eps=1e-5, norm=TRUE) { 
   if (norm) 
     M <-Prob.norm(W) # M = D^-1 * W
   else
     M <- W;
   n <- nrow(M);
   p <- numeric(n);
   names(p)  <- rownames(W);
   n.positives <- length(ind.positives);
   if (n.positives == 0)
 	  stop("label.prop: number of core positives is equal to 0!");
   p[ind.positives] <- 1;
   
   M <- t(M);
   for (t in 1:tmax) {
 	 pold <- p;
 	 p <- M %*% pold;
	 p[ind.positives] <- 1;
 	 if (norm1(p-pold) < eps) break();  
   }  
   return(list(p=p, ind.positives=ind.positives, n.iter=t));  
}


######################################################################
## Method to perform a simple cross-validation 

# Function to execute cross-validation with random walk based method
# Input:
# W : adjacency matrix of the graph
#     note that if the optional argument norm=TRUE (def.), the W matrix is normalized, otherwise it
#     is assumed that W is just normalized
# ind.positives: indices of the "core" positive examples of the graph.
#                They represent to the indices of W corresponding to the positive examples
# k : number of folds (def: 5)
# init.seed : initial seed for the random generator. If 0 (default) no initialization is performed
# fun : function. It must be a randow walk method:
#                 - random.walk (default)
#                 - RWR
#                 - label.prop
# ... : optional arguments for the function fun:
# - gamma : restart parameter (def: 0.6) (meaningful only for RWR)
# - tmax : maximum number of iterations (def: 1000)
# - eps : maximum allowed difference between the computed probabilities at the steady state (def. 1e-10)
# Output
# a vector with the the probabilities for each example at the steady state

RW.cv <- function(W, ind.pos, k=5, init.seed=0, fun=random.walk, ...) {
   if (init.seed!=0)
     set.seed(init.seed);
   n <- nrow(W);
   p <- numeric(n);
   names(p) <- rownames(W);
   
   # Realization of the folds
   fold <- do.stratified.cv.data(n, ind.pos, k=k, seed=init.seed);
   
   # computing scores on the k folds
   for (i in 1:k) {
     x <- c(fold$fold.positives[[i]], fold$fold.non.positives[[i]]);
     core.pos <- integer(0);
     for (j in 1:k)
       if (j!=i)
	 core.pos <- c(core.pos, fold$fold.positives[[j]]);
     p[x] <- fun(W,core.pos, ...)$p[x];
   }
   return(p);
}


######################################################################
## Method to perform multiple cross-validation

# Function to execute multiple cross-validation with a kernel-based score method.
# It computes the scores by averaging across multiple cross validations
# Input:
# W : adjacency matrix of the graph
#     note that if the optional argument norm=TRUE (def.), the W matrix is normalized, otherwise it
#     is assumed that W is just normalized
# ind.positives: indices of the "core" positive examples of the graph.
#                They represent to the indices of W corresponding to the positive examples
# k : number of folds (def: 5)
# p : number of repeated cross-validations
# init.seed : initial seed for the random generator. If 0 (default) no initialization is performed
# fun : function. It must be a randow walk method:
#                 - random.walk (default)
#                 - RWR
#                 - label.prop
# ... : optional arguments for the function fun:
# - gamma : restart parameter (def: 0.6) (meaningful only for RWR)
# - tmax : maximum number of iterations (def: 1000)
# - eps : maximum allowed difference between the computed probabilities at the steady state (def. 1e-10)
# Output
# a vector with the the probabilities for each example at the steady state averaged across multiple cross-validations
multiple.RW.cv <- function(W, ind.pos, k=5, p=100, init.seed=0, fun=random.walk, ...) {
   n <- nrow(W);
   current.p <- average.p <- numeric(n);  # vector of average scores
   
   for (v in 1:p) {
	 # Realization of the k folds
	 fold <- do.stratified.cv.data(n, ind.pos, k=k, seed=v+init.seed);
	 # computing scores on the k folds
	 for (i in 1:k) {
       x <- c(fold$fold.positives[[i]], fold$fold.non.positives[[i]]);
       core.pos <- integer(0);
       for (j in 1:k)
    	 if (j!=i)
	   core.pos <- c(core.pos, fold$fold.positives[[j]]);
       current.p[x] <- fun(W,core.pos, ...)$p[x]; 
	 }
	 average.p <- average.p + current.p;
   }
   return(average.p/p);
}

######################################################################
# Function to generate data for the stratified cross-validation.
# Input:
# examples : indices of the examples (a vector of integer)
# positives: vector of integer. Indices of the positive examples. The indices refer to row number of
#            the data matrix.
# k : number of folds (def = 5)
# seed : seed of the random generator (def=1). If is set to 0 no initiazitation is performed
# Ouptut:
# a list with 2 two components
#   - fold.non.positives : a list with k components. Each component is a vector with the indices of the non positive elements of the fold
#   - fold.positives : a list with k components. Each component is a vector with the indices of the positive elements of the fold
# N.B.: in both elements indices refer to row numbers of the data matrix	 
do.stratified.cv.data <- function(examples, positives, k=5, seed=1) {
  if (seed!=0)
    set.seed(seed);
  fold.non.positives <- fold.positives <- list();
  for (i in 1:k) {
    fold.non.positives[[i]] <- integer(0);
    fold.positives[[i]] <- integer(0);
  }
  # examples <- 1:n;
  non.positives <- setdiff(examples,positives);
  # non.positives <- examples[-positives];
  non.positives <- sample(non.positives);
  positives <- sample(positives);
  n.positives <- length(positives);
  resto.positives <- n.positives%%k;
  n.pos.per.fold <- (n.positives - resto.positives)/k;
  n.non.positives <- length(non.positives);
  resto.non.positives <- n.non.positives%%k;
  n.non.pos.per.fold <- (n.non.positives - resto.non.positives)/k;
  j=1; 
  if (n.non.pos.per.fold > 0)
    for (i in 1:k) {
      fold.non.positives[[i]] <- non.positives[j:(j+n.non.pos.per.fold-1)];
      j <- j + n.non.pos.per.fold;
    }
  j.pos=1;  
  if (n.pos.per.fold > 0)
    for (i in 1:k) {
      fold.positives[[i]] <- positives[j.pos:(j.pos+n.pos.per.fold-1)];
      j.pos <- j.pos + n.pos.per.fold;
    }
  
  if (resto.non.positives > 0)
    for (i in k:(k-resto.non.positives+1)) {
      fold.non.positives[[i]] <- c(fold.non.positives[[i]], non.positives[j]);
      j <- j + 1;
    }
  
  if (resto.positives > 0) 
    for (i in 1:resto.positives) {
      fold.positives[[i]] <- c(fold.positives[[i]], positives[j.pos]);
      j.pos <- j.pos + 1;
    }
  
  return(list(fold.non.positives=fold.non.positives, fold.positives=fold.positives));
}