################################################################
# R source code for random projections
# January 2005
################################################################

library(stats)
library(MASS)
#source("rData.R");

##############################################################################
# Random projections
##############################################################################

# Generation of the vector containing the randomly selected indices.
# It is used by the function random.subspace
# Input:
# d : subspace dimension
# d.original : dimension of the space from which components are randomly selected
# Output:
# vector of the selected features: it contains the indices of the randomly selected components
random.component.selection <- function(d=2, d.original=10) {
  selected.features <- numeric(d);
  n.feat <- d.original + 1;
  feat <- floor(runif(1, 1, n.feat));
  selected.features[1] <- feat;
  if (d > 1) {   # guard: the rejection loop below assumes at least two components to select
    for (i in 2:d) {
      present <- TRUE;
      while (present) {
        feat <- floor(runif(1, 1, n.feat));
        for (j in 1:(i-1)) {
          if (selected.features[j] == feat)
            break;
        }
        if ((j == i-1) && (selected.features[j] != feat)) {
          present <- FALSE;
          selected.features[i] <- feat;
        }
      }
    }
  }
  selected.features
}

# Random projection to a lower-dimensional subspace (random subspace method).
# The projection is performed by randomly selecting a subset of the variables (components)
# and then projecting the data onto the selected components.
# It is the projection used by Ho in the random subspace ensemble algorithm.
# Input:
# d : subspace dimension
# m : data matrix (rows are features and columns are examples)
# scaling : if TRUE (default) scaling is performed
# Output:
# data matrix (dimension d X ncol(m)) of the examples projected onto a d-dimensional random subspace
random.subspace <- function(d=2, m, scaling=TRUE) {
  d.original <- nrow(m);
  if (d >= d.original)
    stop("random.subspace: subspace dimension must be lower than space dimension", call.=FALSE);
  # generation of the vector selected.features containing the randomly selected indices
  selected.features <- random.component.selection(d, d.original);
  # random data projection
  if (scaling == TRUE)
    reduced.m <- sqrt(d.original/d) * m[selected.features,]
  else
    reduced.m <- m[selected.features,];
  reduced.m
}

# Random projection to a lower-dimensional subspace with a normally distributed projection matrix.
# The projection is performed using a projection matrix P with each entry P[i,j] ~ N(0,1).
# Input:
# d : subspace dimension
# m : data matrix (rows are features and columns are examples)
# scaling : if TRUE (default) scaling is performed
# Output:
# data matrix (dimension d X ncol(m)) of the examples projected onto a d-dimensional random subspace
norm.random.projection <- function(d=2, m, scaling=TRUE) {
  d.original <- nrow(m);
  if (d >= d.original)
    stop("norm.random.projection: subspace dimension must be lower than space dimension", call.=FALSE);
  # Projection matrix
  P <- rnorm(d*d.original);
  P <- matrix(P, nrow=d);
  # random data projection
  if (scaling == TRUE)
    reduced.m <- sqrt(1/d) * (P %*% m)
  else
    reduced.m <- P %*% m;
  reduced.m
}
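# Example (not part of the original code): a minimal usage sketch of the projection
# functions defined above. The data matrix and its dimensions are illustrative
# assumptions (100 features as rows, 30 examples as columns, drawn from N(0,1)).
# It is kept commented out so that sourcing this file does not execute it.
#
# example.data <- matrix(rnorm(100*30), nrow=100);       # hypothetical 100 x 30 data matrix
# rs.data <- random.subspace(10, example.data);          # feature subsampling: 10 x 30 matrix
# rp.data <- norm.random.projection(10, example.data);   # Gaussian projection: 10 x 30 matrix
# dim(rs.data);                                          # 10 30
# dim(rp.data);                                          # 10 30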
# Random projection to a lower-dimensional subspace with a random +1/-1 projection matrix.
# The projection is performed using a projection matrix P s.t. Prob(P[i,j]=1) = Prob(P[i,j]=-1) = 1/2
# Input:
# d : subspace dimension
# m : data matrix (rows are features and columns are examples)
# scaling : if TRUE (default) scaling is performed
# Output:
# data matrix (dimension d X ncol(m)) of the examples projected onto a d-dimensional random subspace
Plus.Minus.One.random.projection <- function(d=2, m, scaling=TRUE) {
  d.original <- nrow(m);
  if (d >= d.original)
    stop("Plus.Minus.One.random.projection: subspace dimension must be lower than space dimension", call.=FALSE);
  # Projection matrix
  P <- floor(runif(d*d.original, 1, 3));   # generate a vector with values in {1, 2}
  P[P==2] <- -1;
  P <- matrix(P, nrow=d);
  # random data projection
  if (scaling == TRUE)
    reduced.m <- sqrt(1/d) * (P %*% m)
  else
    reduced.m <- P %*% m;
  reduced.m
}

# Random projection to a lower-dimensional subspace with Achlioptas' projection matrix.
# The projection is performed using a projection matrix P s.t.
# Prob(P[i,j]=sqrt(3)) = Prob(P[i,j]=-sqrt(3)) = 1/6 and Prob(P[i,j]=0) = 2/3
# Input:
# d : subspace dimension
# m : data matrix (rows are features and columns are examples)
# scaling : if TRUE (default) scaling is performed
# Output:
# data matrix (dimension d X ncol(m)) of the examples projected onto a d-dimensional random subspace
Achlioptas.random.projection <- function(d=2, m, scaling=TRUE) {
  d.original <- nrow(m);
  if (d >= d.original)
    stop("Achlioptas.random.projection: subspace dimension must be lower than space dimension", call.=FALSE);
  # Projection matrix
  P <- floor(runif(d*d.original, 1, 7));   # generate a vector with values in {1, ..., 6}
  sqr3 <- sqrt(3);
  P[P==1] <- sqr3;
  P[P==6] <- -sqr3;
  P[P==2 | P==3 | P==4 | P==5] <- 0;
  P <- matrix(P, nrow=d);
  # random data projection
  if (scaling == TRUE)
    reduced.m <- sqrt(1/d) * (P %*% m)
  else
    reduced.m <- P %*% m;
  reduced.m
}

###########################################################
# Distortion measures
###########################################################

# It computes the maximum expansion achieved by projecting from the original to the reduced space.
# max.expansion = max_{u,v \in S} \frac{|| f(u) - f(v) ||}{|| u - v ||}
# Input:
# m : data matrix in the original space (rows are examples, columns are components)
# m.rid : data matrix in the reduced space (rows are examples, columns are components)
# Output:
# maximum expansion
Max.Expansion <- function(m, m.rid) {
  d <- dist(m);       # it computes the euclidean distance between all pairs of rows of m
  d.rid <- dist(m.rid);
  exps <- max(d.rid/d);
  exps
}

# It computes the minimum expansion achieved by projecting from the original to the reduced space.
# min.expansion = min_{u,v \in S} \frac{|| f(u) - f(v) ||}{|| u - v ||}
# Input:
# m : data matrix in the original space (rows are examples, columns are components)
# m.rid : data matrix in the reduced space (rows are examples, columns are components)
# Output:
# minimum expansion
Min.Expansion <- function(m, m.rid) {
  d <- dist(m);       # it computes the euclidean distance between all pairs of rows of m
  d.rid <- dist(m.rid);
  min.exps <- min(d.rid/d);
  min.exps
}
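# Example (not part of the original code): a sketch of how the projections above can be
# combined with the first two distortion measures just defined. Note the two layout
# conventions: the projection functions take examples as columns, while the distortion
# measures (which rely on dist()) take examples as rows, hence the t() calls below.
# The data matrix and dimensions are illustrative assumptions; kept commented out.
#
# example.data <- matrix(rnorm(100*30), nrow=100);            # hypothetical 100 x 30 data matrix
# proj.data <- Achlioptas.random.projection(20, example.data); # sparse projection: 20 x 30 matrix
# Max.Expansion(t(example.data), t(proj.data));                # largest pairwise distance ratio
# Min.Expansion(t(example.data), t(proj.data));                # smallest pairwise distance ratio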
# It computes the maximum and minimum expansion achieved by projecting from the original to the reduced space.
# max.expansion = max_{u,v \in S} \frac{|| f(u) - f(v) ||}{|| u - v ||}
# min.expansion = min_{u,v \in S} \frac{|| f(u) - f(v) ||}{|| u - v ||}
# Input:
# m : data matrix in the original space (rows are examples, columns are components)
# m.rid : data matrix in the reduced space (rows are examples, columns are components)
# Output:
# maximum and minimum expansion (a vector with 2 components)
Max.Min.Expansion <- function(m, m.rid) {
  d <- dist(m);       # it computes the euclidean distance between all pairs of rows of m
  d.rid <- dist(m.rid);
  max.exps <- max(d.rid/d);
  min.exps <- min(d.rid/d);
  exps <- c(max.exps, min.exps);
  exps
}

# It computes the average expansion achieved by projecting from the original to the reduced space.
# average.expansion = \frac{1}{|S|(|S|-1)/2} \sum_{u,v \in S} \frac{|| f(u) - f(v) ||}{|| u - v ||}
# Input:
# m : data matrix in the original space (rows are examples, columns are components)
# m.rid : data matrix in the reduced space (rows are examples, columns are components)
# Output:
# average expansion
Average.Expansion <- function(m, m.rid) {
  d <- dist(m);       # it computes the euclidean distance between all pairs of rows of m
  d.rid <- dist(m.rid);
  s <- length(d);     # number of pairs: |S|*(|S|-1)/2
  average.expansion <- (1/s) * sum(d.rid/d);
  average.expansion
}

# It computes the maximum contraction achieved by projecting from the original to the reduced space.
# max.contraction = max_{u,v \in S} \frac{|| u - v ||}{|| f(u) - f(v) ||}
# Input:
# m : data matrix in the original space (rows are examples, columns are components)
# m.rid : data matrix in the reduced space (rows are examples, columns are components)
# Output:
# maximum contraction
Max.Contraction <- function(m, m.rid) {
  d <- dist(m);       # it computes the euclidean distance between all pairs of rows of m
  d.rid <- dist(m.rid);
  contr <- max(d/d.rid);
  contr
}

# It computes the maximum and minimum contraction achieved by projecting from the original to the reduced space.
# max.contraction = max_{u,v \in S} \frac{|| u - v ||}{|| f(u) - f(v) ||}
# min.contraction = min_{u,v \in S} \frac{|| u - v ||}{|| f(u) - f(v) ||}
# Input:
# m : data matrix in the original space (rows are examples, columns are components)
# m.rid : data matrix in the reduced space (rows are examples, columns are components)
# Output:
# maximum and minimum contraction (a vector with 2 components)
Max.Min.Contraction <- function(m, m.rid) {
  d <- dist(m);       # it computes the euclidean distance between all pairs of rows of m
  d.rid <- dist(m.rid);
  max.contr <- max(d/d.rid);
  min.contr <- min(d/d.rid);
  contr <- c(max.contr, min.contr);
  contr
}
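# Example (not part of the original code): a sketch of the combined expansion/contraction
# measures. Max.Min.Expansion and Max.Min.Contraction return both extremes from a single
# pair of dist() computations, which avoids calling the Max.* and Min.* functions separately.
# The matrices are assumed to be in the "rows are examples" layout expected by these
# functions; the dimensions are illustrative assumptions. Kept commented out.
#
# orig.examples <- matrix(rnorm(30*100), nrow=30);                   # hypothetical 30 examples x 100 components
# red.examples  <- t(norm.random.projection(20, t(orig.examples)));  # 30 examples x 20 components
# Max.Min.Expansion(orig.examples, red.examples);                    # c(max expansion, min expansion)
# Max.Min.Contraction(orig.examples, red.examples);                  # c(max contraction, min contraction)
# Average.Expansion(orig.examples, red.examples);                    # mean pairwise distance ratio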
# It computes the average contraction achieved by projecting from the original to the reduced space.
# average.contraction = \frac{1}{|S|(|S|-1)/2} \sum_{u,v \in S} \frac{|| u - v ||}{|| f(u) - f(v) ||}
# Input:
# m : data matrix in the original space (rows are examples, columns are components)
# m.rid : data matrix in the reduced space (rows are examples, columns are components)
# Output:
# average contraction
Average.Contraction <- function(m, m.rid) {
  d <- dist(m);       # it computes the euclidean distance between all pairs of rows of m
  d.rid <- dist(m.rid);
  s <- length(d);     # number of pairs: |S|*(|S|-1)/2
  average.contraction <- (1/s) * sum(d/d.rid);
  average.contraction
}

# Prediction of the dimension of the random projection needed to obtain a given distortion,
# according to the Johnson-Lindenstrauss (JL) lemma.
# Input:
# n : cardinality of the data
# epsilon : distortion (0 < epsilon <= 0.5)
# Output:
# minimum dimension
JL.predict.dim <- function(n, epsilon=0.5) {
  d <- 4 * (log(n) / epsilon^2);
  ceiling(d)
}

# Prediction of the dimension of the random projection needed to obtain a given distortion,
# according to the JL lemma, when t multiple projections are performed.
# Input:
# n : cardinality of the data
# epsilon : distortion (0 < epsilon <= 0.5)
# t : number of multiple projections
# Output:
# minimum dimension
JL.predict.dim.multiple <- function(n, epsilon=0.5, t=10) {
  d <- 4 * ((log(n) + log(t)) / epsilon^2);
  ceiling(d)
}

# Prediction of the minimum distortion of a random projection for a given subspace dimension,
# according to the JL lemma.
# Input:
# n : cardinality of the data
# dim : dimension of the projected subspace
# Output:
# minimum distortion
JL.predict.distortion <- function(n, dim=10) {
  epsilon <- sqrt(4 * log(n) / dim);
  epsilon
}
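# Example (not part of the original code): a sketch of the JL-based predictions above.
# The values of n, epsilon and t are illustrative assumptions. Kept commented out.
#
# JL.predict.dim(n=1000, epsilon=0.5);                 # ceiling(4*log(1000)/0.25), i.e. 111
# JL.predict.dim.multiple(n=1000, epsilon=0.5, t=20);  # larger, to account for 20 projections
# JL.predict.distortion(n=1000, dim=111);              # about 0.5, inverting the bound above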