############################################################################################################## # clusterv2.R # August 2005 # modified february 2006 # Set of functions to perform multiple clustering on randomly projected data # using a specific clustering algorithm ############################################################################################################## library(stats); library(cluster); #source("rp.R"); #source("clusterv.R"); ############################################################################ # Vector to list cluster representation # It transformst a clustering from a vector representation to a list representation. # It accepts as input a vector that represents a clustering; the indices of the vectors refer to the examples and their # integer contento to the number of the cluster they belong. The function returns a list that represents the same # clustering; each element of the list is a vector representing the cluster. The elements of the vectors are the indices # of the examples (that is they correspond to the indices of the vector representation of the clustering). # This list representation of the cluster may be used to compute the validity indices of the clustering. # Input: # v : vector representing the clustering: # Output: # a list that represents the clustering Transform.vector.to.list <- function (v) { if (is.integer(v) == FALSE) stop("Transform.vector.to.list: the elements of the input vector must be integers", call.=FALSE); # relabeling of the classes in order to avoid (that is the labels of the classes will be consecutive integers) n.examples <- length(v); new.v <- integer(n.examples); # vector with relabeled elements max.class <- max(v); v.index <- integer(max.class); # vector of the class indices for label translation label.class <- 0; for (i in 1:n.examples) { old.label <- v[i]; # old label of the ith example if (v.index[old.label] == 0) { label.class <- label.class + 1; v.index[old.label] <- label.class; } new.v[i] <- v.index[old.label]; } # building of the list of clusters cl =list(); for (i in 1:n.examples) { if (length(cl) < new.v[i]) cl[[new.v[i]]] <- i else cl[[new.v[i]]][length(cl[[new.v[i]]]) + 1] <- i; } return(cl); } ######### K-MEANS CLUSTERING ############################### ########################## # Multiple Random k-means clustering. # Multiple Random k-means clusterings are computed using random projections of data. # It assumes that the label of the examples are integers starting from 1 to ncol(M). # Several randomized maps may be used: RS, PMO, Normal and Achliopotas random projections # Input: # M : matrix of data: rows are variables and columns are examples # dim : subspace dimension # c : number of clusters # pmethod : projection method. It must be one of the following: # "RS" (random subspace projection) # "PMO" (Plus Minus One random projection) # "Norm" (normal random projection) # "Achlioptas" (Achlioptas random projection) # n : number of RS projections # it.max : maximum number of iteration of the k-means algorithm (default 1000) # scale : if TRUE randomized projections are scaled (default) # seed : numerical seed for the random generator # Output: # a list of the n clusterings obtained by randomized hierarchical clustering Multiple.Random.kmeans <- function(M, dim, pmethod="PMO", c=3, n=50, it.max=1000, scale=TRUE, seed=100, distance="euclidean") { set.seed(seed); cl <- list(); for (i in 1:n) { # A. selection of the randomized map P.M <- switch(pmethod, RS = random.subspace(d=dim, M, scaling=scale), PMO = Plus.Minus.One.random.projection(d=dim, M, scaling=scale), Norm = norm.random.projection(d=dim, M, scaling=scale), Achlioptas = Achlioptas.random.projection(d=dim, M, scaling=scale), stop("Multiple.Random.kmeans: not supported random projection.", call.=FALSE)); r<-kmeans(t(P.M), c, iter.max = it.max); cl[[i]] <- Transform.vector.to.list(r$cluster); } return(cl); } ########################## # Random k-means clustering and validity indices computation using random projections of data. # This function applies a k-means clustering algorithm to the data and then computes stability indices for the # obtained cluster using multiple random subspace projections. # It computes the validity indices for each cluster found in the original space, the overall validity index for the # clsutering and (optionallly) the set of the AC indices. # Different randomized maps (e.g. PMO, Achlioptas, Normal, Random Subspace projections) may be applied. # It assumes that the label of the examples are integer starting from 1 to ncol(M). # Note that the k-means algorithm strongly depends from the initial conditions. Hence choosing different random seed # we may obtain different results; setting seed=-1 (default) each time a different random seed is chosen. # Input: # M : matrix of data: rows are variables and columns are examples # dim : subspace dimension # c : number of cluster # pmethod : projection method. It must be one of the following: # "RS" (random subspace projection) # "PMO" (Plus Minus One random projection) # "Norm" (normal random projection) # "Achlioptas" (Achlioptas random projection) # it.max : maximum number of iteration of the k-means algorithm (default 1000) # n : number of RS projections # scale : if TRUE randomized projections are scaled # seed : numerical seed for the random generator (if seed= -1 a different seed is generated each time) # AC: if TRUE (default) the AC indices are computed. # Output: # a list with six components: "validity", "overall.validity", "similarity.matrix", "dim", # "cluster", "orig.cluster": # "validity" is a vector with the validity of each of the c clusters; # "overall.validity" is the validity index of the overall cluster # "similarity.matrix" is the pairwise similarity matrix between examples. # "dimension" is the dimension of the random subspace dimension. # "cluster" is the list of the n clustering obtained by multiple k-means clustering on the projected subspace # "orig.cluster" is the list of the clusters in the original space Random.kmeans.validity <- function(M, dim, pmethod="PMO", c=3, it.max=1000, n=50, scale=TRUE, seed=-1, AC=TRUE) { dim.Sim.M <- ncol(M); if (seed == -1) seed <- round(runif(1,1,10000)); # Computing the clusters in the original space r<-kmeans(t(M), c, iter.max = it.max); cl.orig <- Transform.vector.to.list(r$cluster); # Perform multiple clusterings using randomized embeddings cl <- Multiple.Random.kmeans (M=M, dim=dim, pmethod=pmethod, c=c, n=n, it.max=it.max, scale=scale, seed=seed); # Update similarity matrix Sim.M <- Do.similarity.matrix(cl, dim.Sim.M); # Computing the list of validity measures # Computing the validity indices vi c <- length(cl.orig); # it corrects for empty sets vi <- Validity.indices(cl.orig, c, Sim.M); # Computing overall (average) validity of the clustering: ov.vi <- sum(vi)/c; # Computing the AC indices: if (AC == TRUE) { ac <- AC.index(cl.orig, c, Sim.M); res <- list (validity=vi, overall.validity=ov.vi, similarity.matrix=Sim.M, dimension=dim, cluster=cl, orig.cluster=cl.orig, AC=ac); } else res <- list (validity=vi, overall.validity=ov.vi, similarity.matrix=Sim.M, dimension=dim, cluster=cl, orig.cluster=cl.orig); return(res); } ################################################################## ######### FUZZY K-MEANS CLUSTERING ############################### ################################################################## ########################## # Multiple Random fuzzy-k-means clustering. # Multiple Random fuzzy-k-means clusterings are computed using random projections of data. # The crisp clustering is obtained by defuzzification via the nearest crisp clustering: each example is assigned to the # cluster for which it has the largest membership. The base fuzzy algorithm used is \code{fanny} of the \code{cluster} # package. # It assumes that the label of the examples are integers starting from 1 to ncol(M). # Several randomized maps may be used: RS, PMO, Normal and Achliopotas random projections # Input: # M : matrix of data: rows are variables and columns are examples # dim : subspace dimension # c : number of clusters # pmethod : projection method. It must be one of the following: # "RS" (random subspace projection) # "PMO" (Plus Minus One random projection) # "Norm" (normal random projection) # "Achlioptas" (Achlioptas random projection) # n : number of RS projections # scale : if TRUE randomized projections are scaled (default) # seed : numerical seed for the random generator # distance : it must be one of the two: "euclidean" (default) or "pearson" (that is 1 - Pearson correlation) # Output: # a list of the n clusterings obtained by the fuzzy-k-means algorithm clustering Multiple.Random.fuzzy.kmeans <- function(M, dim, pmethod="PMO", c=3, n=50, scale=TRUE, seed=-1, distance="euclidean") { if (seed == -1) seed <- round(runif(1,1,10000)); set.seed(seed); cl <- list(); for (i in 1:n) { # A. selection of the randomized map P.M <- switch(pmethod, RS = random.subspace(d=dim, M, scaling=scale), PMO = Plus.Minus.One.random.projection(d=dim, M, scaling=scale), Norm = norm.random.projection(d=dim, M, scaling=scale), Achlioptas = Achlioptas.random.projection(d=dim, M, scaling=scale), stop("Multiple.Random.kmeans: not supported random projection.", call.=FALSE)); if (distance == "euclidean") d <- dist (t(P.M)) else d <- as.dist(1 - cor(P.M)); r<- fanny(d, c, maxit = 1000); cl[[i]] <- Transform.vector.to.list(r$clustering); } return(cl); } ########################## # Random fuzzy-k-means clustering and validity indices computation using random projections of data. # This function applies the fuzzy-k-means clustering algorithm to the data and then computes stability indices for the # obtained cluster using multiple random subspace projections. # It computes the validity indices for each cluster found in the original space, the overall validity index for the # clsutering and (optionallly) the set of the AC indices. # Different randomized maps (e.g. PMO, Achlioptas, Normal, Random Subspace projections) may be applied. # It assumes that the label of the examples are integer starting from 1 to ncol(M). # Note that the fuzzy-k-means algorithm strongly depends from the initial conditions. Hence choosing different random seed # we may obtain different results; setting seed=-1 (default) each time a different random seed is chosen. # Input: # M : matrix of data: rows are variables and columns are examples # dim : subspace dimension # c : number of cluster # pmethod : projection method. It must be one of the following: # "RS" (random subspace projection) # "PMO" (Plus Minus One random projection) # "Norm" (normal random projection) # "Achlioptas" (Achlioptas random projection) # n : number of RS projections # scale : if TRUE randomized projections are scaled # seed : numerical seed for the random generator (if seed= -1 a different seed is generated each time) # AC: if TRUE (default) the AC indices are computed. # Output: # a list with six components: "validity", "overall.validity", "similarity.matrix", "dim", # "cluster", "orig.cluster": # "validity" is a vector with the validity of each of the c clusters; # "overall.validity" is the validity index of the overall cluster # "similarity.matrix" is the pairwise similarity matrix between examples. # "dimension" is the dimension of the random subspace dimension. # "cluster" is the list of the n clustering obtained by multiple fuzzy-k-means clustering on the projected subspace # "orig.cluster" is the list of the clusters in the original space Random.fuzzy.kmeans.validity <- function(M, dim, pmethod="PMO", c=3, n=50, scale=TRUE, seed=-1, AC=TRUE, distance="euclidean") { dim.Sim.M <- ncol(M); if (seed == -1) seed <- round(runif(1,1,10000)); # Computing the clusters in the original space if (distance == "euclidean") d <- dist (t(M)) else d <- as.dist(1 - cor(M)); r<- fanny(d, c, maxit = 1000); cl.orig <- Transform.vector.to.list(r$clustering); # Perform multiple clusterings using randomized embeddings cl <- Multiple.Random.fuzzy.kmeans (M=M, dim=dim, pmethod=pmethod, c=c, n=n, scale=scale, seed=seed, distance=distance); # Update similarity matrix Sim.M <- Do.similarity.matrix(cl, dim.Sim.M); # Computing the list of validity measures # Computing the validity indices vi c <- length(cl.orig); vi <- Validity.indices(cl.orig, c, Sim.M); # Computing overall (average) validity of the clustering: ov.vi <- sum(vi)/c; # Computing the AC indices: if (AC == TRUE) { ac <- AC.index(cl.orig, c, Sim.M); res <- list (validity=vi, overall.validity=ov.vi, similarity.matrix=Sim.M, dimension=dim, cluster=cl, orig.cluster=cl.orig, AC=ac); } else res <- list (validity=vi, overall.validity=ov.vi, similarity.matrix=Sim.M, dimension=dim, cluster=cl, orig.cluster=cl.orig); return(res); } ################################################################## ######### PAM CLUSTERING ######################################### ################################################################## ########################## # Multiple Random PAM clustering. # Multiple Random Partition Around Meodoids (PAM) clusterings are computed using random projections of data. # The \code{pam} function of the package \code{cluster} is used as implementation of the base PAM algorithm. # It assumes that the label of the examples are integers starting from 1 to ncol(M). # Several randomized maps may be used: RS, PMO, Normal and Achliopotas random projections # Input: # M : matrix of data: rows are variables and columns are examples # dim : subspace dimension # c : number of clusters # pmethod : projection method. It must be one of the following: # "RS" (random subspace projection) # "PMO" (Plus Minus One random projection) # "Norm" (normal random projection) # "Achlioptas" (Achlioptas random projection) # n : number of RS projections # scale : if TRUE randomized projections are scaled (default) # seed : numerical seed for the random generator # distance : it must be one of the two: "euclidean" (default) or "pearson" (that is 1 - Pearson correlation) # Output: # a list of the n clusterings obtained by the PAM algorithm clustering Multiple.Random.PAM <- function(M, dim, pmethod="PMO", c=3, n=50, scale=TRUE, seed=-1, distance="euclidean") { if (seed == -1) seed <- round(runif(1,1,10000)); set.seed(seed); cl <- list(); for (i in 1:n) { # A. selection of the randomized map P.M <- switch(pmethod, RS = random.subspace(d=dim, M, scaling=scale), PMO = Plus.Minus.One.random.projection(d=dim, M, scaling=scale), Norm = norm.random.projection(d=dim, M, scaling=scale), Achlioptas = Achlioptas.random.projection(d=dim, M, scaling=scale), stop("Multiple.Random.kmeans: not supported random projection.", call.=FALSE)); if (distance == "euclidean") d <- dist (t(P.M)) else d <- as.dist(1 - cor(P.M)); cl.v <- pam (d,c,cluster.only=TRUE); cl[[i]] <- Transform.vector.to.list(cl.v); } return(cl); } ########################## # Random PAM clustering and validity indices computation using random projections of data. # This function applies the Prediction Arounf Medoids (PAM) clustering algorithm to the data and then computes # stability indices for the obtained cluster using multiple random subspace projections. # It computes the validity indices for each cluster found in the original space, the overall validity index for the # clsutering and (optionallly) the set of the AC indices. # Different randomized maps (e.g. PMO, Achlioptas, Normal, Random Subspace projections) may be applied. # It assumes that the label of the examples are integer starting from 1 to ncol(M). # The \code{pam} function of the package \code{cluster} is used as implementation of the base PAM algorithm. # Input: # M : matrix of data: rows are variables and columns are examples # dim : subspace dimension # c : number of cluster # pmethod : projection method. It must be one of the following: # "RS" (random subspace projection) # "PMO" (Plus Minus One random projection) # "Norm" (normal random projection) # "Achlioptas" (Achlioptas random projection) # n : number of RS projections # scale : if TRUE randomized projections are scaled # seed : numerical seed for the random generator (if seed= -1 a different seed is generated each time) # AC: if TRUE (default) the AC indices are computed. # distance : it must be one of the two: "euclidean" (default) or "pearson" (that is 1 - Pearson correlation) # Output: # a list with six components: "validity", "overall.validity", "similarity.matrix", "dim", # "cluster", "orig.cluster": # "validity" is a vector with the validity of each of the c clusters; # "overall.validity" is the validity index of the overall cluster # "similarity.matrix" is the pairwise similarity matrix between examples. # "dimension" is the dimension of the random subspace dimension. # "cluster" is the list of the n clustering obtained by multiple fuzzy-k-means clustering on the projected subspace # "orig.cluster" is the list of the clusters in the original space Random.PAM.validity <- function(M, dim, pmethod="PMO", c=3, n=50, scale=TRUE, seed=-1, AC=TRUE, distance="euclidean") { dim.Sim.M <- ncol(M); if (seed == -1) seed <- round(runif(1,1,10000)); # Computing the clusters in the original space if (distance == "euclidean") d <- dist (t(M)) else d <- as.dist(1 - cor(M)); cl.v <- pam (d,c,cluster.only=TRUE); cl.orig <- Transform.vector.to.list(cl.v); # Perform multiple clusterings using randomized embeddings cl <- Multiple.Random.PAM (M=M, dim=dim, pmethod=pmethod, c=c, n=n, scale=scale, seed=seed, distance=distance); # Update similarity matrix Sim.M <- Do.similarity.matrix(cl, dim.Sim.M); # Computing the list of validity measures # Computing the validity indices vi c <- length(cl.orig); vi <- Validity.indices(cl.orig, c, Sim.M); # Computing overall (average) validity of the clustering: ov.vi <- sum(vi)/c; # Computing the AC indices: if (AC == TRUE) { ac <- AC.index(cl.orig, c, Sim.M); res <- list (validity=vi, overall.validity=ov.vi, similarity.matrix=Sim.M, dimension=dim, cluster=cl, orig.cluster=cl.orig, AC=ac); } else res <- list (validity=vi, overall.validity=ov.vi, similarity.matrix=Sim.M, dimension=dim, cluster=cl, orig.cluster=cl.orig); return(res); }