##############################################################################################################
# clusterv2.R
# August 2005
# modified february 2006
# Set of functions to perform multiple clustering on randomly projected data 
# using a specific clustering algorithm
##############################################################################################################

library(stats);
library(cluster);
#source("rp.R");
#source("clusterv.R");

############################################################################
# Vector to list cluster representation
# It transformst  a clustering from a vector representation to a list representation.
# It accepts as input a vector that represents a clustering; the indices of the vectors refer to the examples and their
# integer contento to the number of the cluster they belong. The function returns a list that represents the same
# clustering; each element of the list is a vector representing the cluster. The elements of the vectors are the indices
# of the examples (that is they correspond to the indices of the vector representation of the clustering).
# This list representation of the cluster may be used to compute the validity indices of the clustering.
# Input:
# v : vector representing the clustering:  
# Output:
# a list  that represents the clustering
Transform.vector.to.list <- function (v) {
  if (is.integer(v) == FALSE)
	  stop("Transform.vector.to.list: the elements of the input vector must be integers", call.=FALSE);
	# relabeling of the classes in order to avoid (that is the labels of the classes will be consecutive integers)
	n.examples <- length(v);
	new.v <- integer(n.examples); # vector with relabeled elements
	max.class <- max(v);
	v.index <- integer(max.class); # vector of the class indices for label translation
	label.class <- 0;
	
	for (i in 1:n.examples) {
	  old.label <- v[i]; # old label of the ith example
		if (v.index[old.label] == 0) {
		  label.class <- label.class + 1;
			v.index[old.label] <- label.class;
	  }
		new.v[i] <- v.index[old.label];	
	}
	# building of the list of clusters
	cl =list();
	for (i in 1:n.examples) {
	  if (length(cl) < new.v[i]) 
	    cl[[new.v[i]]] <- i
		else  
		  cl[[new.v[i]]][length(cl[[new.v[i]]]) + 1] <- i;
	}
	return(cl);
}


######### K-MEANS CLUSTERING ###############################

##########################
# Multiple Random k-means clustering.
# Multiple Random k-means clusterings are computed using random projections of data.
# It assumes that the label of the examples are integers starting from 1 to ncol(M).
# Several randomized maps may be used: RS, PMO, Normal and Achliopotas random projections
# Input:
# M : matrix of data: rows are variables and columns are examples
# dim : subspace dimension
# c : number of clusters
# pmethod : projection method. It must be one of the following: 
# "RS" (random subspace projection)
# "PMO" (Plus Minus One random projection)
# "Norm" (normal random projection)
# "Achlioptas" (Achlioptas random projection)
# n : number of RS projections
# it.max : maximum number of iteration of the k-means algorithm (default 1000)
# scale : if TRUE randomized projections are scaled (default)
# seed : numerical seed for the random generator
# Output:
# a list  of the n clusterings obtained by randomized hierarchical clustering 
Multiple.Random.kmeans <- function(M, dim, pmethod="PMO", c=3,  n=50, it.max=1000, scale=TRUE, seed=100, distance="euclidean") {
  set.seed(seed);
	cl <- list();
	for (i in 1:n) {
	  # A. selection of the randomized map
	  P.M <- switch(pmethod,
	                RS = random.subspace(d=dim, M, scaling=scale),
	                PMO = Plus.Minus.One.random.projection(d=dim, M, scaling=scale),
	                Norm = norm.random.projection(d=dim, M, scaling=scale),
	                Achlioptas = Achlioptas.random.projection(d=dim, M, scaling=scale),
									stop("Multiple.Random.kmeans: not supported random projection.", call.=FALSE));
		r<-kmeans(t(P.M), c, iter.max = it.max);
		cl[[i]] <- Transform.vector.to.list(r$cluster);
	}
	return(cl);								 
}


##########################
# Random k-means clustering and validity indices computation using random projections of data.
# This function applies a k-means clustering algorithm to the data and then computes stability indices for the
# obtained cluster using multiple random subspace projections. 
# It computes the validity indices for each cluster found in the original space, the overall validity index for the
# clsutering and (optionallly) the set of the AC indices.
# Different randomized maps (e.g. PMO, Achlioptas, Normal, Random Subspace projections) may be applied.
# It assumes that the label of the examples are integer starting from 1 to ncol(M).
# Note that the k-means algorithm strongly depends from the initial conditions. Hence choosing  different random seed
# we may obtain different results; setting seed=-1 (default) each time a different random seed is chosen.
# Input:
# M : matrix of data: rows are variables and columns are examples
# dim : subspace dimension
# c : number of cluster
# pmethod : projection method. It must be one of the following: 
# "RS" (random subspace projection)
# "PMO" (Plus Minus One random projection)
# "Norm" (normal random projection)
# "Achlioptas" (Achlioptas random projection)
# it.max : maximum number of iteration of the k-means algorithm (default 1000)
# n : number of RS projections
# scale : if TRUE randomized projections are scaled 
# seed : numerical seed for the random generator (if seed= -1 a different seed is generated each time)
# AC:   if TRUE (default) the AC indices are computed.
# Output:
# a list with six components: "validity", "overall.validity", "similarity.matrix", "dim", 
# "cluster", "orig.cluster":
# "validity" is a vector with the validity of each of the c clusters;
# "overall.validity" is the validity index of the overall cluster
# "similarity.matrix" is the pairwise similarity matrix between examples.
# "dimension" is the dimension of the random subspace dimension.
# "cluster" is the list of the n clustering obtained by multiple k-means clustering on the projected subspace
# "orig.cluster" is the list of the clusters in the original space
Random.kmeans.validity <- function(M, dim, pmethod="PMO", c=3, it.max=1000, n=50, scale=TRUE, seed=-1, AC=TRUE) {
  dim.Sim.M <- ncol(M);
	if (seed == -1)
	   seed <- round(runif(1,1,10000));
	# Computing the clusters in the original space	
	r<-kmeans(t(M), c, iter.max = it.max);
	cl.orig <- Transform.vector.to.list(r$cluster);
	
	# Perform multiple clusterings using randomized embeddings
	cl <- Multiple.Random.kmeans (M=M, dim=dim, pmethod=pmethod, c=c, n=n, it.max=it.max, scale=scale, seed=seed);
	
	# Update similarity matrix
	Sim.M <- Do.similarity.matrix(cl, dim.Sim.M);
	
	# Computing the list of validity measures
	# Computing the validity indices vi
	c <- length(cl.orig);  # it corrects for empty sets
	vi <- Validity.indices(cl.orig, c, Sim.M);
	
	# Computing overall (average) validity of the clustering:
	ov.vi <- sum(vi)/c;	
	
	# Computing the AC indices:
	if (AC == TRUE) {
	  ac <- AC.index(cl.orig, c, Sim.M);		
		res <- list (validity=vi, overall.validity=ov.vi, similarity.matrix=Sim.M, dimension=dim, 
	             cluster=cl, orig.cluster=cl.orig, AC=ac);
	}														 
	else															 
	  res <- list (validity=vi, overall.validity=ov.vi, similarity.matrix=Sim.M, dimension=dim, 
	              cluster=cl, orig.cluster=cl.orig);
	return(res);	
}

##################################################################
######### FUZZY K-MEANS CLUSTERING ###############################
##################################################################

##########################
# Multiple Random fuzzy-k-means clustering.
# Multiple Random fuzzy-k-means clusterings are computed using random projections of data.
# The crisp clustering is obtained by defuzzification via the nearest crisp clustering: each example is assigned to the
# cluster for which it has the largest membership. The base fuzzy algorithm used is \code{fanny} of the \code{cluster}
# package.
# It assumes that the label of the examples are integers starting from 1 to ncol(M).
# Several randomized maps may be used: RS, PMO, Normal and Achliopotas random projections
# Input:
# M : matrix of data: rows are variables and columns are examples
# dim : subspace dimension
# c : number of clusters
# pmethod : projection method. It must be one of the following: 
# "RS" (random subspace projection)
# "PMO" (Plus Minus One random projection)
# "Norm" (normal random projection)
# "Achlioptas" (Achlioptas random projection)
# n : number of RS projections
# scale : if TRUE randomized projections are scaled (default)
# seed : numerical seed for the random generator
# distance : it must be one of the two: "euclidean" (default) or "pearson" (that is 1 - Pearson correlation)
# Output:
# a list  of the n clusterings obtained by the fuzzy-k-means algorithm clustering

Multiple.Random.fuzzy.kmeans <- function(M, dim, pmethod="PMO", c=3,  n=50, 
                                        scale=TRUE, seed=-1, distance="euclidean") {
  if (seed == -1)
	   seed <- round(runif(1,1,10000));
	set.seed(seed);
	cl <- list();
	for (i in 1:n) {
	  # A. selection of the randomized map
	  P.M <- switch(pmethod,
	                RS = random.subspace(d=dim, M, scaling=scale),
	                PMO = Plus.Minus.One.random.projection(d=dim, M, scaling=scale),
	                Norm = norm.random.projection(d=dim, M, scaling=scale),
	                Achlioptas = Achlioptas.random.projection(d=dim, M, scaling=scale),
									stop("Multiple.Random.kmeans: not supported random projection.", call.=FALSE));
	  if (distance == "euclidean")
		  d <- dist (t(P.M))
		else 
			d <- as.dist(1 - cor(P.M));
		r<- fanny(d, c, maxit = 1000);		
		cl[[i]] <- Transform.vector.to.list(r$clustering);
	}
	return(cl);								 
}


##########################
# Random fuzzy-k-means clustering and validity indices computation using random projections of data.
# This function applies the fuzzy-k-means clustering algorithm to the data and then computes stability indices for the
# obtained cluster using multiple random subspace projections. 
# It computes the validity indices for each cluster found in the original space, the overall validity index for the
# clsutering and (optionallly) the set of the AC indices.
# Different randomized maps (e.g. PMO, Achlioptas, Normal, Random Subspace projections) may be applied.
# It assumes that the label of the examples are integer starting from 1 to ncol(M).
# Note that the fuzzy-k-means algorithm strongly depends from the initial conditions. Hence choosing  different random seed
# we may obtain different results; setting seed=-1 (default) each time a different random seed is chosen.
# Input:
# M : matrix of data: rows are variables and columns are examples
# dim : subspace dimension
# c : number of cluster
# pmethod : projection method. It must be one of the following: 
# "RS" (random subspace projection)
# "PMO" (Plus Minus One random projection)
# "Norm" (normal random projection)
# "Achlioptas" (Achlioptas random projection)
# n : number of RS projections
# scale : if TRUE randomized projections are scaled 
# seed : numerical seed for the random generator (if seed= -1 a different seed is generated each time)
# AC:   if TRUE (default) the AC indices are computed.
# Output:
# a list with six components: "validity", "overall.validity", "similarity.matrix", "dim", 
# "cluster", "orig.cluster":
# "validity" is a vector with the validity of each of the c clusters;
# "overall.validity" is the validity index of the overall cluster
# "similarity.matrix" is the pairwise similarity matrix between examples.
# "dimension" is the dimension of the random subspace dimension.
# "cluster" is the list of the n clustering obtained by multiple fuzzy-k-means clustering on the projected subspace
# "orig.cluster" is the list of the clusters in the original space
Random.fuzzy.kmeans.validity <- function(M, dim, pmethod="PMO", c=3, n=50, 
                                         scale=TRUE, seed=-1, AC=TRUE, distance="euclidean") {
  dim.Sim.M <- ncol(M);
	if (seed == -1)
	   seed <- round(runif(1,1,10000));
	# Computing the clusters in the original space	
	if (distance == "euclidean")
		d <- dist (t(M))
	else 
		d <- as.dist(1 - cor(M));
	r<- fanny(d, c, maxit = 1000);	
	cl.orig <- Transform.vector.to.list(r$clustering);
	
	
	# Perform multiple clusterings using randomized embeddings
	cl <- Multiple.Random.fuzzy.kmeans (M=M, dim=dim, pmethod=pmethod, c=c, n=n, 
	                                    scale=scale, seed=seed, distance=distance);
	
	# Update similarity matrix
	Sim.M <- Do.similarity.matrix(cl, dim.Sim.M);
	
	# Computing the list of validity measures
	# Computing the validity indices vi
	c <- length(cl.orig);
	vi <- Validity.indices(cl.orig, c, Sim.M);
	
	# Computing overall (average) validity of the clustering:
	ov.vi <- sum(vi)/c;	
	
	# Computing the AC indices:
	if (AC == TRUE) {
	  ac <- AC.index(cl.orig, c, Sim.M);		
		res <- list (validity=vi, overall.validity=ov.vi, similarity.matrix=Sim.M, dimension=dim, 
	             cluster=cl, orig.cluster=cl.orig, AC=ac);
	}														 
	else															 
	  res <- list (validity=vi, overall.validity=ov.vi, similarity.matrix=Sim.M, dimension=dim, 
	              cluster=cl, orig.cluster=cl.orig);
	return(res);	
}


##################################################################
######### PAM CLUSTERING #########################################
##################################################################

##########################
# Multiple Random PAM clustering.
# Multiple Random Partition Around Meodoids (PAM) clusterings are computed using random projections of data.
# The \code{pam} function of the package \code{cluster} is used as implementation of the base PAM algorithm.
# It assumes that the label of the examples are integers starting from 1 to ncol(M).
# Several randomized maps may be used: RS, PMO, Normal and Achliopotas random projections
# Input:
# M : matrix of data: rows are variables and columns are examples
# dim : subspace dimension
# c : number of clusters
# pmethod : projection method. It must be one of the following: 
# "RS" (random subspace projection)
# "PMO" (Plus Minus One random projection)
# "Norm" (normal random projection)
# "Achlioptas" (Achlioptas random projection)
# n : number of RS projections
# scale : if TRUE randomized projections are scaled (default)
# seed : numerical seed for the random generator
# distance : it must be one of the two: "euclidean" (default) or "pearson" (that is 1 - Pearson correlation)
# Output:
# a list  of the n clusterings obtained by the PAM algorithm clustering

Multiple.Random.PAM <- function(M, dim, pmethod="PMO", c=3,  n=50, 
                                        scale=TRUE, seed=-1, distance="euclidean") {
  if (seed == -1)
	   seed <- round(runif(1,1,10000));
	set.seed(seed);
	cl <- list();
	for (i in 1:n) {
	  # A. selection of the randomized map
	  P.M <- switch(pmethod,
	                RS = random.subspace(d=dim, M, scaling=scale),
	                PMO = Plus.Minus.One.random.projection(d=dim, M, scaling=scale),
	                Norm = norm.random.projection(d=dim, M, scaling=scale),
	                Achlioptas = Achlioptas.random.projection(d=dim, M, scaling=scale),
									stop("Multiple.Random.kmeans: not supported random projection.", call.=FALSE));
	  if (distance == "euclidean")
		  d <- dist (t(P.M))
		else 
			d <- as.dist(1 - cor(P.M));
		cl.v <- pam (d,c,cluster.only=TRUE);	
		cl[[i]] <- Transform.vector.to.list(cl.v);
	}
	return(cl);								 
}


##########################
# Random PAM clustering and validity indices computation using random projections of data.
# This function applies the Prediction Arounf Medoids (PAM) clustering algorithm to the data and then computes 
# stability indices for the obtained cluster using multiple random subspace projections. 
# It computes the validity indices for each cluster found in the original space, the overall validity index for the
# clsutering and (optionallly) the set of the AC indices.
# Different randomized maps (e.g. PMO, Achlioptas, Normal, Random Subspace projections) may be applied.
# It assumes that the label of the examples are integer starting from 1 to ncol(M).
# The \code{pam} function of the package \code{cluster} is used as implementation of the base PAM algorithm.
# Input:
# M : matrix of data: rows are variables and columns are examples
# dim : subspace dimension
# c : number of cluster
# pmethod : projection method. It must be one of the following: 
# "RS" (random subspace projection)
# "PMO" (Plus Minus One random projection)
# "Norm" (normal random projection)
# "Achlioptas" (Achlioptas random projection)
# n : number of RS projections
# scale : if TRUE randomized projections are scaled 
# seed : numerical seed for the random generator (if seed= -1 a different seed is generated each time)
# AC:   if TRUE (default) the AC indices are computed.
# distance : it must be one of the two: "euclidean" (default) or "pearson" (that is 1 - Pearson correlation)
# Output:
# a list with six components: "validity", "overall.validity", "similarity.matrix", "dim", 
# "cluster", "orig.cluster":
# "validity" is a vector with the validity of each of the c clusters;
# "overall.validity" is the validity index of the overall cluster
# "similarity.matrix" is the pairwise similarity matrix between examples.
# "dimension" is the dimension of the random subspace dimension.
# "cluster" is the list of the n clustering obtained by multiple fuzzy-k-means clustering on the projected subspace
# "orig.cluster" is the list of the clusters in the original space
Random.PAM.validity <- function(M, dim, pmethod="PMO", c=3, n=50, 
                                         scale=TRUE, seed=-1, AC=TRUE, distance="euclidean") {
  dim.Sim.M <- ncol(M);
	if (seed == -1)
	   seed <- round(runif(1,1,10000));
	# Computing the clusters in the original space	
	if (distance == "euclidean")
		d <- dist (t(M))
	else 
		d <- as.dist(1 - cor(M));
	cl.v <- pam (d,c,cluster.only=TRUE);	
	cl.orig <- Transform.vector.to.list(cl.v);
	
	
	# Perform multiple clusterings using randomized embeddings
	cl <- Multiple.Random.PAM (M=M, dim=dim, pmethod=pmethod, c=c, n=n, 
	                                    scale=scale, seed=seed, distance=distance);
	
	# Update similarity matrix
	Sim.M <- Do.similarity.matrix(cl, dim.Sim.M);
	
	# Computing the list of validity measures
	# Computing the validity indices vi
	c <- length(cl.orig);
	vi <- Validity.indices(cl.orig, c, Sim.M);
	
	# Computing overall (average) validity of the clustering:
	ov.vi <- sum(vi)/c;	
	
	# Computing the AC indices:
	if (AC == TRUE) {
	  ac <- AC.index(cl.orig, c, Sim.M);		
		res <- list (validity=vi, overall.validity=ov.vi, similarity.matrix=Sim.M, dimension=dim, 
	             cluster=cl, orig.cluster=cl.orig, AC=ac);
	}														 
	else															 
	  res <- list (validity=vi, overall.validity=ov.vi, similarity.matrix=Sim.M, dimension=dim, 
	              cluster=cl, orig.cluster=cl.orig);
	return(res);	
}