##############################################################################################################
# mosclust.R
# February, March 2006
# Set of functions for implementing model order selection for clustering algorithms
# May 2006: added functions to implement Bernstein hypothesis testing
# June 2006: modified functions for hypothesis testing
# August 2006: added functions to implement perturbations through injection of random noise
##############################################################################################################

#require(stats);
#require(cluster);
#if (require(clusterv) == FALSE)
#  stop("The clusterv library is needed to run mosclust");

source("rp.R");
source("rData.R");
source("clusterv.R");
source("clusterv2.R");

############################################################################################
# Hybrid testing based on stability methods for model order selection.
# The function accepts as input a similarity matrix that stores the similarity measures between multiple pairs of
# clusterings for different numbers of clusters. Each row of the matrix corresponds to a k-clustering, each column
# to a different repeated measure.
# Note that the similarities can be computed using different clustering algorithms, different perturbation methods
# (resampling techniques, random projections or noise-injection methods) and different similarity measures.
# At first the Bernstein inequality-based test is performed. When the null hypothesis cannot be rejected, the
# chi square-based test is performed on the remaining top-ranked clusterings.
# The stability index for a given clustering is computed as the mean of the similarity indices between pairs of
# k-clusterings obtained from the perturbed data. The similarity matrix given as input can be obtained from the
# functions do.similarity.resampling, do.similarity.projection, do.similarity.noise.
# Input:
# sim.matrix : a matrix that stores the similarity between pairs of clusterings across multiple numbers of clusters
#              and multiple clusterings.
# alpha : significance level (default 0.01)
# s0 : threshold for the similarity value (default 0.9)
# Output:
# l : list with 6 elements:
#     "n.Bernstein.selected" : number of clusterings selected as significant by the Bernstein test
#     "n.chi.sq.selected" : number of clusterings selected as significant by the chi square test. It may be
#                           equal to 0 if the Bernstein test selects only 1 clustering.
#     "Bernstein.res" : data frame with the p-values obtained from the Bernstein inequality
#     "chi.sq.res" : data frame with the p-values obtained from the chi square test. If only 1 clustering is
#                    significant according to the Bernstein inequality test, this component is NULL
#     "selected.res" : data frame with the results relative to the clusterings selected by the overall hybrid test
#     "F" : a list of cumulative distribution functions (of class ecdf) (not sorted).
Hybrid.testing <- function(sim.matrix, alpha=0.01, s0=0.9) {
  F <- compute.cumulative.multiple(sim.matrix);
  n <- length(F);
  n.chi.sq.selected <- 0;
  n.Bernstein.selected <- 0;

  d.Bernstein <- Bernstein.compute.pvalues(sim.matrix);
  d.Bernstein.selected <- Hypothesis.testing(d.Bernstein, alpha=alpha);
  n.Bernstein.selected <- nrow(d.Bernstein.selected);
  if (n.Bernstein.selected > 1) {
    selected.clusterings <- d.Bernstein.selected$ordered.clusterings - 1;
    d.chi.sq <- Chi.square.compute.pvalues(sim.matrix[selected.clusterings,], s0 = s0);
    d.chi.sq$ordered.clusterings <- d.Bernstein.selected$ordered.clusterings; # to preserve the correct labelings
    d.chi.sq.selected <- Hypothesis.testing(d.chi.sq, alpha=alpha);
    n.chi.sq.selected <- nrow(d.chi.sq.selected);
    d.selected <- d.chi.sq.selected;
  } else {
    d.selected <- d.Bernstein.selected;
    d.chi.sq <- NULL;
  }
  l <- list(n.Bernstein.selected=n.Bernstein.selected, n.chi.sq.selected=n.chi.sq.selected,
            Bernstein.res = d.Bernstein, chi.sq.res = d.chi.sq, selected.res = d.selected, F=F);
  return(l);
}

############################################################################################
# Function to select significant clusterings from a given set of p-values.
# For a given set of p-values returned from a given hypothesis test, it returns the items for which there is no
# significant difference at alpha significance level (that is, the items for which p > alpha).
# Input:
# d : data frame returned by Bernstein.compute.pvalues or Chi.square.compute.pvalues
# alpha : significance level (default 0.01)
# Output: a data frame corresponding to the clusterings significant at alpha level
#
Hypothesis.testing <- function(d, alpha=0.01) {
  n.clusterings <- nrow(d);
  significant.difference <- TRUE;
  if (n.clusterings >= 2) # guard against a single-row data frame
    for (i in n.clusterings:2) {
      if (d$p.value[i] > alpha) {
        significant.difference <- FALSE;
        break;
      }
    }
  if (significant.difference == FALSE) # there is no significant difference between the first and the i-th clustering
    significant.clusters <- i
  else
    significant.clusters <- 1;
  return(d[1:significant.clusters,]);
}
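##############################################################################
# Example (illustrative sketch, guarded so it does not run at source time): a
# minimal end-to-end use of the hybrid test. The data matrix `X.toy` below is
# synthetic, and the parameter values are arbitrary, chosen only to keep the
# example fast; they are not recommendations.
if (FALSE) {
  set.seed(1);
  # 100 variables (rows) x 60 examples (columns), two well-separated groups
  X.toy <- cbind(matrix(rnorm(100*30, mean=0), nrow=100),
                 matrix(rnorm(100*30, mean=3), nrow=100));
  # similarity matrix for k = 2,...,6 via subsampling + hierarchical clustering
  S <- do.similarity.resampling(X.toy, c=6, nsub=20, f=0.8, s=sFM);
  res <- Hybrid.testing(S, alpha=0.01, s0=0.9);
  res$selected.res;  # clusterings retained by the overall hybrid test
}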
############################################################################################
# Function to compute the stability indices and the p-values associated to a set of clusterings
# according to the chi-square test between multiple proportions.
# For a given similarity matrix, a list of stability indices and corresponding p-values is returned, sorted
# in descending order from the most significant clustering to the least significant. A low p-value means that
# there is a significant difference between the top-sorted and the given k-clustering.
# The stability index for a given clustering is computed as the mean of the similarity indices between pairs of
# k-clusterings obtained from the perturbed data. The similarity matrix given as input can be obtained from the
# functions do.similarity.resampling, do.similarity.projection, do.similarity.noise.
# Input:
# sim.matrix : a matrix that stores the similarity between pairs of clusterings across multiple numbers of clusters
#              and multiple clusterings.
#              Rows correspond to the different clusterings; columns to the n repeated clusterings for each number
#              of clusters. Row 1 corresponds to a 2-clustering, row 2 to a 3-clustering, ... row m to an (m+1)-clustering.
# s0 : threshold for the similarity value (default 0.9)
# Output: a data frame with 4 components:
# ordered.clusterings : a vector with the numbers of clusters ordered from the most significant to the least significant
# p.value : a vector with the corresponding p-values computed according to the chi-square test between multiple
#           proportions, in descending order (their values correspond to the clusterings of the vector ordered.clusterings)
# means : vector with the stability index (mean similarity) for each k-clustering
# variance : vector with the variance of the similarity for each k-clustering
Chi.square.compute.pvalues <- function(sim.matrix, s0 = 0.9) {
  n.clusterings <- nrow(sim.matrix);
  n.measures <- ncol(sim.matrix);
  ordered.clusterings <- integer(n.clusterings);
  p.value <- numeric(n.clusterings);
  variance <- numeric(n.clusterings);
  means <- rowMeans(sim.matrix); # stability index of each k-clustering
  for (i in 1:n.clusterings)
    variance[i] <- var(sim.matrix[i,]);
  sorted.means <- sort(means, decreasing=TRUE);
  sorted.indices <- order(means, decreasing=TRUE);
  means <- sorted.means;
  ordered.clusterings <- sorted.indices + 1;
  variance <- variance[sorted.indices];
  ordered.sim.matrix <- sim.matrix[sorted.indices,];
  p.value[1] <- 1;
  for (k in 2:n.clusterings)
    p.value[k] <- Compute.Chi.sq(ordered.sim.matrix[1:k,], s0);
  d <- data.frame(ordered.clusterings=ordered.clusterings, p.value=p.value, means=means, variance=variance);
  rownames(d) <- 1:n.clusterings;
  return(d);
}

############################################################################
# Function to evaluate whether a set of similarity distributions significantly differ, using the chi square test.
# The set of similarity values for a specific value of k (number of clusters) is subdivided in two groups
# by choosing a threshold for the similarity value (default 0.9). Then the different sets are compared using the
# chi squared test. The number of degrees of freedom is equal to the number of the different sets minus 1.
# Input:
# M : matrix representing the similarity values for different numbers of clusters. Each row represents the
#     similarity values for a given number of clusters. Number of rows ==> how many numbers of clusters are
#     considered; number of columns ==> cardinality of the similarity values for a given number of clusters
# s0 : threshold for the similarity value (default 0.9)
# Output:
# p.value : p-value associated to the significant difference (type I error)
Compute.Chi.sq <- function(M, s0 = 0.9) {
  n <- ncol(M);
  K <- nrow(M);
  x <- numeric(K); # x[k] counts the similarities of row k that exceed the threshold s0
  for (k in 1:K)
    for (j in 1:n)
      if (M[k,j] > s0)
        x[k] <- x[k] + 1;
  theta <- sum(x)/(n*K); # pooled estimate of the proportion above s0
  if ((theta == 0) || (theta == 1)) # the proportions are equal
    p.value <- 1
  else {
    chi.statistic <- sum((x - n*theta)^2)/(n*theta*(1-theta));
    p.value <- 1 - pchisq(chi.statistic, K-1);
  }
  return(p.value);
}
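##############################################################################
# Example (illustrative sketch, not run at source time): two rows of similarity
# values with clearly different proportions above s0 = 0.9 should yield a small
# p-value, while two identical rows should yield p = 1. The numbers below are
# made up.
if (FALSE) {
  stable   <- rep(c(0.95, 0.92), 10);   # 20 similarities, all above 0.9
  unstable <- rep(c(0.95, 0.60), 10);   # only half above 0.9
  Compute.Chi.sq(rbind(stable, unstable), s0=0.9);  # small p-value expected
  Compute.Chi.sq(rbind(stable, stable),   s0=0.9);  # theta == 1, so p = 1
}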
############################################################################################
# Function to compute the stability indices and the p-values associated to a set of clusterings
# according to the Bernstein inequality.
# For a given similarity matrix, a list of p-values is returned, sorted in descending order from the most
# significant clustering to the least significant. A low p-value means that there is a significant difference
# between the top-sorted and the given clustering.
# The stability index for a given clustering is computed as the mean of the similarity indices between pairs of
# k-clusterings obtained from the perturbed data. The similarity matrix given as input can be obtained from the
# functions do.similarity.resampling, do.similarity.projection, do.similarity.noise.
# Input:
# sim.matrix : a matrix that stores the similarity between pairs of clusterings across multiple numbers of clusters
#              and multiple clusterings.
#              Rows correspond to the different clusterings; columns to the n repeated clusterings for each number
#              of clusters. Row 1 corresponds to a 2-clustering, row 2 to a 3-clustering, ... row m to an (m+1)-clustering.
# Output: a data frame with 4 components:
# ordered.clusterings : a vector with the numbers of clusters ordered from the most significant to the least significant
# p.value : a vector with the corresponding p-values computed according to the Bernstein inequality and Bonferroni
#           correction, in descending order (their values correspond to the clusterings of the vector ordered.clusterings)
# means : vector with the mean similarity for each clustering
# variance : vector with the variance of the similarity for each clustering
#
Bernstein.compute.pvalues <- function(sim.matrix) {
  n.clusterings <- nrow(sim.matrix);
  n.measures <- ncol(sim.matrix);
  ordered.clusterings <- integer(n.clusterings);
  p.value <- numeric(n.clusterings);
  variance <- numeric(n.clusterings);
  means <- rowMeans(sim.matrix); # stability index of each k-clustering
  for (i in 1:n.clusterings)
    variance[i] <- var(sim.matrix[i,]);
  sorted.means <- sort(means, decreasing=TRUE);
  sorted.indices <- order(means, decreasing=TRUE);
  means <- sorted.means;
  ordered.clusterings <- sorted.indices + 1;
  variance <- variance[sorted.indices];
  p.value[1] <- 1;
  for (i in 2:n.clusterings)
    p.value[i] <- Bernstein.p.value(n.measures, means[1]-means[i], variance[1]+variance[i]);
  # Bonferroni-style correction: cumulate the p-values from the least significant clustering upwards
  if (n.clusterings > 2)
    for (i in (n.clusterings-1):2)
      p.value[i] <- p.value[i] + p.value[i+1];
  p.value[p.value > 1] <- 1;
  d <- data.frame(ordered.clusterings=ordered.clusterings, p.value=p.value, means=means, variance=variance);
  rownames(d) <- 1:n.clusterings;
  return(d);
}

############################################################################################
# Function to compute the p-values associated to a set of clusterings according to the Bernstein inequality,
# under the assumption that the random variables are independent.
# For a given similarity matrix, a list of p-values is returned, sorted in descending order from the most
# significant clustering to the least significant. A low p-value means that there is a significant difference
# between the top-sorted and the given clusterings.
# Input:
# sim.matrix : a matrix that stores the similarity between pairs of clusterings across multiple numbers of clusters
#              and multiple clusterings.
#              Rows correspond to the different clusterings; columns to the n repeated clusterings for each number
#              of clusters. Row 1 corresponds to a 2-clustering, row 2 to a 3-clustering, ... row m to an (m+1)-clustering.
# Output: a data frame with 4 components:
# ordered.clusterings : a vector with the numbers of clusters ordered from the most significant to the least significant
# p.value : a vector with the corresponding p-values computed according to the Bernstein inequality and Bonferroni
#           correction, in descending order (their values correspond to the clusterings of the vector ordered.clusterings)
# means : vector with the mean similarity for each clustering
# variance : vector with the variance of the similarity for each clustering
#
Bernstein.ind.compute.pvalues <- function(sim.matrix) {
  n.clusterings <- nrow(sim.matrix);
  n.measures <- ncol(sim.matrix);
  ordered.clusterings <- integer(n.clusterings);
  p.value <- numeric(n.clusterings);
  variance <- numeric(n.clusterings);
  means <- rowMeans(sim.matrix); # stability index of each k-clustering
  for (i in 1:n.clusterings)
    variance[i] <- var(sim.matrix[i,]);
  sorted.means <- sort(means, decreasing=TRUE);
  sorted.indices <- order(means, decreasing=TRUE);
  means <- sorted.means;
  ordered.clusterings <- sorted.indices + 1;
  variance <- variance[sorted.indices];
  p.value[1] <- 1;
  for (i in 2:n.clusterings)
    p.value[i] <- Bernstein.p.value(n.measures, means[1]-means[i], variance[1]+variance[i]);
  # under the independence assumption the p-values are combined multiplicatively
  if (n.clusterings > 2)
    for (i in 3:n.clusterings)
      p.value[i] <- p.value[i] * p.value[i-1];
  d <- data.frame(ordered.clusterings=ordered.clusterings, p.value=p.value, means=means, variance=variance);
  rownames(d) <- 1:n.clusterings;
  return(d);
}

############################################################################################
# Function to compute the p-value according to the Bernstein inequality.
# Input:
# n : number of observations of the random variable
# Delta : difference between the means
# v : variance of the random variable
# Output: the upper bound to the probability
#
Bernstein.p.value <- function(n, Delta, v) {
  if ((Delta == 0) && (v == 0))
    return(1)
  else {
    p <- exp((-n*Delta^2)/(2*v + (2/3)*Delta));
    return(p);
  }
}
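##############################################################################
# Example (illustrative sketch, not run at source time): the Bernstein bound
# decreases quickly with the number of measures n and with the gap Delta
# between mean similarities. The values below are arbitrary.
if (FALSE) {
  # gap of 0.05 between mean similarities, pooled variance 0.01, 100 measures:
  # exp(-(100 * 0.05^2) / (2*0.01 + (2/3)*0.05)) = exp(-0.25/0.0533) ~ 0.0092
  Bernstein.p.value(n=100, Delta=0.05, v=0.01);
  Bernstein.p.value(n=100, Delta=0.20, v=0.01);  # much smaller for a larger gap
}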
############################################################################################
# Function that iteratively computes sets of similarity indices using resampling techniques.
# This function may use different clustering algorithms and different similarity measures to compute the
# similarity indices. Subsampling techniques are applied to perturb the data.
# Input:
# X : matrix of data (variables are rows, examples columns)
# c : if it is a vector of length 1, numbers of clusters from 2 to c are considered; otherwise the numbers of
#     clusters stored in the vector c are considered.
# nsub : number of pairs of subsamples
# f : fraction of the data resampled without replacement
# s : similarity function to be used. It may be one of the following:
#     - sFM (Fowlkes and Mallows)
#     - sJaccard (Jaccard)
#     - sM (matching coefficient)
#     (default Fowlkes and Mallows)
# alg.clust.sim : method that computes the similarity indices using subsampling techniques and a specific
#                 clustering algorithm. It may be one of the following:
#                 - Hierarchical.sim.resampling (hierarchical clustering algorithm, default)
#                 - Kmeans.sim.resampling (c-means algorithm)
#                 - PAM.sim.resampling (Partitioning Around Medoids algorithm)
#                 - Fuzzy.kmeans.sim.resampling (fuzzy c-means)
# distance : it must be one of the two: "euclidean" (default) or "pearson" (that is, 1 - Pearson correlation)
# hmethod : the agglomeration method to be used. This parameter is used only by the hierarchical clustering
#           algorithm. This should be one of the following:
#           "ward", "single", "complete", "average", "mcquitty", "median" or "centroid", according to the hclust
#           method of the package stats.
# Output:
# sim.matrix : a matrix that stores the similarity between pairs of clusterings across multiple numbers of clusters
#              and multiple clusterings performed on subsamples of the original data.
#              Number of rows equal to the length of c (numbers of clusters); number of columns equal to nsub,
#              that is the number of subsamples considered for each number of clusters.
do.similarity.resampling <- function(X, c=2, nsub=100, f=0.8, s=sFM, alg.clust.sim=Hierarchical.sim.resampling,
                                     distance="euclidean", hmethod="ward") {
  if (length(c) == 1)
    c <- 2:c;
  num.clustering <- length(c);
  for (i in c)
    if (i < 2) {
      print("Number of clusters must be at least 2.");
      return(0);
    }
  sim.matrix <- matrix(numeric(num.clustering*nsub), nrow=num.clustering);
  j <- 0;
  for (num.clust in c) {
    j <- j + 1;
    sim.matrix[j,] <- alg.clust.sim(X, c=num.clust, nsub=nsub, f=f, s=s, distance=distance, hmethod=hmethod);
  }
  return(sim.matrix);
}
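##############################################################################
# Example (illustrative sketch, not run at source time): the perturbation
# driver is agnostic to the underlying clustering algorithm and similarity
# measure; both are swapped via function arguments. `X.toy` is synthetic.
if (FALSE) {
  set.seed(1);
  X.toy <- matrix(rnorm(100*40), nrow=100);
  # PAM instead of hierarchical clustering, Jaccard instead of Fowlkes-Mallows
  S.pam <- do.similarity.resampling(X.toy, c=c(2,3,5), nsub=20, f=0.8,
                                    s=sJaccard, alg.clust.sim=PAM.sim.resampling);
  dim(S.pam);  # 3 rows (k = 2, 3, 5) x 20 columns
}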
############################################################################################
# Function that iteratively computes sets of similarity indices using random projection techniques.
# This function may use different clustering algorithms and different similarity measures to compute the
# similarity indices. Random projection techniques are applied to perturb the data.
# Input:
# X : matrix of data (variables are rows, examples columns)
# c : if it is a vector of length 1, numbers of clusters from 2 to c are considered; otherwise the numbers of
#     clusters stored in the vector c are considered.
# nprojections : number of pairs of projected data
# dim : dimension of the projected data
# pmethod : projection method. It must be one of the following:
#           "RS" (random subspace projection)
#           "PMO" (Plus Minus One random projection) (default)
#           "Norm" (normal random projection)
#           "Achlioptas" (Achlioptas random projection)
# scale : if TRUE randomized projections are scaled (default)
# seed : numerical seed for the random generator
# s : similarity function to be used. It may be one of the following:
#     - sFM (Fowlkes and Mallows)
#     - sJaccard (Jaccard)
#     - sM (matching coefficient)
#     (default Fowlkes and Mallows)
# alg.clust.sim : method that computes the similarity indices using randomized map techniques and a specific
#                 clustering algorithm. It may be one of the following:
#                 - Hierarchical.sim.projection (hierarchical clustering algorithm, default)
#                 - Kmeans.sim.projection (c-means algorithm)
#                 - PAM.sim.projection (Partitioning Around Medoids algorithm)
#                 - Fuzzy.kmeans.sim.projection (fuzzy c-means)
# distance : it must be one of the two: "euclidean" (default) or "pearson" (that is, 1 - Pearson correlation)
# hmethod : the agglomeration method to be used if hierarchical clustering is chosen. This parameter is used only
#           by the hierarchical clustering algorithm. This should be one of the following:
#           "ward", "single", "complete", "average", "mcquitty", "median" or "centroid", according to the hclust
#           method of the package stats.
# Output:
# sim.matrix : a matrix that stores the similarity between pairs of clusterings across multiple numbers of clusters
#              and multiple clusterings performed on randomly projected data.
#              Number of rows equal to the length of c (numbers of clusters); number of columns equal to
#              nprojections, that is the number of projections considered for each number of clusters.
do.similarity.projection <- function(X, c=2, nprojections=100, dim=2, pmethod="PMO", scale=TRUE, seed=100, s=sFM,
                                     alg.clust.sim=Hierarchical.sim.projection, distance="euclidean", hmethod="ward") {
  if (length(c) == 1)
    c <- 2:c;
  num.clustering <- length(c);
  for (i in c)
    if (i < 2) {
      print("Number of clusters must be at least 2.");
      return(0);
    }
  sim.matrix <- matrix(numeric(num.clustering*nprojections), nrow=num.clustering);
  j <- 0;
  for (num.clust in c) {
    j <- j + 1;
    sim.matrix[j,] <- alg.clust.sim(X, c=num.clust, nprojections=nprojections, dim=dim, pmethod=pmethod,
                                    scale=scale, seed=seed, s=s, distance=distance, hmethod=hmethod);
  }
  return(sim.matrix);
}
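##############################################################################
# Example (illustrative sketch, not run at source time): random projections as
# the perturbation. `X.toy` is synthetic; dim=10 is an arbitrary choice for the
# projected dimension and should normally be tuned to the data.
if (FALSE) {
  set.seed(1);
  X.toy <- matrix(rnorm(500*40), nrow=500);
  S.proj <- do.similarity.projection(X.toy, c=5, nprojections=20, dim=10, pmethod="PMO");
  rowMeans(S.proj);  # stability index for k = 2, ..., 5
}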
############################################################################################
# Function that iteratively computes sets of similarity indices using injection of gaussian noise.
# This function may use different clustering algorithms and different similarity measures to compute the
# similarity indices. Injection of gaussian noise is applied to perturb the data.
# The gaussian noise added to the data has 0 mean, and its standard deviation is estimated from the data (it is
# set to a given percentile value of the standard deviations computed for each variable).
# Input:
# X : matrix of data (variables are rows, examples columns)
# c : if it is a vector of length 1, numbers of clusters from 2 to c are considered; otherwise the numbers of
#     clusters stored in the vector c are considered.
# nnoisy : number of pairs of noisy data
# perc : percentile of the standard deviations to be used for the added gaussian noise (default: median)
# seed : numerical seed for the random generator
# s : similarity function to be used. It may be one of the following:
#     - sFM (Fowlkes and Mallows)
#     - sJaccard (Jaccard)
#     - sM (matching coefficient or Rand Index)
#     (default Fowlkes and Mallows)
# alg.clust.sim : method that computes the similarity indices using injections of gaussian noise and a specific
#                 clustering algorithm. It may be one of the following:
#                 - Hierarchical.sim.noise (hierarchical clustering algorithm, default)
#                 - Kmeans.sim.noise (c-means algorithm)
#                 - PAM.sim.noise (Partitioning Around Medoids algorithm)
#                 - Fuzzy.kmeans.sim.noise (fuzzy c-means)
# distance : it must be one of the two: "euclidean" (default) or "pearson" (that is, 1 - Pearson correlation)
# hmethod : the agglomeration method to be used if hierarchical clustering is chosen. This parameter is used only
#           by the hierarchical clustering algorithm. This should be one of the following:
#           "ward", "single", "complete", "average", "mcquitty", "median" or "centroid", according to the hclust
#           method of the package stats.
# Output:
# sim.matrix : a matrix that stores the similarity between pairs of clusterings across multiple numbers of clusters
#              and multiple clusterings performed on noise-perturbed data.
#              Number of rows equal to the length of c (numbers of clusters); number of columns equal to nnoisy,
#              that is the number of perturbations by noise for each number of clusters.
do.similarity.noise <- function(X, c=2, nnoisy=100, perc=0.5, seed=100, s=sFM, alg.clust.sim=Hierarchical.sim.noise,
                                distance="euclidean", hmethod="ward") {
  if (length(c) == 1)
    c <- 2:c;
  num.clustering <- length(c);
  for (i in c)
    if (i < 2) {
      print("Number of clusters must be at least 2.");
      return(0);
    }
  set.seed(seed);
  sim.matrix <- matrix(numeric(num.clustering*nnoisy), nrow=num.clustering);
  j <- 0;
  for (num.clust in c) {
    j <- j + 1;
    sim.matrix[j,] <- alg.clust.sim(X, c=num.clust, nnoisy=nnoisy, perc=perc, s=s, distance=distance, hmethod=hmethod);
  }
  return(sim.matrix);
}
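##############################################################################
# Example (illustrative sketch, not run at source time): gaussian noise
# injection as the perturbation. perc=0.5 uses the median of the per-variable
# standard deviations as the standard deviation of the added noise; `X.toy` is
# synthetic.
if (FALSE) {
  set.seed(1);
  X.toy <- matrix(rnorm(100*40), nrow=100);
  S.noise <- do.similarity.noise(X.toy, c=4, nnoisy=20, perc=0.5);
  plot.multiple.hist.similarity(S.noise);  # one histogram per k = 2, 3, 4
}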
############################################################################################
# Function to compute the cluster similarities for a given number of clusters using
# subsampling techniques and hierarchical clustering.
# A vector of similarity measures between pairs of clusterings perturbed with resampling techniques is computed
# for a given number of clusters. The fraction of the resampled data (without replacement), the similarity measure
# and the type of hierarchical clustering may be selected.
# Input:
# X : matrix of data (variables are rows, examples columns)
# c : number of clusters
# nsub : number of subsamples
# f : fraction of the data resampled without replacement
# s : similarity function to be used. It may be one of the following:
#     - sFM (Fowlkes and Mallows)
#     - sJaccard (Jaccard)
#     - sM (matching coefficient)
#     (default Fowlkes and Mallows)
# distance : it must be one of the two: "euclidean" (default) or "pearson" (that is, 1 - Pearson correlation)
# hmethod : the agglomeration method to be used. This should be one of
#           "ward", "single", "complete", "average", "mcquitty", "median" or "centroid", according to the hclust
#           method of the package stats.
# Output:
# vector of the computed similarity measures (length equal to nsub)
Hierarchical.sim.resampling <- function(X, c = 2, nsub=100, f = 0.8, s = sFM, distance="euclidean", hmethod="ward") {
  n <- ncol(X);
  n.sub.ex <- ceiling(n * f);
  sim.vector <- numeric(nsub);
  for (i in 1:nsub) {
    sub1 <- sample(n, n.sub.ex);
    Xsub1 <- X[,sub1];
    colnames(Xsub1) <- sub1;
    if (distance == "euclidean")
      d <- dist(t(Xsub1))
    else
      d <- as.dist(1 - cor(Xsub1));
    tr1 <- hclust(d, method = hmethod);
    plot(tr1, main=""); # rect.hclust needs the dendrogram to be plotted
    cl1 <- rect.hclust(tr1, k = c);
    M1 <- Do.boolean.membership.matrix(cl1, n.sub.ex, sub1);

    sub2 <- sample(n, n.sub.ex);
    Xsub2 <- X[,sub2];
    colnames(Xsub2) <- sub2;
    if (distance == "euclidean")
      d <- dist(t(Xsub2))
    else
      d <- as.dist(1 - cor(Xsub2));
    tr2 <- hclust(d, method = hmethod);
    plot(tr2, main="");
    cl2 <- rect.hclust(tr2, k = c);
    M2 <- Do.boolean.membership.matrix(cl2, n.sub.ex, sub2);

    # examples common to the two subsamples
    sub.common <- Intersect(sub1, sub2);
    # extract from the membership matrices the rows and columns that
    # correspond to the examples common to the two subsamples
    label.examples <- as.character(sub.common);
    M1 <- M1[label.examples, label.examples];
    M2 <- M2[label.examples, label.examples];
    sim.vector[i] <- s(M1, M2);
  }
  return(sim.vector);
}

############################################################################################
# Function to compute the cluster similarities for a given number of clusters using
# randomized maps and hierarchical clustering.
# A vector of similarity measures between pairs of clusterings perturbed with random projections is computed
# for a given number of clusters. The dimension of the projected data, the type of randomized map, the similarity
# measure and the type of hierarchical clustering may be selected.
# Input:
# X : matrix of data (variables are rows, examples columns)
# c : number of clusters
# nprojections : number of pairs of projected data
# dim : dimension of the projected data
# pmethod : projection method. It must be one of the following:
#           "RS" (random subspace projection)
#           "PMO" (Plus Minus One random projection)
#           "Norm" (normal random projection)
#           "Achlioptas" (Achlioptas random projection)
# scale : if TRUE randomized projections are scaled (default)
# seed : numerical seed for the random generator
# s : similarity function to be used. It may be one of the following:
#     - sFM (Fowlkes and Mallows)
#     - sJaccard (Jaccard)
#     - sM (matching coefficient)
#     (default Fowlkes and Mallows)
# distance : it must be one of the two: "euclidean" (default) or "pearson" (that is, 1 - Pearson correlation)
# hmethod : the agglomeration method to be used. This should be one of
#           "ward", "single", "complete", "average", "mcquitty", "median" or "centroid", according to the hclust
#           method of the package stats.
# Output:
# vector of the computed similarity measures (length equal to nprojections)
Hierarchical.sim.projection <- function(X, c = 2, nprojections=100, dim=2, pmethod="RS", scale=TRUE, seed=100,
                                        s=sFM, distance="euclidean", hmethod="ward") {
  n <- ncol(X);
  sim.vector <- numeric(nprojections);
  cl <- Multiple.Random.hclustering(X, dim=dim, pmethod=pmethod, c=c, hmethod=hmethod, n=nprojections*2,
                                    scale=scale, distance=distance, seed=seed);
  for (i in 1:nprojections) {
    cl1 <- cl[[i]];
    M1 <- Do.boolean.membership.matrix(cl1, n, 1:n);
    cl2 <- cl[[i+nprojections]];
    M2 <- Do.boolean.membership.matrix(cl2, n, 1:n);
    sim.vector[i] <- s(M1, M2);
  }
  return(sim.vector);
}

############################################################################################
# Function to compute cluster similarities for a given number of clusters using
# injections of random noise and hierarchical clustering.
# A vector of similarity measures between pairs of clusterings perturbed with random noise is computed for a given
# number of clusters. The standard deviation of the added gaussian noise, estimated from the data as the perc
# percentile of the standard deviations of the input variables, the percentile itself, the similarity measure and
# the type of hierarchical clustering may be selected.
# Input:
# X : matrix of data (variables are rows, examples columns)
# c : number of clusters
# nnoisy : number of pairs of noisy data
# perc : percentile of the standard deviations to be used for the added gaussian noise (def. 0.5)
# s : similarity function to be used. It may be one of the following:
#     - sFM (Fowlkes and Mallows)
#     - sJaccard (Jaccard)
#     - sM (matching coefficient)
#     (default Fowlkes and Mallows)
# distance : it must be one of the two: "euclidean" (default) or "pearson" (that is, 1 - Pearson correlation)
# hmethod : the agglomeration method to be used. This should be one of
#           "ward", "single", "complete", "average", "mcquitty", "median" or "centroid", according to the hclust
#           method of the package stats.
# Output:
# vector of the computed similarity measures (length equal to nnoisy)
Hierarchical.sim.noise <- function(X, c = 2, nnoisy=100, perc=0.5, s = sFM, distance="euclidean", hmethod="ward") {
  n <- ncol(X);
  sim.vector <- numeric(nnoisy);
  for (i in 1:nnoisy) {
    X1 <- perturb.by.noise(X, perc);
    if (distance == "euclidean")
      d <- dist(t(X1))
    else
      d <- as.dist(1 - cor(X1));
    tr1 <- hclust(d, method = hmethod);
    plot(tr1, main=""); # rect.hclust needs the dendrogram to be plotted
    cl1 <- rect.hclust(tr1, k = c);
    M1 <- Do.boolean.membership.matrix(cl1, n, 1:n);

    X2 <- perturb.by.noise(X, perc);
    if (distance == "euclidean")
      d <- dist(t(X2))
    else
      d <- as.dist(1 - cor(X2));
    tr2 <- hclust(d, method = hmethod);
    plot(tr2, main="");
    cl2 <- rect.hclust(tr2, k = c);
    M2 <- Do.boolean.membership.matrix(cl2, n, 1:n);
    sim.vector[i] <- s(M1, M2);
  }
  return(sim.vector);
}

############################################################
######### K-MEANS CLUSTERING ###############################
############################################################

############################################################################################
# Function to compute cluster similarities for a given number of clusters using
# k-means clustering.
# A vector of similarity measures between pairs of clusterings perturbed with resampling techniques is computed
# for a given number of clusters. The fraction of the resampled data (without replacement) and the similarity
# measure can be selected.
# Input:
# X : matrix of data (variables are rows, examples columns)
# c : number of clusters
# nsub : number of subsamples
# f : fraction of the data resampled without replacement
# s : similarity function to be used (default Fowlkes and Mallows similarity measure)
# distance : currently only the euclidean distance ("euclidean", default) is available
# hmethod : parameter used for internal compatibility.
# Output:
# vector of the computed similarity measures (length equal to nsub)
Kmeans.sim.resampling <- function(X, c = 2, nsub=100, f = 0.8, s = sFM, distance="euclidean", hmethod=NULL) {
  n <- ncol(X);
  n.sub.ex <- ceiling(n * f);
  sim.vector <- numeric(nsub);
  for (i in 1:nsub) {
    sub1 <- sample(n, n.sub.ex);
    Xsub1 <- X[,sub1];
    colnames(Xsub1) <- sub1;
    r <- kmeans(t(Xsub1), centers=c, iter.max = 1000);
    cl1 <- Transform.vector.to.list(r$cluster);
    M1 <- Do.boolean.membership.matrix(cl1, n.sub.ex, sub1);

    sub2 <- sample(n, n.sub.ex);
    Xsub2 <- X[,sub2];
    colnames(Xsub2) <- sub2;
    r <- kmeans(t(Xsub2), centers=c, iter.max = 1000);
    cl2 <- Transform.vector.to.list(r$cluster);
    M2 <- Do.boolean.membership.matrix(cl2, n.sub.ex, sub2);

    # examples common to the two subsamples
    sub.common <- Intersect(sub1, sub2);
    # extract from the membership matrices the rows and columns that
    # correspond to the examples common to the two subsamples
    label.examples <- as.character(sub.common);
    M1 <- M1[label.examples, label.examples];
    M2 <- M2[label.examples, label.examples];
    sim.vector[i] <- s(M1, M2);
  }
  return(sim.vector);
}

############################################################################################
# Function to compute cluster similarities for a given number of clusters using
# randomized maps and k-means clustering.
# A vector of similarity measures between pairs of clusterings perturbed with random projections is computed
# for a given number of clusters. The dimension of the projected data, the type of randomized map and the
# similarity measure can be selected.
# Input:
# X : matrix of data (variables are rows, examples columns)
# c : number of clusters
# nprojections : number of pairs of projected data
# dim : dimension of the projected data
# pmethod : projection method. It must be one of the following:
#           "RS" (random subspace projection)
#           "PMO" (Plus Minus One random projection)
#           "Norm" (normal random projection)
#           "Achlioptas" (Achlioptas random projection)
# scale : if TRUE randomized projections are scaled (default)
# seed : numerical seed for the random generator
# s : similarity function to be used (default Fowlkes and Mallows similarity measure)
# distance : currently only the euclidean distance ("euclidean", default) is available
# hmethod : parameter used for internal compatibility.
# Output:
# vector of the computed similarity measures (length equal to nprojections)
Kmeans.sim.projection <- function(X, c = 2, nprojections=100, dim=2, pmethod="PMO", scale=TRUE, seed=100, s=sFM,
                                  distance="euclidean", hmethod="ward") {
  n <- ncol(X);
  sim.vector <- numeric(nprojections);
  cl <- Multiple.Random.kmeans(X, dim=dim, pmethod=pmethod, c=c, n=nprojections*2, scale=scale, seed=seed);
  for (i in 1:nprojections) {
    cl1 <- cl[[i]];
    M1 <- Do.boolean.membership.matrix(cl1, n, 1:n);
    cl2 <- cl[[i+nprojections]];
    M2 <- Do.boolean.membership.matrix(cl2, n, 1:n);
    sim.vector[i] <- s(M1, M2);
  }
  return(sim.vector);
}

############################################################################################
# Function to compute the cluster similarities for a given number of clusters using
# injections of random noise and k-means clustering.
# A vector of similarity measures between pairs of clusterings perturbed with random noise is computed for a given
# number of clusters. The standard deviation of the added gaussian noise, estimated from the data as the perc
# percentile of the standard deviations of the input variables, the percentile itself and the similarity measure
# can be selected.
# Input:
# X : matrix of data (variables are rows, examples columns)
# c : number of clusters
# nnoisy : number of pairs of noisy data
# perc : percentile of the standard deviations to be used for the added gaussian noise (def. 0.5)
# s : similarity function to be used (default Fowlkes and Mallows similarity measure)
# distance : currently only the euclidean distance ("euclidean", default) is available
# hmethod : parameter used for internal compatibility.
# Output:
# vector of the computed similarity measures (length equal to nnoisy)
Kmeans.sim.noise <- function(X, c = 2, nnoisy=100, perc=0.5, s = sFM, distance="euclidean", hmethod=NULL) {
  n <- ncol(X);
  sim.vector <- numeric(nnoisy);
  for (i in 1:nnoisy) {
    X1 <- perturb.by.noise(X, perc);
    r <- kmeans(t(X1), centers=c, iter.max = 1000);
    cl1 <- Transform.vector.to.list(r$cluster);
    M1 <- Do.boolean.membership.matrix(cl1, n, 1:n);

    X2 <- perturb.by.noise(X, perc);
    r <- kmeans(t(X2), centers=c, iter.max = 1000);
    cl2 <- Transform.vector.to.list(r$cluster);
    M2 <- Do.boolean.membership.matrix(cl2, n, 1:n);
    sim.vector[i] <- s(M1, M2);
  }
  return(sim.vector);
}

########################################################
######### PAM CLUSTERING ###############################
########################################################

############################################################################################
# Function to compute cluster similarities for a given number of clusters using
# PAM clustering.
# A vector of similarity measures between pairs of clusterings perturbed with resampling techniques is computed
# for a given number of clusters.
# The fraction of the resampled data (without replacement) and the similarity measure can be selected.
# Input:
# X : matrix of data (variables are rows, examples columns)
# c : number of clusters
# nsub : number of subsamples
# f : fraction of the data resampled without replacement
# s : similarity function to be used (default Fowlkes and Mallows similarity measure)
# distance : it must be one of the two: "euclidean" (default) or "pearson" (that is, 1 - Pearson correlation)
# hmethod : parameter used for internal compatibility.
# Output:
# vector of the computed similarity measures (length equal to nsub)
PAM.sim.resampling <- function(X, c = 2, nsub=100, f = 0.8, s = sFM, distance="euclidean", hmethod=NULL) {
  n <- ncol(X);
  n.sub.ex <- ceiling(n * f);
  sim.vector <- numeric(nsub);
  for (i in 1:nsub) {
    sub1 <- sample(n, n.sub.ex);
    Xsub1 <- X[,sub1];
    colnames(Xsub1) <- sub1;
    if (distance == "euclidean")
      d <- dist(t(Xsub1))
    else
      d <- as.dist(1 - cor(Xsub1));
    r <- pam(d, c, cluster.only=TRUE);
    cl1 <- Transform.vector.to.list(r);
    M1 <- Do.boolean.membership.matrix(cl1, n.sub.ex, sub1);

    sub2 <- sample(n, n.sub.ex);
    Xsub2 <- X[,sub2];
    colnames(Xsub2) <- sub2;
    if (distance == "euclidean")
      d <- dist(t(Xsub2))
    else
      d <- as.dist(1 - cor(Xsub2));
    r <- pam(d, c, cluster.only=TRUE);
    cl2 <- Transform.vector.to.list(r);
    M2 <- Do.boolean.membership.matrix(cl2, n.sub.ex, sub2);

    # examples common to the two subsamples
    sub.common <- Intersect(sub1, sub2);
    # extract from the membership matrices the rows and columns that
    # correspond to the examples common to the two subsamples
    label.examples <- as.character(sub.common);
    M1 <- M1[label.examples, label.examples];
    M2 <- M2[label.examples, label.examples];
    sim.vector[i] <- s(M1, M2);
  }
  return(sim.vector);
}

############################################################################################
# Function to compute cluster similarities for a given number of clusters using
# randomized maps and PAM clustering.
# A vector of similarity measures between pairs of clusterings perturbed with random projections is computed
# for a given number of clusters. The dimension of the projected data, the type of randomized map and the
# similarity measure can be selected.
# Input:
# X : matrix of data (variables are rows, examples columns)
# c : number of clusters
# nprojections : number of pairs of projected data
# dim : dimension of the projected data
# pmethod : projection method. It must be one of the following:
#           "RS" (random subspace projection)
#           "PMO" (Plus Minus One random projection)
#           "Norm" (normal random projection)
#           "Achlioptas" (Achlioptas random projection)
# scale : if TRUE randomized projections are scaled (default)
# seed : numerical seed for the random generator
# s : similarity function to be used (default Fowlkes and Mallows similarity measure)
# distance : it must be one of the two: "euclidean" (default) or "pearson" (that is, 1 - Pearson correlation)
# hmethod : parameter used for internal compatibility.
# Output:
# vector of the computed similarity measures (length equal to nprojections)
PAM.sim.projection <- function(X, c = 2, nprojections=100, dim=2, pmethod="PMO", scale=TRUE, seed=100, s=sFM,
                               distance="euclidean", hmethod=NULL) {
  n <- ncol(X);
  sim.vector <- numeric(nprojections);
  cl <- Multiple.Random.PAM(X, dim=dim, pmethod=pmethod, c=c, n=nprojections*2, scale=scale, seed=seed,
                            distance=distance);
  for (i in 1:nprojections) {
    cl1 <- cl[[i]];
    M1 <- Do.boolean.membership.matrix(cl1, n, 1:n);
    cl2 <- cl[[i+nprojections]];
    M2 <- Do.boolean.membership.matrix(cl2, n, 1:n);
    sim.vector[i] <- s(M1, M2);
  }
  return(sim.vector);
}

############################################################################################
# Function to compute cluster similarities for a given number of clusters using
# injections of random noise and PAM clustering.
# A vector of similarity measures between pairs of clusterings perturbed with random noise is computed for a given
# number of clusters. The standard deviation of the added gaussian noise, estimated from the data as the perc
# percentile of the standard deviations of the input variables, the percentile itself and the similarity measure
# can be selected.
# Input:
# X : matrix of data (variables are rows, examples columns)
# c : number of clusters
# nnoisy : number of pairs of noisy data
# perc : percentile of the standard deviations to be used for the added gaussian noise (def. 0.5)
# s : similarity function to be used (default Fowlkes and Mallows similarity measure)
# distance : it must be one of the two: "euclidean" (default) or "pearson" (that is, 1 - Pearson correlation)
# hmethod : parameter used for internal compatibility.
# Output:
# vector of the computed similarity measures (length equal to nnoisy)
PAM.sim.noise <- function(X, c = 2, nnoisy=100, perc=0.5, s = sFM, distance="euclidean", hmethod=NULL) {
  n <- ncol(X);
  sim.vector <- numeric(nnoisy);
  for (i in 1:nnoisy) {
    X1 <- perturb.by.noise(X, perc);
    if (distance == "euclidean")
      d <- dist(t(X1))
    else
      d <- as.dist(1 - cor(X1));
    r <- pam(d, c, cluster.only=TRUE);
    cl1 <- Transform.vector.to.list(r);
    M1 <- Do.boolean.membership.matrix(cl1, n, 1:n);

    X2 <- perturb.by.noise(X, perc);
    if (distance == "euclidean")
      d <- dist(t(X2))
    else
      d <- as.dist(1 - cor(X2));
    r <- pam(d, c, cluster.only=TRUE);
    cl2 <- Transform.vector.to.list(r);
    M2 <- Do.boolean.membership.matrix(cl2, n, 1:n);
    sim.vector[i] <- s(M1, M2);
  }
  return(sim.vector);
}

############################################################
######### FUZZY K-MEANS CLUSTERING #########################
############################################################

############################################################################################
# Function to compute cluster similarities for a given number of clusters using
# fuzzy k-means clustering.
# A vector of similarity measures between pairs of clusterings perturbed with resampling techniques is computed
# for a given number of clusters. The fraction of the resampled data (without replacement) and the similarity
# measure can be selected.
# Input:
# X : matrix of data (variables are rows, examples columns)
# c : number of clusters
# nsub : number of subsamples
# f : fraction of the data resampled without replacement
# s : similarity function to be used (default Fowlkes and Mallows similarity measure)
# distance : it must be one of the two: "euclidean" (default) or "pearson" (that is, 1 - Pearson correlation)
# hmethod : parameter used for internal compatibility.
# Output:
# vector of the computed similarity measures (length equal to nsub)
Fuzzy.kmeans.sim.resampling <- function(X, c = 2, nsub=100, f = 0.8, s = sFM, distance="euclidean", hmethod=NULL) {
  n <- ncol(X);
  n.sub.ex <- ceiling(n * f);
  sim.vector <- numeric(nsub);
  for (i in 1:nsub) {
    sub1 <- sample(n, n.sub.ex);
    Xsub1 <- X[,sub1];
    colnames(Xsub1) <- sub1;
    if (distance == "euclidean")
      d <- dist(t(Xsub1))
    else
      d <- as.dist(1 - cor(Xsub1));
    r <- fanny(d, c, maxit = 1000);
    cl1 <- Transform.vector.to.list(r$clustering);
    M1 <- Do.boolean.membership.matrix(cl1, n.sub.ex, sub1);

    sub2 <- sample(n, n.sub.ex);
    Xsub2 <- X[,sub2];
    colnames(Xsub2) <- sub2;
    if (distance == "euclidean")
      d <- dist(t(Xsub2))
    else
      d <- as.dist(1 - cor(Xsub2));
    r <- fanny(d, c, maxit = 1000);
    cl2 <- Transform.vector.to.list(r$clustering);
    M2 <- Do.boolean.membership.matrix(cl2, n.sub.ex, sub2);

    # examples common to the two subsamples
    sub.common <- Intersect(sub1, sub2);
    # extract from the membership matrices the rows and columns that
    # correspond to the examples common to the two subsamples
    label.examples <- as.character(sub.common);
    M1 <- M1[label.examples, label.examples];
    M2 <- M2[label.examples, label.examples];
    sim.vector[i] <- s(M1, M2);
  }
  return(sim.vector);
}

############################################################################################
# Function to compute cluster similarities for a given number of clusters using
# randomized maps and fuzzy k-means clustering.
# A vector of similarity measures between pairs of clusterings perturbed with random projections is computed
# for a given number of clusters. The dimension of the projected data, the type of randomized map and the
# similarity measure can be selected.
# Input:
# X : matrix of data (variables are rows, examples columns)
# c : number of clusters
# nprojections : number of pairs of projected data
# dim : dimension of the projected data
# pmethod : projection method. It must be one of the following:
#           "RS" (random subspace projection)
#           "PMO" (Plus Minus One random projection)
#           "Norm" (normal random projection)
#           "Achlioptas" (Achlioptas random projection)
# scale : if TRUE randomized projections are scaled (default)
# seed : numerical seed for the random generator
# s : similarity function to be used (default Fowlkes and Mallows similarity measure)
# distance : it must be one of the two: "euclidean" (default) or "pearson" (that is, 1 - Pearson correlation)
# hmethod : parameter used for internal compatibility.
# Output:
# vector of the computed similarity measures (length equal to nprojections)
Fuzzy.kmeans.sim.projection <- function(X, c = 2, nprojections=100, dim=2, pmethod="PMO", scale=TRUE, seed=100,
                                        s=sFM, distance="euclidean", hmethod=NULL) {
  n <- ncol(X);
  sim.vector <- numeric(nprojections);
  cl <- Multiple.Random.fuzzy.kmeans(X, dim=dim, pmethod=pmethod, c=c, n=nprojections*2, scale=scale, seed=seed,
                                     distance=distance);
  for (i in 1:nprojections) {
    cl1 <- cl[[i]];
    M1 <- Do.boolean.membership.matrix(cl1, n, 1:n);
    cl2 <- cl[[i+nprojections]];
    M2 <- Do.boolean.membership.matrix(cl2, n, 1:n);
    sim.vector[i] <- s(M1, M2);
  }
  return(sim.vector);
}

############################################################################################
# Function to compute cluster similarities for a given number of clusters using
# injections of random noise and fuzzy k-means clustering.
# A vector of similarity measures between pairs of clusterings perturbed with random noise is computed for a given
# number of clusters.
# The standard deviation of the added gaussian noise, estimated from the data as the perc percentile of the
# standard deviations of the input variables, the percentile itself and the similarity measure can be selected.
# Input:
# X : matrix of data (variables are rows, examples columns)
# c : number of clusters
# nnoisy : number of pairs of noisy data
# perc : percentile of the standard deviations to be used for the added gaussian noise (def. 0.5)
# s : similarity function to be used (default Fowlkes and Mallows similarity measure)
# distance : it must be one of the two: "euclidean" (default) or "pearson" (that is, 1 - Pearson correlation)
# hmethod : parameter used for internal compatibility.
# Output:
# vector of the computed similarity measures (length equal to nnoisy)
Fuzzy.kmeans.sim.noise <- function(X, c = 2, nnoisy=100, perc=0.5, s = sFM, distance="euclidean", hmethod=NULL) {
  n <- ncol(X);
  sim.vector <- numeric(nnoisy);
  for (i in 1:nnoisy) {
    X1 <- perturb.by.noise(X, perc);
    if (distance == "euclidean")
      d <- dist(t(X1))
    else
      d <- as.dist(1 - cor(X1));
    r <- fanny(d, c, maxit = 1000);
    cl1 <- Transform.vector.to.list(r$clustering);
    M1 <- Do.boolean.membership.matrix(cl1, n, 1:n);

    X2 <- perturb.by.noise(X, perc);
    if (distance == "euclidean")
      d <- dist(t(X2))
    else
      d <- as.dist(1 - cor(X2));
    r <- fanny(d, c, maxit = 1000);
    cl2 <- Transform.vector.to.list(r$clustering);
    M2 <- Do.boolean.membership.matrix(cl2, n, 1:n);
    sim.vector[i] <- s(M1, M2);
  }
  return(sim.vector);
}

############################################################################################
# Other functions for specific clustering algorithms may be implemented simply by following the general
# scheme of the previous functions.
############################################################################################

##############################################################################
# Function to compute and build up a pairwise boolean membership matrix.
# This function may also be used with clusterings that do not strictly define a partition of the data, and with
# a variable number of clusters for each clustering.
# Input:
# cl : a clustering (list of vectors defining a clustering)
# dim.M : dimension of the membership matrix (number of examples)
# examplelabels : labels of the examples drawn from the original data.
# Output:
# M : the pairwise boolean membership matrix, whose elements are 1 if two examples fall in the same cluster, 0 otherwise
Do.boolean.membership.matrix <- function(cl, dim.M, examplelabels) {
  M <- matrix(integer(dim.M*dim.M), nrow=dim.M);
  colnames(M) <- rownames(M) <- examplelabels;
  singletons <- integer(dim.M);
  c <- length(cl); # number of clusters
  for (j in 1:c) {
    n.ex <- length(cl[[j]]);
    if (n.ex == 1)
      singletons[cl[[j]][1]] <- 1
    else {
      for (x1 in 1:(n.ex-1)) {
        for (x2 in (x1+1):n.ex) {
          x <- cl[[j]][x1];
          y <- cl[[j]][x2];
          M[x,y] <- 1;
        }
      }
    }
  }
  # symmetrize the matrix; the diagonal flags the singleton clusters
  for (x1 in 1:(dim.M-1))
    for (x2 in (x1+1):dim.M)
      M[x2,x1] <- M[x1,x2];
  for (x in 1:dim.M)
    M[x,x] <- singletons[x];
  return(M);
}
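##############################################################################
# Example (illustrative sketch, not run at source time): a 4-example clustering
# {1,3} {2} {4} yields a membership matrix with M[1,3] = M[3,1] = 1 and a
# diagonal marking the singleton clusters {2} and {4}.
if (FALSE) {
  cl.toy <- list(c(1,3), c(2), c(4));
  Do.boolean.membership.matrix(cl.toy, 4, as.character(1:4));
}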
##############################################################################
# Function to compute the intersection between the elements of two vectors.
# Given as input two sets of elements represented by two vectors, the intersection between the two sets is
# computed and the corresponding vector is returned.
# Input:
# sub1 : first vector
# sub2 : second vector
# Output:
# sub.int : vector that stores the elements common to the two input vectors
Intersect <- function(sub1, sub2) {
  sub.int <- integer(0);
  n1 <- length(sub1);
  n2 <- length(sub2);
  k <- 0;
  for (i in 1:n1)
    for (j in 1:n2)
      if (sub1[i] == sub2[j]) {
        k <- k + 1;
        sub.int[k] <- sub1[i];
        break;
      }
  return(sub.int);
}

##############################################################################
# Function to compute the Fowlkes and Mallows similarity between two clusterings.
# Input:
# M1 : boolean membership matrix representing the first clustering
# M2 : boolean membership matrix representing the second clustering
# Output:
# res : similarity measure between the two clusterings according to Fowlkes and Mallows
sFM <- function(M1, M2) {
  res <- sum(M1*M2) / sqrt(sum(M1*M1) * sum(M2*M2));
  return(res);
}

##############################################################################
# Function to compute the Jaccard similarity coefficient between two clusterings.
# Input:
# M1 : boolean membership matrix representing the first clustering
# M2 : boolean membership matrix representing the second clustering
# Output:
# res : similarity measure between the two clusterings according to the Jaccard coefficient
sJaccard <- function(M1, M2) {
  res <- sum(M1*M2) / (sum(M1*M1) + sum(M2*M2) - sum(M1*M2));
  return(res);
}

##############################################################################
# Function to compute the matching coefficient between two clusterings.
# Input:
# M1 : boolean membership matrix representing the first clustering
# M2 : boolean membership matrix representing the second clustering
# Output:
# res : similarity measure between the two clusterings according to the matching coefficient
sM <- function(M1, M2) {
  n <- nrow(M1);
  res <- 1 - sum((M1-M2)^2)/n^2;
  return(res);
}
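##############################################################################
# Example (illustrative sketch, not run at source time): the three measures
# agree on identical clusterings (similarity 1) and diverge as the clusterings
# differ. The toy clusterings below are made up.
if (FALSE) {
  A <- Do.boolean.membership.matrix(list(c(1,2), c(3,4)), 4, as.character(1:4));
  B <- Do.boolean.membership.matrix(list(c(1,2,3), c(4)), 4, as.character(1:4));
  c(FM=sFM(A,A), Jaccard=sJaccard(A,A), Matching=sM(A,A));  # all equal to 1
  c(FM=sFM(A,B), Jaccard=sJaccard(A,B), Matching=sM(A,B));
}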
##############################################################################
# Function to plot the histogram of the similarities for a given number of clusters.
# Input:
# sim : vector of similarity values
# nbins : number of bins of the histogram
plot.hist.similarity <- function(sim, nbins=25) {
  num.samples <- length(sim);
  minimum <- min(sim);
  if (minimum > 0.5)
    minimum <- 0.5;
  hist(sim, breaks=seq(minimum, 1, length.out=nbins), ylim=c(0,num.samples), main="", xlab="Similarity");
}

##############################################################################
# Function to plot multiple histograms of the similarities for different numbers of clusters.
# This function outputs a set of histograms (one for each number of clusters, i.e. one for each row of
# the matrix S of similarity values).
# Input:
# S : matrix of similarity values
# n.col : number of columns in the grid of the histograms
# nbins : number of bins of the histograms
# labels : labels of the histograms. If NULL (default) the numbers of clusters from 2 to nrow(S)+1 are used.
plot.multiple.hist.similarity <- function(S, n.col=3, labels=NULL, nbins=25) {
  num.samples <- ncol(S);
  n.graph <- nrow(S);
  n.row <- ceiling(n.graph/n.col);
  op <- par(mfrow=c(n.row, n.col));
  minimum <- min(S);
  if (minimum > 0.5)
    minimum <- 0.5;
  if (is.null(labels))
    lab <- 2:(n.graph+1)
  else
    lab <- labels;
  for (i in 1:n.graph) {
    title <- paste("k = ", lab[i]);
    hist(S[i,], breaks=seq(minimum, 1, length.out=nbins), ylim=c(0,num.samples), main="", xlab=title, ylab="");
  }
  par(op);
}

##############################################################################
# Function to compute the empirical cumulative distribution function (ECDF) of the similarity measures
# for different numbers of clusters.
# Input:
# sim.matrix : a matrix that stores the similarity between pairs of clusterings across multiple numbers of clusters
#              and multiple clusterings.
#              Each row corresponds to a different number of clusters; the number of columns is equal to the number
#              of subsamples considered for each number of clusters.
# Output:
# list.F : a list of functions of class ecdf
compute.cumulative.multiple <- function(sim.matrix) {
  list.F <- list();
  n <- nrow(sim.matrix);
  for (i in 1:n)
    list.F[[i]] <- ecdf(sim.matrix[i,]);
  return(list.F);
}

##############################################################################
# Function that returns the values of the empirical cumulative distribution.
# Input:
# F : function of class ecdf that stores the discrete values of the cumulative distribution
# Output:
# l : a list with two elements: the "x" element stores a vector with the values of the random variable for
#     which the cumulative distribution needs to be computed; the "y" element stores a vector with the
#     corresponding values of the cumulative distribution (i.e. y = F(x)).
cumulative.values <- function(F) {
  e <- environment(F);
  l <- list(x=e$x, y=e$y);
  return(l);
}

##############################################################################
# Function to plot the ECDF of the similarity values.
# Input:
# F : function of class ecdf that stores the discrete values of the cumulative distribution
# Output:
# plot of the cumulative distribution
plot.cumulative <- function(F) {
  plot(F, xlim=c(0,1), ylim=c(0,1), xlab="similarities", ylab="cumulative distribution", main="",
       do.points=FALSE, verticals=TRUE);
}
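##############################################################################
# Example (illustrative sketch, not run at source time): stable clusterings
# concentrate their similarity values near 1, so their ECDF stays low until
# close to 1 and the integral of the ECDF (see compute.integral below) is
# small. Toy values only.
if (FALSE) {
  S.toy <- rbind(runif(50, 0.9, 1.0),   # a stable clustering
                 runif(50, 0.4, 1.0));  # an unstable one
  F.list <- compute.cumulative.multiple(S.toy);
  cumulative.values(F.list[[1]])$x;     # the jump points of the first ECDF
  sapply(F.list, compute.integral);     # smaller integral ==> more stable
}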
##############################################################################
# Function to plot a set of ECDFs of the similarity values.
# Graphs of the empirical cumulative distributions corresponding to different numbers of clusters are plotted,
# using different line patterns and/or different colors for each graph. Up to 15 ECDF graphs can be plotted
# simultaneously.
# Input:
# list.F : a list of functions of class ecdf
# labels : vector of the labels associated to the ECDFs. If NULL (default), then a vector of labels from 2 to
#          length(list.F)+1 is used.
# min.x : minimum similarity value to be plotted. If -1 (default) the minimum of the similarity values is
#         obtained from list.F
# colors : if TRUE (default) different colors are used to plot the different ECDFs, otherwise black lines are used
# Output:
# plot of the cumulative distributions
plot.cumulative.multiple <- function(list.F, labels=NULL, min.x=-1, colors=TRUE) {
  MAX_INTEGRALS = 15;
  minimum = 1;
  num.F <- length(list.F);
  if (num.F > MAX_INTEGRALS)
    num.F <- MAX_INTEGRALS;
  if (min.x == -1) {
    for (i in 1:num.F) {
      l <- cumulative.values(list.F[[i]]);
      localmin <- min(l$x);
      if (localmin < minimum)
        minimum <- localmin;
    }
  } else
    minimum <- min.x;
  # The drawing code of this function was lost in the available source; what follows is a minimal
  # reconstruction that plots each ECDF with a distinct line type (and color, if colors==TRUE).
  if (is.null(labels))
    labels <- 2:(num.F+1);
  plot(c(minimum,1), c(0,1), type="n", xlab="similarities", ylab="cumulative distribution", main="");
  for (i in 1:num.F)
    if (colors == TRUE)
      lines(list.F[[i]], lty=i, col=i, do.points=FALSE, verticals=TRUE)
    else
      lines(list.F[[i]], lty=i, do.points=FALSE, verticals=TRUE);
}

##############################################################################
# Function to plot sets of p-values obtained from multiple hypothesis tests against the clusterings.
# (The header and the opening lines of this function were lost in the available source; the signature and the
#  initializations below are a reconstruction inferred from the surviving body.)
# Input (reconstructed):
# l : a list of data frames as returned by Bernstein.compute.pvalues or Chi.square.compute.pvalues (one per test)
# alpha : significance level drawn as a horizontal reference line (default 0.01)
# legendy : y coordinate of the legend; if 0 no legend is drawn
# leg_label : labels of the tests for the legend. If NULL, "test 1", "test 2", ... are used
# colors : if TRUE different colors are used for the different tests
plot.pvalues.multiple <- function(l, alpha=0.01, legendy=0, leg_label=NULL, colors=TRUE) {
  n.test <- length(l);                        # reconstructed
  n.clusterings <- length(l[[1]]$p.value);    # reconstructed
  lpat <- 1:n.test;                           # line patterns, reconstructed
  plot(c(1,n.clusterings), c(0,1), type="n", xlab="clusterings", ylab="p-value"); # reconstructed
  if (n.test >= 1)
    if (colors == TRUE)
      for (i in 1:n.test) {
        lines(1:n.clusterings, l[[i]]$p.value, lty=lpat[i], type="o", col=i)
        # text(l$x[1],l$y[1],pos=1,labels=lab[i]);
      }
    else
      for (i in 1:n.test) {
        lines(1:n.clusterings, l[[i]]$p.value, lty=lpat[i], type="o");
        # text(l$x[1],l$y[1],pos=1,labels=lab[i]);
      }
  lines(1:n.clusterings, rep(alpha, n.clusterings), lty=2, type="l");
  text(ceiling(n.clusterings/2), alpha, labels=paste("alpha=", alpha));
  # making the legend
  if (is.null(leg_label))
    leg_label <- paste("test ", 1:n.test, sep="");
  if (legendy != 0)
    if (colors == TRUE)
      legend(1, legendy, legend=leg_label, lty=lpat[1:n.test], col=1:n.test)
    else
      legend(1, legendy, legend=leg_label, lty=lpat[1:n.test]);
}

##############################################################################
# Function to compute the integral of the ECDF of the similarity values.
# Input:
# F : function of class ecdf that stores the discrete values of the empirical cumulative distribution
# subdivisions : maximum number of subintervals used by the integration process
# Output:
# v : value of the estimated integral
compute.integral <- function(F, subdivisions=1000) {
  l <- integrate(F, 0, 1, subdivisions=subdivisions);
  return(l$value);
}

##############################################################################
# Function to compute the integrals of the ECDFs using only the similarity matrix.
# Each integral of the ECDF of the similarity is computed using the similarity measures between the clusterings.
# Input:
# sim.matrix : a matrix that stores the similarity between pairs of clusterings across multiple numbers of clusters
#              and multiple clusterings performed on subsamples of the original data.
#              Number of rows equal to the different numbers of clusters considered; number of columns equal to
#              the number of subsamples considered for each number of clusters.
# Output:
# res : vector of the values of the estimated integrals
compute.integral.from.similarity <- function(sim.matrix) {
  # for values in [0,1], the integral of the ECDF equals 1 minus the mean of each row of similarities
  return(as.vector(1 - rowMeans(sim.matrix)));
}

##############################################################################
# Function to generate a data set perturbed by noise.
# Gaussian noise is added to the data. The mean of the gaussian noise is 0 and its standard deviation is estimated
# from the data: the standard deviation of each variable is computed, and the standard deviation of the added
# gaussian noise is set to the given perc percentile of these values (see the McShane et al. paper for details).
# Input:
# X : matrix of data (variables are rows, examples columns)
# perc : percentile of the standard deviations (def: 0.5)
# Output:
# X.noisy : matrix of perturbed data (variables are rows, examples columns)
perturb.by.noise <- function(X, perc = 0.5) {
  if ((perc <= 0) || (perc > 1))
    stop("Percentile must be between 0 and 1");
  n.var <- nrow(X);
  n.samples <- ncol(X);
  sdev <- sort(apply(X, 1, sd)); # standard deviation of each variable (row)
  noise.sd <- sdev[ceiling(n.var*perc)];
  X.noisy <- matrix(rnorm(n.var*n.samples, sd=noise.sd), nrow=n.var);
  X.noisy <- X + X.noisy;
  return(X.noisy);
}
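##############################################################################
# Example (illustrative sketch, not run at source time): perturbing a synthetic
# data set with noise whose standard deviation is the median (perc=0.5) of the
# per-variable standard deviations.
if (FALSE) {
  set.seed(1);
  X.toy <- matrix(rnorm(10*20), nrow=10);
  X.pert <- perturb.by.noise(X.toy, perc=0.5);
  summary(as.vector(X.pert - X.toy)); # the added gaussian noise
}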