##############################################################################################################
# mosclust.R
# February, March 2006
# Set of functions for implementing model order selection for clustering algorithms
# May 2006: added functions to implement Bernstein hypothesis testing
# June 2006: modified functions for hypothesis testing
# August 2006: added functions to implement perturbations through injection of random noise
##############################################################################################################

#require(stats);
#require(cluster);
#if (require(clusterv) == FALSE)
#  stop("The clusterv library is needed to run mosclust");

source("rp.R");
source("rData.R");
source("clusterv.R");
source("clusterv2.R");

############################################################################################
# Hybrid testing based on stability methods for model order selection.
# The function accepts as input a similarity matrix that stores the similarity measures between multiple pairs of
# clusterings for different numbers of clusters. Each row of the matrix corresponds to a k-clustering, each column
# to a different repeated measure.
# Note that the similarities can be computed using different clustering algorithms, different perturbation methods
# (resampling techniques, random projections or noise-injection methods) and different similarity measures.
# At first the Bernstein inequality-based test is performed. When the null hypothesis cannot be rejected, the
# chi square-based test is performed on the remaining top-ranked clusterings.
# The stability index for a given clustering is computed as the mean of the similarity indices between pairs of
# k-clusterings obtained from the perturbed data. The similarity matrix given as input can be obtained from the
# functions do.similarity.resampling, do.similarity.projection, do.similarity.noise.
# Input:
# sim.matrix : a matrix that stores the similarity between pairs of clusterings across multiple numbers of clusters
#              and multiple clusterings.
# alpha : significance level (default 0.01)
# s0 : threshold for the similarity value (default 0.9)
# Output:
# l : list with 6 elements:
#     "n.Bernstein.selected" : number of clusterings selected as significant by the Bernstein test
#     "n.chi.sq.selected" : number of clusterings selected as significant by the chi square test. It may be
#                           equal to 0 if the Bernstein test selects only 1 clustering.
#     "Bernstein.res" : data frame with the p-values obtained from the Bernstein inequality
#     "chi.sq.res" : data frame with the p-values obtained from the chi square test. If only 1 clustering is
#                    significant according to the Bernstein inequality test, this component is NULL
#     "selected.res" : data frame with the results relative to the clusterings selected by the overall hybrid test
#     "F" : a list of cumulative distribution functions (of class ecdf) (not sorted).
Hybrid.testing <- function(sim.matrix, alpha=0.01, s0=0.9) {
  F <- compute.cumulative.multiple(sim.matrix);
  n <- length(F);
  n.chi.sq.selected <- 0;
  n.Bernstein.selected <- 0;

  d.Bernstein <- Bernstein.compute.pvalues(sim.matrix);
  d.Bernstein.selected <- Hypothesis.testing(d.Bernstein, alpha=alpha);
  n.Bernstein.selected <- nrow(d.Bernstein.selected);
  if (n.Bernstein.selected > 1) {
    selected.clusterings <- d.Bernstein.selected$ordered.clusterings - 1;
    d.chi.sq <- Chi.square.compute.pvalues(sim.matrix[selected.clusterings,], s0 = s0);
    d.chi.sq$ordered.clusterings <- d.Bernstein.selected$ordered.clusterings; # to preserve the correct labelings
    d.chi.sq.selected <- Hypothesis.testing(d.chi.sq, alpha=alpha);
    n.chi.sq.selected <- nrow(d.chi.sq.selected);
    d.selected <- d.chi.sq.selected;
  } else {
    d.selected <- d.Bernstein.selected;
    d.chi.sq <- NULL;
  }
  l <- list(n.Bernstein.selected=n.Bernstein.selected, n.chi.sq.selected=n.chi.sq.selected,
            Bernstein.res = d.Bernstein, chi.sq.res = d.chi.sq, selected.res = d.selected, F=F);
  return(l);
}

############################################################################################
# Function to select significant clusterings from a given set of p-values.
# For a given set of p-values returned from a given hypothesis test, it returns the items for which there is no
# significant difference at alpha significance level (that is, the items for which p > alpha).
# Input:
# d : data frame returned by Bernstein.compute.pvalues or Chi.square.compute.pvalues
# alpha : significance level (default 0.01)
# Output: a data frame corresponding to the clusterings significant at alpha level
#
Hypothesis.testing <- function(d, alpha=0.01) {
  n.clusterings <- nrow(d);
  significant.difference <- TRUE;
  if (n.clusterings >= 2) # guard against a single-row data frame
    for (i in n.clusterings:2) {
      if (d$p.value[i] > alpha) {
        significant.difference <- FALSE;
        break;
      }
    }
  if (significant.difference == FALSE) # there is no significant difference between the first and the i-th clustering
    significant.clusters <- i
  else
    significant.clusters <- 1;
  return(d[1:significant.clusters,]);
}
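##############################################################################
# Example (illustrative sketch, guarded so it does not run at source time): a
# minimal end-to-end use of the hybrid test. The data matrix `X.toy` below is
# synthetic, and the parameter values are arbitrary, chosen only to keep the
# example fast; they are not recommendations.
if (FALSE) {
  set.seed(1);
  # 100 variables (rows) x 60 examples (columns), two well-separated groups
  X.toy <- cbind(matrix(rnorm(100*30, mean=0), nrow=100),
                 matrix(rnorm(100*30, mean=3), nrow=100));
  # similarity matrix for k = 2,...,6 via subsampling + hierarchical clustering
  S <- do.similarity.resampling(X.toy, c=6, nsub=20, f=0.8, s=sFM);
  res <- Hybrid.testing(S, alpha=0.01, s0=0.9);
  res$selected.res;  # clusterings retained by the overall hybrid test
}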
############################################################################################
# Function to compute the stability indices and the p-values associated to a set of clusterings
# according to the chi-square test between multiple proportions.
# For a given similarity matrix, a list of stability indices and corresponding p-values is returned, sorted
# in descending order from the most significant clustering to the least significant. A low p-value means that
# there is a significant difference between the top-sorted and the given k-clustering.
# The stability index for a given clustering is computed as the mean of the similarity indices between pairs of
# k-clusterings obtained from the perturbed data. The similarity matrix given as input can be obtained from the
# functions do.similarity.resampling, do.similarity.projection, do.similarity.noise.
# Input:
# sim.matrix : a matrix that stores the similarity between pairs of clusterings across multiple numbers of clusters
#              and multiple clusterings.
#              Rows correspond to the different clusterings; columns to the n repeated clusterings for each number
#              of clusters. Row 1 corresponds to a 2-clustering, row 2 to a 3-clustering, ... row m to an (m+1)-clustering.
# s0 : threshold for the similarity value (default 0.9)
# Output: a data frame with 4 components:
# ordered.clusterings : a vector with the numbers of clusters ordered from the most significant to the least significant
# p.value : a vector with the corresponding p-values computed according to the chi-square test between multiple
#           proportions, in descending order (their values correspond to the clusterings of the vector ordered.clusterings)
# means : vector with the stability index (mean similarity) for each k-clustering
# variance : vector with the variance of the similarity for each k-clustering
Chi.square.compute.pvalues <- function(sim.matrix, s0 = 0.9) {
  n.clusterings <- nrow(sim.matrix);
  n.measures <- ncol(sim.matrix);
  ordered.clusterings <- integer(n.clusterings);
  p.value <- numeric(n.clusterings);
  variance <- numeric(n.clusterings);
  means <- rowMeans(sim.matrix); # stability index of each k-clustering
  for (i in 1:n.clusterings)
    variance[i] <- var(sim.matrix[i,]);
  sorted.means <- sort(means, decreasing=TRUE);
  sorted.indices <- order(means, decreasing=TRUE);
  means <- sorted.means;
  ordered.clusterings <- sorted.indices + 1;
  variance <- variance[sorted.indices];
  ordered.sim.matrix <- sim.matrix[sorted.indices,];
  p.value[1] <- 1;
  for (k in 2:n.clusterings)
    p.value[k] <- Compute.Chi.sq(ordered.sim.matrix[1:k,], s0);
  d <- data.frame(ordered.clusterings=ordered.clusterings, p.value=p.value, means=means, variance=variance);
  rownames(d) <- 1:n.clusterings;
  return(d);
}

############################################################################
# Function to evaluate whether a set of similarity distributions significantly differ, using the chi square test.
# The set of similarity values for a specific value of k (number of clusters) is subdivided in two groups
# by choosing a threshold for the similarity value (default 0.9). Then the different sets are compared using the
# chi squared test. The number of degrees of freedom is equal to the number of the different sets minus 1.
# Input:
# M : matrix representing the similarity values for different numbers of clusters. Each row represents the
#     similarity values for a given number of clusters. Number of rows ==> how many numbers of clusters are
#     considered; number of columns ==> cardinality of the similarity values for a given number of clusters
# s0 : threshold for the similarity value (default 0.9)
# Output:
# p.value : p-value associated to the significant difference (type I error)
Compute.Chi.sq <- function(M, s0 = 0.9) {
  n <- ncol(M);
  K <- nrow(M);
  x <- numeric(K); # x[k] counts the similarities of row k that exceed the threshold s0
  for (k in 1:K)
    for (j in 1:n)
      if (M[k,j] > s0)
        x[k] <- x[k] + 1;
  theta <- sum(x)/(n*K); # pooled estimate of the proportion above s0
  if ((theta == 0) || (theta == 1)) # the proportions are equal
    p.value <- 1
  else {
    chi.statistic <- sum((x - n*theta)^2)/(n*theta*(1-theta));
    p.value <- 1 - pchisq(chi.statistic, K-1);
  }
  return(p.value);
}
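##############################################################################
# Example (illustrative sketch, not run at source time): two rows of similarity
# values with clearly different proportions above s0 = 0.9 should yield a small
# p-value, while two identical rows should yield p = 1. The numbers below are
# made up.
if (FALSE) {
  stable   <- rep(c(0.95, 0.92), 10);   # 20 similarities, all above 0.9
  unstable <- rep(c(0.95, 0.60), 10);   # only half above 0.9
  Compute.Chi.sq(rbind(stable, unstable), s0=0.9);  # small p-value expected
  Compute.Chi.sq(rbind(stable, stable),   s0=0.9);  # theta == 1, so p = 1
}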
############################################################################################
# Function to compute the stability indices and the p-values associated to a set of clusterings
# according to the Bernstein inequality.
# For a given similarity matrix, a list of p-values is returned, sorted in descending order from the most
# significant clustering to the least significant. A low p-value means that there is a significant difference
# between the top-sorted and the given clustering.
# The stability index for a given clustering is computed as the mean of the similarity indices between pairs of
# k-clusterings obtained from the perturbed data. The similarity matrix given as input can be obtained from the
# functions do.similarity.resampling, do.similarity.projection, do.similarity.noise.
# Input:
# sim.matrix : a matrix that stores the similarity between pairs of clusterings across multiple numbers of clusters
#              and multiple clusterings.
#              Rows correspond to the different clusterings; columns to the n repeated clusterings for each number
#              of clusters. Row 1 corresponds to a 2-clustering, row 2 to a 3-clustering, ... row m to an (m+1)-clustering.
# Output: a data frame with 4 components:
# ordered.clusterings : a vector with the numbers of clusters ordered from the most significant to the least significant
# p.value : a vector with the corresponding p-values computed according to the Bernstein inequality and Bonferroni
#           correction, in descending order (their values correspond to the clusterings of the vector ordered.clusterings)
# means : vector with the mean similarity for each clustering
# variance : vector with the variance of the similarity for each clustering
#
Bernstein.compute.pvalues <- function(sim.matrix) {
  n.clusterings <- nrow(sim.matrix);
  n.measures <- ncol(sim.matrix);
  ordered.clusterings <- integer(n.clusterings);
  p.value <- numeric(n.clusterings);
  variance <- numeric(n.clusterings);
  means <- rowMeans(sim.matrix); # stability index of each k-clustering
  for (i in 1:n.clusterings)
    variance[i] <- var(sim.matrix[i,]);
  sorted.means <- sort(means, decreasing=TRUE);
  sorted.indices <- order(means, decreasing=TRUE);
  means <- sorted.means;
  ordered.clusterings <- sorted.indices + 1;
  variance <- variance[sorted.indices];
  p.value[1] <- 1;
  for (i in 2:n.clusterings)
    p.value[i] <- Bernstein.p.value(n.measures, means[1]-means[i], variance[1]+variance[i]);
  # Bonferroni-style correction: cumulate the p-values from the least significant clustering upwards
  if (n.clusterings > 2)
    for (i in (n.clusterings-1):2)
      p.value[i] <- p.value[i] + p.value[i+1];
  p.value[p.value > 1] <- 1;
  d <- data.frame(ordered.clusterings=ordered.clusterings, p.value=p.value, means=means, variance=variance);
  rownames(d) <- 1:n.clusterings;
  return(d);
}

############################################################################################
# Function to compute the p-values associated to a set of clusterings according to the Bernstein inequality,
# under the assumption that the random variables are independent.
# For a given similarity matrix, a list of p-values is returned, sorted in descending order from the most
# significant clustering to the least significant. A low p-value means that there is a significant difference
# between the top-sorted and the given clusterings.
# Input:
# sim.matrix : a matrix that stores the similarity between pairs of clusterings across multiple numbers of clusters
#              and multiple clusterings.
#              Rows correspond to the different clusterings; columns to the n repeated clusterings for each number
#              of clusters. Row 1 corresponds to a 2-clustering, row 2 to a 3-clustering, ... row m to an (m+1)-clustering.
# Output: a data frame with 4 components:
# ordered.clusterings : a vector with the numbers of clusters ordered from the most significant to the least significant
# p.value : a vector with the corresponding p-values computed according to the Bernstein inequality and Bonferroni
#           correction, in descending order (their values correspond to the clusterings of the vector ordered.clusterings)
# means : vector with the mean similarity for each clustering
# variance : vector with the variance of the similarity for each clustering
#
Bernstein.ind.compute.pvalues <- function(sim.matrix) {
  n.clusterings <- nrow(sim.matrix);
  n.measures <- ncol(sim.matrix);
  ordered.clusterings <- integer(n.clusterings);
  p.value <- numeric(n.clusterings);
  variance <- numeric(n.clusterings);
  means <- rowMeans(sim.matrix); # stability index of each k-clustering
  for (i in 1:n.clusterings)
    variance[i] <- var(sim.matrix[i,]);
  sorted.means <- sort(means, decreasing=TRUE);
  sorted.indices <- order(means, decreasing=TRUE);
  means <- sorted.means;
  ordered.clusterings <- sorted.indices + 1;
  variance <- variance[sorted.indices];
  p.value[1] <- 1;
  for (i in 2:n.clusterings)
    p.value[i] <- Bernstein.p.value(n.measures, means[1]-means[i], variance[1]+variance[i]);
  # under the independence assumption the p-values are combined multiplicatively
  if (n.clusterings > 2)
    for (i in 3:n.clusterings)
      p.value[i] <- p.value[i] * p.value[i-1];
  d <- data.frame(ordered.clusterings=ordered.clusterings, p.value=p.value, means=means, variance=variance);
  rownames(d) <- 1:n.clusterings;
  return(d);
}

############################################################################################
# Function to compute the p-value according to the Bernstein inequality.
# Input:
# n : number of observations of the random variable
# Delta : difference between the means
# v : variance of the random variable
# Output: the upper bound to the probability
#
Bernstein.p.value <- function(n, Delta, v) {
  if ((Delta == 0) && (v == 0))
    return(1)
  else {
    p <- exp((-n*Delta^2)/(2*v + (2/3)*Delta));
    return(p);
  }
}
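##############################################################################
# Example (illustrative sketch, not run at source time): the Bernstein bound
# decreases quickly with the number of measures n and with the gap Delta
# between mean similarities. The values below are arbitrary.
if (FALSE) {
  # gap of 0.05 between mean similarities, pooled variance 0.01, 100 measures:
  # exp(-(100 * 0.05^2) / (2*0.01 + (2/3)*0.05)) = exp(-0.25/0.0533) ~ 0.0092
  Bernstein.p.value(n=100, Delta=0.05, v=0.01);
  Bernstein.p.value(n=100, Delta=0.20, v=0.01);  # much smaller for a larger gap
}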
############################################################################################
# Function that iteratively computes sets of similarity indices using resampling techniques.
# This function may use different clustering algorithms and different similarity measures to compute the
# similarity indices. Subsampling techniques are applied to perturb the data.
# Input:
# X : matrix of data (variables are rows, examples columns)
# c : if it is a vector of length 1, numbers of clusters from 2 to c are considered; otherwise the numbers of
#     clusters stored in the vector c are considered.
# nsub : number of pairs of subsamples
# f : fraction of the data resampled without replacement
# s : similarity function to be used. It may be one of the following:
#     - sFM (Fowlkes and Mallows)
#     - sJaccard (Jaccard)
#     - sM (matching coefficient)
#     (default Fowlkes and Mallows)
# alg.clust.sim : method that computes the similarity indices using subsampling techniques and a specific
#                 clustering algorithm. It may be one of the following:
#                 - Hierarchical.sim.resampling (hierarchical clustering algorithm, default)
#                 - Kmeans.sim.resampling (c-means algorithm)
#                 - PAM.sim.resampling (Partitioning Around Medoids algorithm)
#                 - Fuzzy.kmeans.sim.resampling (fuzzy c-means)
# distance : it must be one of the two: "euclidean" (default) or "pearson" (that is, 1 - Pearson correlation)
# hmethod : the agglomeration method to be used. This parameter is used only by the hierarchical clustering
#           algorithm. This should be one of the following:
#           "ward", "single", "complete", "average", "mcquitty", "median" or "centroid", according to the hclust
#           method of the package stats.
# Output:
# sim.matrix : a matrix that stores the similarity between pairs of clusterings across multiple numbers of clusters
#              and multiple clusterings performed on subsamples of the original data.
#              Number of rows equal to the length of c (numbers of clusters); number of columns equal to nsub,
#              that is the number of subsamples considered for each number of clusters.
do.similarity.resampling <- function(X, c=2, nsub=100, f=0.8, s=sFM, alg.clust.sim=Hierarchical.sim.resampling,
                                     distance="euclidean", hmethod="ward") {
  if (length(c) == 1)
    c <- 2:c;
  num.clustering <- length(c);
  for (i in c)
    if (i < 2) {
      print("Number of clusters must be at least 2.");
      return(0);
    }
  sim.matrix <- matrix(numeric(num.clustering*nsub), nrow=num.clustering);
  j <- 0;
  for (num.clust in c) {
    j <- j + 1;
    sim.matrix[j,] <- alg.clust.sim(X, c=num.clust, nsub=nsub, f=f, s=s, distance=distance, hmethod=hmethod);
  }
  return(sim.matrix);
}
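##############################################################################
# Example (illustrative sketch, not run at source time): the perturbation
# driver is agnostic to the underlying clustering algorithm and similarity
# measure; both are swapped via function arguments. `X.toy` is synthetic.
if (FALSE) {
  set.seed(1);
  X.toy <- matrix(rnorm(100*40), nrow=100);
  # PAM instead of hierarchical clustering, Jaccard instead of Fowlkes-Mallows
  S.pam <- do.similarity.resampling(X.toy, c=c(2,3,5), nsub=20, f=0.8,
                                    s=sJaccard, alg.clust.sim=PAM.sim.resampling);
  dim(S.pam);  # 3 rows (k = 2, 3, 5) x 20 columns
}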
############################################################################################
# Function that iteratively computes sets of similarity indices using random projection techniques.
# This function may use different clustering algorithms and different similarity measures to compute the
# similarity indices. Random projection techniques are applied to perturb the data.
# Input:
# X : matrix of data (variables are rows, examples columns)
# c : if it is a vector of length 1, numbers of clusters from 2 to c are considered; otherwise the numbers of
#     clusters stored in the vector c are considered.
# nprojections : number of pairs of projected data
# dim : dimension of the projected data
# pmethod : projection method. It must be one of the following:
#           "RS" (random subspace projection)
#           "PMO" (Plus Minus One random projection) (default)
#           "Norm" (normal random projection)
#           "Achlioptas" (Achlioptas random projection)
# scale : if TRUE randomized projections are scaled (default)
# seed : numerical seed for the random generator
# s : similarity function to be used. It may be one of the following:
#     - sFM (Fowlkes and Mallows)
#     - sJaccard (Jaccard)
#     - sM (matching coefficient)
#     (default Fowlkes and Mallows)
# alg.clust.sim : method that computes the similarity indices using randomized map techniques and a specific
#                 clustering algorithm. It may be one of the following:
#                 - Hierarchical.sim.projection (hierarchical clustering algorithm, default)
#                 - Kmeans.sim.projection (c-means algorithm)
#                 - PAM.sim.projection (Partitioning Around Medoids algorithm)
#                 - Fuzzy.kmeans.sim.projection (fuzzy c-means)
# distance : it must be one of the two: "euclidean" (default) or "pearson" (that is, 1 - Pearson correlation)
# hmethod : the agglomeration method to be used if hierarchical clustering is chosen. This parameter is used only
#           by the hierarchical clustering algorithm. This should be one of the following:
#           "ward", "single", "complete", "average", "mcquitty", "median" or "centroid", according to the hclust
#           method of the package stats.
# Output:
# sim.matrix : a matrix that stores the similarity between pairs of clusterings across multiple numbers of clusters
#              and multiple clusterings performed on randomly projected data.
#              Number of rows equal to the length of c (numbers of clusters); number of columns equal to
#              nprojections, that is the number of projections considered for each number of clusters.
do.similarity.projection <- function(X, c=2, nprojections=100, dim=2, pmethod="PMO", scale=TRUE, seed=100, s=sFM,
                                     alg.clust.sim=Hierarchical.sim.projection, distance="euclidean", hmethod="ward") {
  if (length(c) == 1)
    c <- 2:c;
  num.clustering <- length(c);
  for (i in c)
    if (i < 2) {
      print("Number of clusters must be at least 2.");
      return(0);
    }
  sim.matrix <- matrix(numeric(num.clustering*nprojections), nrow=num.clustering);
  j <- 0;
  for (num.clust in c) {
    j <- j + 1;
    sim.matrix[j,] <- alg.clust.sim(X, c=num.clust, nprojections=nprojections, dim=dim, pmethod=pmethod,
                                    scale=scale, seed=seed, s=s, distance=distance, hmethod=hmethod);
  }
  return(sim.matrix);
}
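##############################################################################
# Example (illustrative sketch, not run at source time): random projections as
# the perturbation. `X.toy` is synthetic; dim=10 is an arbitrary choice for the
# projected dimension and should normally be tuned to the data.
if (FALSE) {
  set.seed(1);
  X.toy <- matrix(rnorm(500*40), nrow=500);
  S.proj <- do.similarity.projection(X.toy, c=5, nprojections=20, dim=10, pmethod="PMO");
  rowMeans(S.proj);  # stability index for k = 2, ..., 5
}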
############################################################################################
# Function that iteratively computes sets of similarity indices using injection of gaussian noise.
# This function may use different clustering algorithms and different similarity measures to compute the
# similarity indices. Injection of gaussian noise is applied to perturb the data.
# The gaussian noise added to the data has 0 mean, and its standard deviation is estimated from the data (it is
# set to a given percentile value of the standard deviations computed for each variable).
# Input:
# X : matrix of data (variables are rows, examples columns)
# c : if it is a vector of length 1, numbers of clusters from 2 to c are considered; otherwise the numbers of
#     clusters stored in the vector c are considered.
# nnoisy : number of pairs of noisy data
# perc : percentile of the standard deviations to be used for the added gaussian noise (default: median)
# seed : numerical seed for the random generator
# s : similarity function to be used. It may be one of the following:
#     - sFM (Fowlkes and Mallows)
#     - sJaccard (Jaccard)
#     - sM (matching coefficient or Rand Index)
#     (default Fowlkes and Mallows)
# alg.clust.sim : method that computes the similarity indices using injections of gaussian noise and a specific
#                 clustering algorithm. It may be one of the following:
#                 - Hierarchical.sim.noise (hierarchical clustering algorithm, default)
#                 - Kmeans.sim.noise (c-means algorithm)
#                 - PAM.sim.noise (Partitioning Around Medoids algorithm)
#                 - Fuzzy.kmeans.sim.noise (fuzzy c-means)
# distance : it must be one of the two: "euclidean" (default) or "pearson" (that is, 1 - Pearson correlation)
# hmethod : the agglomeration method to be used if hierarchical clustering is chosen. This parameter is used only
#           by the hierarchical clustering algorithm. This should be one of the following:
#           "ward", "single", "complete", "average", "mcquitty", "median" or "centroid", according to the hclust
#           method of the package stats.
# Output:
# sim.matrix : a matrix that stores the similarity between pairs of clusterings across multiple numbers of clusters
#              and multiple clusterings performed on noise-perturbed data.
#              Number of rows equal to the length of c (numbers of clusters); number of columns equal to nnoisy,
#              that is the number of perturbations by noise for each number of clusters.
do.similarity.noise <- function(X, c=2, nnoisy=100, perc=0.5, seed=100, s=sFM, alg.clust.sim=Hierarchical.sim.noise,
                                distance="euclidean", hmethod="ward") {
  if (length(c) == 1)
    c <- 2:c;
  num.clustering <- length(c);
  for (i in c)
    if (i < 2) {
      print("Number of clusters must be at least 2.");
      return(0);
    }
  set.seed(seed);
  sim.matrix <- matrix(numeric(num.clustering*nnoisy), nrow=num.clustering);
  j <- 0;
  for (num.clust in c) {
    j <- j + 1;
    sim.matrix[j,] <- alg.clust.sim(X, c=num.clust, nnoisy=nnoisy, perc=perc, s=s, distance=distance, hmethod=hmethod);
  }
  return(sim.matrix);
}
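##############################################################################
# Example (illustrative sketch, not run at source time): gaussian noise
# injection as the perturbation. perc=0.5 uses the median of the per-variable
# standard deviations as the standard deviation of the added noise; `X.toy` is
# synthetic.
if (FALSE) {
  set.seed(1);
  X.toy <- matrix(rnorm(100*40), nrow=100);
  S.noise <- do.similarity.noise(X.toy, c=4, nnoisy=20, perc=0.5);
  plot.multiple.hist.similarity(S.noise);  # one histogram per k = 2, 3, 4
}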
############################################################################################
# Function to compute the cluster similarities for a given number of clusters using
# subsampling techniques and hierarchical clustering.
# A vector of similarity measures between pairs of clusterings perturbed with resampling techniques is computed
# for a given number of clusters. The fraction of the resampled data (without replacement), the similarity measure
# and the type of hierarchical clustering may be selected.
# Input:
# X : matrix of data (variables are rows, examples columns)
# c : number of clusters
# nsub : number of subsamples
# f : fraction of the data resampled without replacement
# s : similarity function to be used. It may be one of the following:
#     - sFM (Fowlkes and Mallows)
#     - sJaccard (Jaccard)
#     - sM (matching coefficient)
#     (default Fowlkes and Mallows)
# distance : it must be one of the two: "euclidean" (default) or "pearson" (that is, 1 - Pearson correlation)
# hmethod : the agglomeration method to be used. This should be one of
#           "ward", "single", "complete", "average", "mcquitty", "median" or "centroid", according to the hclust
#           method of the package stats.
# Output:
# vector of the computed similarity measures (length equal to nsub)
Hierarchical.sim.resampling <- function(X, c = 2, nsub=100, f = 0.8, s = sFM, distance="euclidean", hmethod="ward") {
  n <- ncol(X);
  n.sub.ex <- ceiling(n * f);
  sim.vector <- numeric(nsub);
  for (i in 1:nsub) {
    sub1 <- sample(n, n.sub.ex);
    Xsub1 <- X[,sub1];
    colnames(Xsub1) <- sub1;
    if (distance == "euclidean")
      d <- dist(t(Xsub1))
    else
      d <- as.dist(1 - cor(Xsub1));
    tr1 <- hclust(d, method = hmethod);
    plot(tr1, main=""); # rect.hclust needs the dendrogram to be plotted
    cl1 <- rect.hclust(tr1, k = c);
    M1 <- Do.boolean.membership.matrix(cl1, n.sub.ex, sub1);

    sub2 <- sample(n, n.sub.ex);
    Xsub2 <- X[,sub2];
    colnames(Xsub2) <- sub2;
    if (distance == "euclidean")
      d <- dist(t(Xsub2))
    else
      d <- as.dist(1 - cor(Xsub2));
    tr2 <- hclust(d, method = hmethod);
    plot(tr2, main="");
    cl2 <- rect.hclust(tr2, k = c);
    M2 <- Do.boolean.membership.matrix(cl2, n.sub.ex, sub2);

    # examples common to the two subsamples
    sub.common <- Intersect(sub1, sub2);
    # extract from the membership matrices the rows and columns that
    # correspond to the examples common to the two subsamples
    label.examples <- as.character(sub.common);
    M1 <- M1[label.examples, label.examples];
    M2 <- M2[label.examples, label.examples];
    sim.vector[i] <- s(M1, M2);
  }
  return(sim.vector);
}

############################################################################################
# Function to compute the cluster similarities for a given number of clusters using
# randomized maps and hierarchical clustering.
# A vector of similarity measures between pairs of clusterings perturbed with random projections is computed
# for a given number of clusters. The dimension of the projected data, the type of randomized map, the similarity
# measure and the type of hierarchical clustering may be selected.
# Input:
# X : matrix of data (variables are rows, examples columns)
# c : number of clusters
# nprojections : number of pairs of projected data
# dim : dimension of the projected data
# pmethod : projection method. It must be one of the following:
#           "RS" (random subspace projection)
#           "PMO" (Plus Minus One random projection)
#           "Norm" (normal random projection)
#           "Achlioptas" (Achlioptas random projection)
# scale : if TRUE randomized projections are scaled (default)
# seed : numerical seed for the random generator
# s : similarity function to be used. It may be one of the following:
#     - sFM (Fowlkes and Mallows)
#     - sJaccard (Jaccard)
#     - sM (matching coefficient)
#     (default Fowlkes and Mallows)
# distance : it must be one of the two: "euclidean" (default) or "pearson" (that is, 1 - Pearson correlation)
# hmethod : the agglomeration method to be used. This should be one of
#           "ward", "single", "complete", "average", "mcquitty", "median" or "centroid", according to the hclust
#           method of the package stats.
# Output:
# vector of the computed similarity measures (length equal to nprojections)
Hierarchical.sim.projection <- function(X, c = 2, nprojections=100, dim=2, pmethod="RS", scale=TRUE, seed=100,
                                        s=sFM, distance="euclidean", hmethod="ward") {
  n <- ncol(X);
  sim.vector <- numeric(nprojections);
  cl <- Multiple.Random.hclustering(X, dim=dim, pmethod=pmethod, c=c, hmethod=hmethod, n=nprojections*2,
                                    scale=scale, distance=distance, seed=seed);
  for (i in 1:nprojections) {
    cl1 <- cl[[i]];
    M1 <- Do.boolean.membership.matrix(cl1, n, 1:n);
    cl2 <- cl[[i+nprojections]];
    M2 <- Do.boolean.membership.matrix(cl2, n, 1:n);
    sim.vector[i] <- s(M1, M2);
  }
  return(sim.vector);
}

############################################################################################
# Function to compute cluster similarities for a given number of clusters using
# injections of random noise and hierarchical clustering.
# A vector of similarity measures between pairs of clusterings perturbed with random noise is computed for a given
# number of clusters. The standard deviation of the added gaussian noise, estimated from the data as the perc
# percentile of the standard deviations of the input variables, the percentile itself, the similarity measure and
# the type of hierarchical clustering may be selected.
# Input:
# X : matrix of data (variables are rows, examples columns)
# c : number of clusters
# nnoisy : number of pairs of noisy data
# perc : percentile of the standard deviations to be used for the added gaussian noise (def. 0.5)
# s : similarity function to be used. It may be one of the following:
#     - sFM (Fowlkes and Mallows)
#     - sJaccard (Jaccard)
#     - sM (matching coefficient)
#     (default Fowlkes and Mallows)
# distance : it must be one of the two: "euclidean" (default) or "pearson" (that is, 1 - Pearson correlation)
# hmethod : the agglomeration method to be used. This should be one of
#           "ward", "single", "complete", "average", "mcquitty", "median" or "centroid", according to the hclust
#           method of the package stats.
# Output:
# vector of the computed similarity measures (length equal to nnoisy)
Hierarchical.sim.noise <- function(X, c = 2, nnoisy=100, perc=0.5, s = sFM, distance="euclidean", hmethod="ward") {
  n <- ncol(X);
  sim.vector <- numeric(nnoisy);
  for (i in 1:nnoisy) {
    X1 <- perturb.by.noise(X, perc);
    if (distance == "euclidean")
      d <- dist(t(X1))
    else
      d <- as.dist(1 - cor(X1));
    tr1 <- hclust(d, method = hmethod);
    plot(tr1, main=""); # rect.hclust needs the dendrogram to be plotted
    cl1 <- rect.hclust(tr1, k = c);
    M1 <- Do.boolean.membership.matrix(cl1, n, 1:n);

    X2 <- perturb.by.noise(X, perc);
    if (distance == "euclidean")
      d <- dist(t(X2))
    else
      d <- as.dist(1 - cor(X2));
    tr2 <- hclust(d, method = hmethod);
    plot(tr2, main="");
    cl2 <- rect.hclust(tr2, k = c);
    M2 <- Do.boolean.membership.matrix(cl2, n, 1:n);
    sim.vector[i] <- s(M1, M2);
  }
  return(sim.vector);
}

############################################################
######### K-MEANS CLUSTERING ###############################
############################################################

############################################################################################
# Function to compute cluster similarities for a given number of clusters using
# k-means clustering.
# A vector of similarity measures between pairs of clusterings perturbed with resampling techniques is computed
# for a given number of clusters. The fraction of the resampled data (without replacement) and the similarity
# measure can be selected.
# Input:
# X : matrix of data (variables are rows, examples columns)
# c : number of clusters
# nsub : number of subsamples
# f : fraction of the data resampled without replacement
# s : similarity function to be used (default Fowlkes and Mallows similarity measure)
# distance : currently only the euclidean distance ("euclidean", default) is available
# hmethod : parameter used for internal compatibility.
# Output:
# vector of the computed similarity measures (length equal to nsub)
Kmeans.sim.resampling <- function(X, c = 2, nsub=100, f = 0.8, s = sFM, distance="euclidean", hmethod=NULL) {
  n <- ncol(X);
  n.sub.ex <- ceiling(n * f);
  sim.vector <- numeric(nsub);
  for (i in 1:nsub) {
    sub1 <- sample(n, n.sub.ex);
    Xsub1 <- X[,sub1];
    colnames(Xsub1) <- sub1;
    r <- kmeans(t(Xsub1), centers=c, iter.max = 1000);
    cl1 <- Transform.vector.to.list(r$cluster);
    M1 <- Do.boolean.membership.matrix(cl1, n.sub.ex, sub1);

    sub2 <- sample(n, n.sub.ex);
    Xsub2 <- X[,sub2];
    colnames(Xsub2) <- sub2;
    r <- kmeans(t(Xsub2), centers=c, iter.max = 1000);
    cl2 <- Transform.vector.to.list(r$cluster);
    M2 <- Do.boolean.membership.matrix(cl2, n.sub.ex, sub2);

    # examples common to the two subsamples
    sub.common <- Intersect(sub1, sub2);
    # extract from the membership matrices the rows and columns that
    # correspond to the examples common to the two subsamples
    label.examples <- as.character(sub.common);
    M1 <- M1[label.examples, label.examples];
    M2 <- M2[label.examples, label.examples];
    sim.vector[i] <- s(M1, M2);
  }
  return(sim.vector);
}

############################################################################################
# Function to compute cluster similarities for a given number of clusters using
# randomized maps and k-means clustering.
# A vector of similarity measures between pairs of clusterings perturbed with random projections is computed
# for a given number of clusters. The dimension of the projected data, the type of randomized map and the
# similarity measure can be selected.
# Input:
# X : matrix of data (variables are rows, examples columns)
# c : number of clusters
# nprojections : number of pairs of projected data
# dim : dimension of the projected data
# pmethod : projection method. It must be one of the following:
#           "RS" (random subspace projection)
#           "PMO" (Plus Minus One random projection)
#           "Norm" (normal random projection)
#           "Achlioptas" (Achlioptas random projection)
# scale : if TRUE randomized projections are scaled (default)
# seed : numerical seed for the random generator
# s : similarity function to be used (default Fowlkes and Mallows similarity measure)
# distance : currently only the euclidean distance ("euclidean", default) is available
# hmethod : parameter used for internal compatibility.
# Output:
# vector of the computed similarity measures (length equal to nprojections)
Kmeans.sim.projection <- function(X, c = 2, nprojections=100, dim=2, pmethod="PMO", scale=TRUE, seed=100, s=sFM,
                                  distance="euclidean", hmethod="ward") {
  n <- ncol(X);
  sim.vector <- numeric(nprojections);
  cl <- Multiple.Random.kmeans(X, dim=dim, pmethod=pmethod, c=c, n=nprojections*2, scale=scale, seed=seed);
  for (i in 1:nprojections) {
    cl1 <- cl[[i]];
    M1 <- Do.boolean.membership.matrix(cl1, n, 1:n);
    cl2 <- cl[[i+nprojections]];
    M2 <- Do.boolean.membership.matrix(cl2, n, 1:n);
    sim.vector[i] <- s(M1, M2);
  }
  return(sim.vector);
}

############################################################################################
# Function to compute the cluster similarities for a given number of clusters using
# injections of random noise and k-means clustering.
# A vector of similarity measures between pairs of clusterings perturbed with random noise is computed for a given
# number of clusters. The standard deviation of the added gaussian noise, estimated from the data as the perc
# percentile of the standard deviations of the input variables, the percentile itself and the similarity measure
# can be selected.
# Input:
# X : matrix of data (variables are rows, examples columns)
# c : number of clusters
# nnoisy : number of pairs of noisy data
# perc : percentile of the standard deviations to be used for the added gaussian noise (def. 0.5)
# s : similarity function to be used (default Fowlkes and Mallows similarity measure)
# distance : currently only the euclidean distance ("euclidean", default) is available
# hmethod : parameter used for internal compatibility.
# Output:
# vector of the computed similarity measures (length equal to nnoisy)
Kmeans.sim.noise <- function(X, c = 2, nnoisy=100, perc=0.5, s = sFM, distance="euclidean", hmethod=NULL) {
  n <- ncol(X);
  sim.vector <- numeric(nnoisy);
  for (i in 1:nnoisy) {
    X1 <- perturb.by.noise(X, perc);
    r <- kmeans(t(X1), centers=c, iter.max = 1000);
    cl1 <- Transform.vector.to.list(r$cluster);
    M1 <- Do.boolean.membership.matrix(cl1, n, 1:n);

    X2 <- perturb.by.noise(X, perc);
    r <- kmeans(t(X2), centers=c, iter.max = 1000);
    cl2 <- Transform.vector.to.list(r$cluster);
    M2 <- Do.boolean.membership.matrix(cl2, n, 1:n);
    sim.vector[i] <- s(M1, M2);
  }
  return(sim.vector);
}

########################################################
######### PAM CLUSTERING ###############################
########################################################

############################################################################################
# Function to compute cluster similarities for a given number of clusters using
# PAM clustering.
# A vector of similarity measures between pairs of clusterings perturbed with resampling techniques is computed
# for a given number of clusters.
# The fraction of the resampled data (without replacement) and the similarity measure can be selected.
# Input:
# X : matrix of data (variables are rows, examples columns)
# c : number of clusters
# nsub : number of subsamples
# f : fraction of the data resampled without replacement
# s : similarity function to be used (default Fowlkes and Mallows similarity measure)
# distance : it must be one of the two: "euclidean" (default) or "pearson" (that is, 1 - Pearson correlation)
# hmethod : parameter used for internal compatibility.
# Output:
# vector of the computed similarity measures (length equal to nsub)
PAM.sim.resampling <- function(X, c = 2, nsub=100, f = 0.8, s = sFM, distance="euclidean", hmethod=NULL) {
  n <- ncol(X);
  n.sub.ex <- ceiling(n * f);
  sim.vector <- numeric(nsub);
  for (i in 1:nsub) {
    sub1 <- sample(n, n.sub.ex);
    Xsub1 <- X[,sub1];
    colnames(Xsub1) <- sub1;
    if (distance == "euclidean")
      d <- dist(t(Xsub1))
    else
      d <- as.dist(1 - cor(Xsub1));
    r <- pam(d, c, cluster.only=TRUE);
    cl1 <- Transform.vector.to.list(r);
    M1 <- Do.boolean.membership.matrix(cl1, n.sub.ex, sub1);

    sub2 <- sample(n, n.sub.ex);
    Xsub2 <- X[,sub2];
    colnames(Xsub2) <- sub2;
    if (distance == "euclidean")
      d <- dist(t(Xsub2))
    else
      d <- as.dist(1 - cor(Xsub2));
    r <- pam(d, c, cluster.only=TRUE);
    cl2 <- Transform.vector.to.list(r);
    M2 <- Do.boolean.membership.matrix(cl2, n.sub.ex, sub2);

    # examples common to the two subsamples
    sub.common <- Intersect(sub1, sub2);
    # extract from the membership matrices the rows and columns that
    # correspond to the examples common to the two subsamples
    label.examples <- as.character(sub.common);
    M1 <- M1[label.examples, label.examples];
    M2 <- M2[label.examples, label.examples];
    sim.vector[i] <- s(M1, M2);
  }
  return(sim.vector);
}

############################################################################################
# Function to compute cluster similarities for a given number of clusters using
# randomized maps and PAM clustering.
# A vector of similarity measures between pairs of clusterings perturbed with random projections is computed
# for a given number of clusters. The dimension of the projected data, the type of randomized map and the
# similarity measure can be selected.
# Input:
# X : matrix of data (variables are rows, examples columns)
# c : number of clusters
# nprojections : number of pairs of projected data
# dim : dimension of the projected data
# pmethod : projection method. It must be one of the following:
#           "RS" (random subspace projection)
#           "PMO" (Plus Minus One random projection)
#           "Norm" (normal random projection)
#           "Achlioptas" (Achlioptas random projection)
# scale : if TRUE randomized projections are scaled (default)
# seed : numerical seed for the random generator
# s : similarity function to be used (default Fowlkes and Mallows similarity measure)
# distance : it must be one of the two: "euclidean" (default) or "pearson" (that is, 1 - Pearson correlation)
# hmethod : parameter used for internal compatibility.
# Output:
# vector of the computed similarity measures (length equal to nprojections)
PAM.sim.projection <- function(X, c = 2, nprojections=100, dim=2, pmethod="PMO", scale=TRUE, seed=100, s=sFM,
                               distance="euclidean", hmethod=NULL) {
  n <- ncol(X);
  sim.vector <- numeric(nprojections);
  cl <- Multiple.Random.PAM(X, dim=dim, pmethod=pmethod, c=c, n=nprojections*2, scale=scale, seed=seed,
                            distance=distance);
  for (i in 1:nprojections) {
    cl1 <- cl[[i]];
    M1 <- Do.boolean.membership.matrix(cl1, n, 1:n);
    cl2 <- cl[[i+nprojections]];
    M2 <- Do.boolean.membership.matrix(cl2, n, 1:n);
    sim.vector[i] <- s(M1, M2);
  }
  return(sim.vector);
}

############################################################################################
# Function to compute cluster similarities for a given number of clusters using
# injections of random noise and PAM clustering.
# A vector of similarity measures between pairs of clusterings perturbed with random noise is computed for a given
# number of clusters. The standard deviation of the added gaussian noise, estimated from the data as the perc
# percentile of the standard deviations of the input variables, the percentile itself and the similarity measure
# can be selected.
# Input:
# X : matrix of data (variables are rows, examples columns)
# c : number of clusters
# nnoisy : number of pairs of noisy data
# perc : percentile of the standard deviations to be used for the added gaussian noise (def. 0.5)
# s : similarity function to be used (default Fowlkes and Mallows similarity measure)
# distance : it must be one of the two: "euclidean" (default) or "pearson" (that is, 1 - Pearson correlation)
# hmethod : parameter used for internal compatibility.
# Output:
# vector of the computed similarity measures (length equal to nnoisy)
PAM.sim.noise <- function(X, c = 2, nnoisy=100, perc=0.5, s = sFM, distance="euclidean", hmethod=NULL) {
  n <- ncol(X);
  sim.vector <- numeric(nnoisy);
  for (i in 1:nnoisy) {
    X1 <- perturb.by.noise(X, perc);
    if (distance == "euclidean")
      d <- dist(t(X1))
    else
      d <- as.dist(1 - cor(X1));
    r <- pam(d, c, cluster.only=TRUE);
    cl1 <- Transform.vector.to.list(r);
    M1 <- Do.boolean.membership.matrix(cl1, n, 1:n);

    X2 <- perturb.by.noise(X, perc);
    if (distance == "euclidean")
      d <- dist(t(X2))
    else
      d <- as.dist(1 - cor(X2));
    r <- pam(d, c, cluster.only=TRUE);
    cl2 <- Transform.vector.to.list(r);
    M2 <- Do.boolean.membership.matrix(cl2, n, 1:n);
    sim.vector[i] <- s(M1, M2);
  }
  return(sim.vector);
}

############################################################
######### FUZZY K-MEANS CLUSTERING #########################
############################################################

############################################################################################
# Function to compute cluster similarities for a given number of clusters using
# fuzzy k-means clustering.
# A vector of similarity measures between pairs of clusterings perturbed with resampling techniques is computed
# for a given number of clusters. The fraction of the resampled data (without replacement) and the similarity
# measure can be selected.
# Input:
# X : matrix of data (variables are rows, examples columns)
# c : number of clusters
# nsub : number of subsamples
# f : fraction of the data resampled without replacement
# s : similarity function to be used (default Fowlkes and Mallows similarity measure)
# distance : it must be one of the two: "euclidean" (default) or "pearson" (that is, 1 - Pearson correlation)
# hmethod : parameter used for internal compatibility.
# Output:
# vector of the computed similarity measures (length equal to nsub)
Fuzzy.kmeans.sim.resampling <- function(X, c = 2, nsub=100, f = 0.8, s = sFM, distance="euclidean", hmethod=NULL) {
  n <- ncol(X);
  n.sub.ex <- ceiling(n * f);
  sim.vector <- numeric(nsub);
  for (i in 1:nsub) {
    sub1 <- sample(n, n.sub.ex);
    Xsub1 <- X[,sub1];
    colnames(Xsub1) <- sub1;
    if (distance == "euclidean")
      d <- dist(t(Xsub1))
    else
      d <- as.dist(1 - cor(Xsub1));
    r <- fanny(d, c, maxit = 1000);
    cl1 <- Transform.vector.to.list(r$clustering);
    M1 <- Do.boolean.membership.matrix(cl1, n.sub.ex, sub1);

    sub2 <- sample(n, n.sub.ex);
    Xsub2 <- X[,sub2];
    colnames(Xsub2) <- sub2;
    if (distance == "euclidean")
      d <- dist(t(Xsub2))
    else
      d <- as.dist(1 - cor(Xsub2));
    r <- fanny(d, c, maxit = 1000);
    cl2 <- Transform.vector.to.list(r$clustering);
    M2 <- Do.boolean.membership.matrix(cl2, n.sub.ex, sub2);

    # examples common to the two subsamples
    sub.common <- Intersect(sub1, sub2);
    # extract from the membership matrices the rows and columns that
    # correspond to the examples common to the two subsamples
    label.examples <- as.character(sub.common);
    M1 <- M1[label.examples, label.examples];
    M2 <- M2[label.examples, label.examples];
    sim.vector[i] <- s(M1, M2);
  }
  return(sim.vector);
}

############################################################################################
# Function to compute cluster similarities for a given number of clusters using
# randomized maps and fuzzy k-means clustering.
# A vector of similarity measures between pairs of clusterings perturbed with random projections is computed
# for a given number of clusters. The dimension of the projected data, the type of randomized map and the
# similarity measure can be selected.
# Input:
# X : matrix of data (variables are rows, examples columns)
# c : number of clusters
# nprojections : number of pairs of projected data
# dim : dimension of the projected data
# pmethod : projection method. It must be one of the following:
#           "RS" (random subspace projection)
#           "PMO" (Plus Minus One random projection)
#           "Norm" (normal random projection)
#           "Achlioptas" (Achlioptas random projection)
# scale : if TRUE randomized projections are scaled (default)
# seed : numerical seed for the random generator
# s : similarity function to be used (default Fowlkes and Mallows similarity measure)
# distance : it must be one of the two: "euclidean" (default) or "pearson" (that is, 1 - Pearson correlation)
# hmethod : parameter used for internal compatibility.
# Output:
# vector of the computed similarity measures (length equal to nprojections)
Fuzzy.kmeans.sim.projection <- function(X, c = 2, nprojections=100, dim=2, pmethod="PMO", scale=TRUE, seed=100,
                                        s=sFM, distance="euclidean", hmethod=NULL) {
  n <- ncol(X);
  sim.vector <- numeric(nprojections);
  cl <- Multiple.Random.fuzzy.kmeans(X, dim=dim, pmethod=pmethod, c=c, n=nprojections*2, scale=scale, seed=seed,
                                     distance=distance);
  for (i in 1:nprojections) {
    cl1 <- cl[[i]];
    M1 <- Do.boolean.membership.matrix(cl1, n, 1:n);
    cl2 <- cl[[i+nprojections]];
    M2 <- Do.boolean.membership.matrix(cl2, n, 1:n);
    sim.vector[i] <- s(M1, M2);
  }
  return(sim.vector);
}

############################################################################################
# Function to compute cluster similarities for a given number of clusters using
# injections of random noise and fuzzy k-means clustering.
# A vector of similarity measures between pairs of clusterings perturbed with random noise is computed for a given
# number of clusters.
# The standard deviation of the added gaussian noise, estimated from the data as the perc percentile of the
# standard deviations of the input variables, the percentile itself and the similarity measure can be selected.
# Input:
# X : matrix of data (variables are rows, examples columns)
# c : number of clusters
# nnoisy : number of pairs of noisy data
# perc : percentile of the standard deviations to be used for the added gaussian noise (def. 0.5)
# s : similarity function to be used (default Fowlkes and Mallows similarity measure)
# distance : it must be one of the two: "euclidean" (default) or "pearson" (that is, 1 - Pearson correlation)
# hmethod : parameter used for internal compatibility.
# Output:
# vector of the computed similarity measures (length equal to nnoisy)
Fuzzy.kmeans.sim.noise <- function(X, c = 2, nnoisy=100, perc=0.5, s = sFM, distance="euclidean", hmethod=NULL) {
  n <- ncol(X);
  sim.vector <- numeric(nnoisy);
  for (i in 1:nnoisy) {
    X1 <- perturb.by.noise(X, perc);
    if (distance == "euclidean")
      d <- dist(t(X1))
    else
      d <- as.dist(1 - cor(X1));
    r <- fanny(d, c, maxit = 1000);
    cl1 <- Transform.vector.to.list(r$clustering);
    M1 <- Do.boolean.membership.matrix(cl1, n, 1:n);

    X2 <- perturb.by.noise(X, perc);
    if (distance == "euclidean")
      d <- dist(t(X2))
    else
      d <- as.dist(1 - cor(X2));
    r <- fanny(d, c, maxit = 1000);
    cl2 <- Transform.vector.to.list(r$clustering);
    M2 <- Do.boolean.membership.matrix(cl2, n, 1:n);
    sim.vector[i] <- s(M1, M2);
  }
  return(sim.vector);
}

############################################################################################
# Other functions for specific clustering algorithms may be implemented simply by following the general
# scheme of the previous functions.
############################################################################################

##############################################################################
# Function to compute and build up a pairwise boolean membership matrix.
# This function may also be used with clusterings that do not strictly define a partition of the data, and with
# a variable number of clusters for each clustering.
# Input:
# cl : a clustering (list of vectors defining a clustering)
# dim.M : dimension of the membership matrix (number of examples)
# examplelabels : labels of the examples drawn from the original data.
# Output:
# M : the pairwise boolean membership matrix, whose elements are 1 if two examples fall in the same cluster, 0 otherwise
Do.boolean.membership.matrix <- function(cl, dim.M, examplelabels) {
  M <- matrix(integer(dim.M*dim.M), nrow=dim.M);
  colnames(M) <- rownames(M) <- examplelabels;
  singletons <- integer(dim.M);
  c <- length(cl); # number of clusters
  for (j in 1:c) {
    n.ex <- length(cl[[j]]);
    if (n.ex == 1)
      singletons[cl[[j]][1]] <- 1
    else {
      for (x1 in 1:(n.ex-1)) {
        for (x2 in (x1+1):n.ex) {
          x <- cl[[j]][x1];
          y <- cl[[j]][x2];
          M[x,y] <- 1;
        }
      }
    }
  }
  # symmetrize the matrix; the diagonal flags the singleton clusters
  for (x1 in 1:(dim.M-1))
    for (x2 in (x1+1):dim.M)
      M[x2,x1] <- M[x1,x2];
  for (x in 1:dim.M)
    M[x,x] <- singletons[x];
  return(M);
}
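##############################################################################
# Example (illustrative sketch, not run at source time): a 4-example clustering
# {1,3} {2} {4} yields a membership matrix with M[1,3] = M[3,1] = 1 and a
# diagonal marking the singleton clusters {2} and {4}.
if (FALSE) {
  cl.toy <- list(c(1,3), c(2), c(4));
  Do.boolean.membership.matrix(cl.toy, 4, as.character(1:4));
}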
##############################################################################
# Function to compute the intersection between the elements of two vectors.
# Given as input two sets of elements represented by two vectors, the intersection between the two sets is
# computed and the corresponding vector is returned.
# Input:
# sub1 : first vector
# sub2 : second vector
# Output:
# sub.int : vector that stores the elements common to the two input vectors
Intersect <- function(sub1, sub2) {
  sub.int <- integer(0);
  n1 <- length(sub1);
  n2 <- length(sub2);
  k <- 0;
  for (i in 1:n1)
    for (j in 1:n2)
      if (sub1[i] == sub2[j]) {
        k <- k + 1;
        sub.int[k] <- sub1[i];
        break;
      }
  return(sub.int);
}

##############################################################################
# Function to compute the Fowlkes and Mallows similarity between two clusterings.
# Input:
# M1 : boolean membership matrix representing the first clustering
# M2 : boolean membership matrix representing the second clustering
# Output:
# res : similarity measure between the two clusterings according to Fowlkes and Mallows
sFM <- function(M1, M2) {
  res <- sum(M1*M2) / sqrt(sum(M1*M1) * sum(M2*M2));
  return(res);
}

##############################################################################
# Function to compute the Jaccard similarity coefficient between two clusterings.
# Input:
# M1 : boolean membership matrix representing the first clustering
# M2 : boolean membership matrix representing the second clustering
# Output:
# res : similarity measure between the two clusterings according to the Jaccard coefficient
sJaccard <- function(M1, M2) {
  res <- sum(M1*M2) / (sum(M1*M1) + sum(M2*M2) - sum(M1*M2));
  return(res);
}

##############################################################################
# Function to compute the matching coefficient between two clusterings.
# Input:
# M1 : boolean membership matrix representing the first clustering
# M2 : boolean membership matrix representing the second clustering
# Output:
# res : similarity measure between the two clusterings according to the matching coefficient
sM <- function(M1, M2) {
  n <- nrow(M1);
  res <- 1 - sum((M1-M2)^2)/n^2;
  return(res);
}
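##############################################################################
# Example (illustrative sketch, not run at source time): the three measures
# agree on identical clusterings (similarity 1) and diverge as the clusterings
# differ. The toy clusterings below are made up.
if (FALSE) {
  A <- Do.boolean.membership.matrix(list(c(1,2), c(3,4)), 4, as.character(1:4));
  B <- Do.boolean.membership.matrix(list(c(1,2,3), c(4)), 4, as.character(1:4));
  c(FM=sFM(A,A), Jaccard=sJaccard(A,A), Matching=sM(A,A));  # all equal to 1
  c(FM=sFM(A,B), Jaccard=sJaccard(A,B), Matching=sM(A,B));
}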
##############################################################################
# Function to plot the histogram of the similarities for a given number of clusters.
# Input:
# sim : vector of similarity values
# nbins : number of bins of the histogram
plot.hist.similarity <- function(sim, nbins=25) {
  num.samples <- length(sim);
  minimum <- min(sim);
  if (minimum > 0.5)
    minimum <- 0.5;
  hist(sim, breaks=seq(minimum, 1, length.out=nbins), ylim=c(0,num.samples), main="", xlab="Similarity");
}

##############################################################################
# Function to plot multiple histograms of the similarities for different numbers of clusters.
# This function outputs a set of histograms (one for each number of clusters, i.e. one for each row of
# the matrix S of similarity values).
# Input:
# S : matrix of similarity values
# n.col : number of columns in the grid of the histograms
# nbins : number of bins of the histograms
# labels : labels of the histograms. If NULL (default) the numbers of clusters from 2 to nrow(S)+1 are used.
plot.multiple.hist.similarity <- function(S, n.col=3, labels=NULL, nbins=25) {
  num.samples <- ncol(S);
  n.graph <- nrow(S);
  n.row <- ceiling(n.graph/n.col);
  op <- par(mfrow=c(n.row, n.col));
  minimum <- min(S);
  if (minimum > 0.5)
    minimum <- 0.5;
  if (is.null(labels))
    lab <- 2:(n.graph+1)
  else
    lab <- labels;
  for (i in 1:n.graph) {
    title <- paste("k = ", lab[i]);
    hist(S[i,], breaks=seq(minimum, 1, length.out=nbins), ylim=c(0,num.samples), main="", xlab=title, ylab="");
  }
  par(op);
}

##############################################################################
# Function to compute the empirical cumulative distribution function (ECDF) of the similarity measures
# for different numbers of clusters.
# Input:
# sim.matrix : a matrix that stores the similarity between pairs of clusterings across multiple numbers of clusters
#              and multiple clusterings.
#              Each row corresponds to a different number of clusters; the number of columns is equal to the number
#              of subsamples considered for each number of clusters.
# Output:
# list.F : a list of functions of class ecdf
compute.cumulative.multiple <- function(sim.matrix) {
  list.F <- list();
  n <- nrow(sim.matrix);
  for (i in 1:n)
    list.F[[i]] <- ecdf(sim.matrix[i,]);
  return(list.F);
}

##############################################################################
# Function that returns the values of the empirical cumulative distribution.
# Input:
# F : function of class ecdf that stores the discrete values of the cumulative distribution
# Output:
# l : a list with two elements: the "x" element stores a vector with the values of the random variable for
#     which the cumulative distribution needs to be computed; the "y" element stores a vector with the
#     corresponding values of the cumulative distribution (i.e. y = F(x)).
cumulative.values <- function(F) {
  e <- environment(F);
  l <- list(x=e$x, y=e$y);
  return(l);
}

##############################################################################
# Function to plot the ECDF of the similarity values.
# Input:
# F : function of class ecdf that stores the discrete values of the cumulative distribution
# Output:
# plot of the cumulative distribution
plot.cumulative <- function(F) {
  plot(F, xlim=c(0,1), ylim=c(0,1), xlab="similarities", ylab="cumulative distribution", main="",
       do.points=FALSE, verticals=TRUE);
}
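##############################################################################
# Example (illustrative sketch, not run at source time): stable clusterings
# concentrate their similarity values near 1, so their ECDF stays low until
# close to 1 and the integral of the ECDF (see compute.integral below) is
# small. Toy values only.
if (FALSE) {
  S.toy <- rbind(runif(50, 0.9, 1.0),   # a stable clustering
                 runif(50, 0.4, 1.0));  # an unstable one
  F.list <- compute.cumulative.multiple(S.toy);
  cumulative.values(F.list[[1]])$x;     # the jump points of the first ECDF
  sapply(F.list, compute.integral);     # smaller integral ==> more stable
}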
##############################################################################
# Function to plot a set of ECDFs of the similarity values.
# Graphs of the empirical cumulative distributions corresponding to different numbers of clusters are plotted,
# using different line patterns and/or different colors for each graph. Up to 15 ECDF graphs can be plotted
# simultaneously.
# Input:
# list.F : a list of functions of class ecdf
# labels : vector of the labels associated to the ECDFs. If NULL (default), then a vector of labels from 2 to
#          length(list.F)+1 is used.
# min.x : minimum similarity value to be plotted. If -1 (default) the minimum of the similarity values is
#         obtained from list.F
# colors : if TRUE (default) different colors are used to plot the different ECDFs, otherwise black lines are used
# Output:
# plot of the cumulative distributions
plot.cumulative.multiple <- function(list.F, labels=NULL, min.x=-1, colors=TRUE) {
  MAX_INTEGRALS = 15;
  minimum = 1;
  num.F <- length(list.F);
  if (num.F > MAX_INTEGRALS)
    num.F <- MAX_INTEGRALS;
  if (min.x == -1) {
    for (i in 1:num.F) {
      l <- cumulative.values(list.F[[i]]);
      localmin <- min(l$x);
      if (localmin < minimum)
        minimum <- localmin;
    }
  } else
    minimum <- min.x;
  # The drawing code of this function was lost in the available source; what follows is a minimal
  # reconstruction that plots each ECDF with a distinct line type (and color, if colors==TRUE).
  if (is.null(labels))
    labels <- 2:(num.F+1);
  plot(c(minimum,1), c(0,1), type="n", xlab="similarities", ylab="cumulative distribution", main="");
  for (i in 1:num.F)
    if (colors == TRUE)
      lines(list.F[[i]], lty=i, col=i, do.points=FALSE, verticals=TRUE)
    else
      lines(list.F[[i]], lty=i, do.points=FALSE, verticals=TRUE);
}

##############################################################################
# Function to plot sets of p-values obtained from multiple hypothesis tests against the clusterings.
# (The header and the opening lines of this function were lost in the available source; the signature and the
#  initializations below are a reconstruction inferred from the surviving body.)
# Input (reconstructed):
# l : a list of data frames as returned by Bernstein.compute.pvalues or Chi.square.compute.pvalues (one per test)
# alpha : significance level drawn as a horizontal reference line (default 0.01)
# legendy : y coordinate of the legend; if 0 no legend is drawn
# leg_label : labels of the tests for the legend. If NULL, "test 1", "test 2", ... are used
# colors : if TRUE different colors are used for the different tests
plot.pvalues.multiple <- function(l, alpha=0.01, legendy=0, leg_label=NULL, colors=TRUE) {
  n.test <- length(l);                        # reconstructed
  n.clusterings <- length(l[[1]]$p.value);    # reconstructed
  lpat <- 1:n.test;                           # line patterns, reconstructed
  plot(c(1,n.clusterings), c(0,1), type="n", xlab="clusterings", ylab="p-value"); # reconstructed
  if (n.test >= 1)
    if (colors == TRUE)
      for (i in 1:n.test) {
        lines(1:n.clusterings, l[[i]]$p.value, lty=lpat[i], type="o", col=i)
        # text(l$x[1],l$y[1],pos=1,labels=lab[i]);
      }
    else
      for (i in 1:n.test) {
        lines(1:n.clusterings, l[[i]]$p.value, lty=lpat[i], type="o");
        # text(l$x[1],l$y[1],pos=1,labels=lab[i]);
      }
  lines(1:n.clusterings, rep(alpha, n.clusterings), lty=2, type="l");
  text(ceiling(n.clusterings/2), alpha, labels=paste("alpha=", alpha));
  # making the legend
  if (is.null(leg_label))
    leg_label <- paste("test ", 1:n.test, sep="");
  if (legendy != 0)
    if (colors == TRUE)
      legend(1, legendy, legend=leg_label, lty=lpat[1:n.test], col=1:n.test)
    else
      legend(1, legendy, legend=leg_label, lty=lpat[1:n.test]);
}

##############################################################################
# Function to compute the integral of the ECDF of the similarity values.
# Input:
# F : function of class ecdf that stores the discrete values of the empirical cumulative distribution
# subdivisions : maximum number of subintervals used by the integration process
# Output:
# v : value of the estimated integral
compute.integral <- function(F, subdivisions=1000) {
  l <- integrate(F, 0, 1, subdivisions=subdivisions);
  return(l$value);
}

##############################################################################
# Function to compute the integrals of the ECDFs using only the similarity matrix.
# Each integral of the ECDF of the similarity is computed using the similarity measures between the clusterings.
# Input:
# sim.matrix : a matrix that stores the similarity between pairs of clusterings across multiple numbers of clusters
#              and multiple clusterings performed on subsamples of the original data.
#              Number of rows equal to the different numbers of clusters considered; number of columns equal to
#              the number of subsamples considered for each number of clusters.
# Output:
# res : vector of the values of the estimated integrals
compute.integral.from.similarity <- function(sim.matrix) {
  # for values in [0,1], the integral of the ECDF equals 1 minus the mean of each row of similarities
  return(as.vector(1 - rowMeans(sim.matrix)));
}

##############################################################################
# Function to generate a data set perturbed by noise.
# Gaussian noise is added to the data. The mean of the gaussian noise is 0 and its standard deviation is estimated
# from the data: the standard deviation of each variable is computed, and the standard deviation of the added
# gaussian noise is set to the given perc percentile of these values (see the McShane et al. paper for details).
# Input:
# X : matrix of data (variables are rows, examples columns)
# perc : percentile of the standard deviations (def: 0.5)
# Output:
# X.noisy : matrix of perturbed data (variables are rows, examples columns)
perturb.by.noise <- function(X, perc = 0.5) {
  if ((perc <= 0) || (perc > 1))
    stop("Percentile must be between 0 and 1");
  n.var <- nrow(X);
  n.samples <- ncol(X);
  sdev <- sort(apply(X, 1, sd)); # standard deviation of each variable (row)
  noise.sd <- sdev[ceiling(n.var*perc)];
  X.noisy <- matrix(rnorm(n.var*n.samples, sd=noise.sd), nrow=n.var);
  X.noisy <- X + X.noisy;
  return(X.noisy);
}
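##############################################################################
# Example (illustrative sketch, not run at source time): perturbing a synthetic
# data set with noise whose standard deviation is the median (perc=0.5) of the
# per-variable standard deviations.
if (FALSE) {
  set.seed(1);
  X.toy <- matrix(rnorm(10*20), nrow=10);
  X.pert <- perturb.by.noise(X.toy, perc=0.5);
  summary(as.vector(X.pert - X.toy)); # the added gaussian noise
}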