next up previous
Next: Examples of the usage Up: Introduction to the functionalities Previous: Introduction to the functionalities


An example of the usage of mosclust with synthetic data

As an example of the usage of mosclust with synthetic data, we will consider a 1000-dimensional synthetic multivariate Gaussian data set with relatively low cardinality (60 examples), characterized by a two-level hierarchical structure. This structure is highlighted by the projection of the data onto the two main principal components (Fig. 1): a two-level structure, with 2 and 6 clusters respectively, is self-evident in the data.

Figure 1: A synthetic data set with a two-level hierarchical structure with 2 and 6 clusters (data projected into the two components with highest variance according to PCA).
\includegraphics[width = 12cm]{synthdata.eps}




The source code of the example R script that generates and analyzes the synthetic data is the following (note that running the script may require a few minutes on a desktop computer):

##################################################################################################
# example-synthetic.R
# September 2006
# 
# Example script to evaluate the number of clusters in the data set generated by
# the synthetic data generator generate.sample6, using mosclust with 2 clustering
# algorithms and perturbation by resampling, noise addition and random projections
##################################################################################################
library(mosclust);

# ---- General parameters ----

# Number of pairs of clusterings to be evaluated
nsubsamples <- 30

# Maximum number of clusters to be evaluated
max.num.clust <- 10

# Fraction of the samples drawn when subsampling
fract.resampled <- 0.8

# Subspace dimension for the random projections
dim.projection <- JL.predict.dim(120, epsilon = 0.2)

# ---- Data set generation ----

# generate.sample6 is available in the clusterv package
M <- generate.sample6(n = 20, m = 10, dim = 1000, d = 3, s = 0.2)


##################################################
# Analysis with the PAM algorithm
#
# The same three steps are repeated for each perturbation scheme
# (resampling, noise, random projections):
#   a. compute the similarity matrix (similarity measure s = sFM) for up to
#      max.num.clust clusters, using nsubsamples pairs of clusterings;
#   b. compute the stability indices and the chi square-based p-values;
#   c. test the hypotheses at the 0.01 significance level to identify
#      the clustering solutions.
# Inputs read from the script environment: M, max.num.clust, nsubsamples,
# fract.resampled, dim.projection.

# 1. Perturbation by resampling
Sr.PAM.sample6 <- do.similarity.resampling(
  M,
  c = max.num.clust,
  nsub = nsubsamples,
  f = fract.resampled,
  s = sFM,
  alg.clust.sim = PAM.sim.resampling
)
dr.PAM.sample6 <- Chi.square.compute.pvalues(Sr.PAM.sample6)
hr.PAM.sample6 <- Hypothesis.testing(dr.PAM.sample6, alpha = 0.01)

# 2. Perturbation by noise
Sn.PAM.sample6 <- do.similarity.noise(
  M,
  c = max.num.clust,
  nnoisy = nsubsamples,
  perc = 0.5,
  s = sFM,
  alg.clust.sim = PAM.sim.noise
)
dn.PAM.sample6 <- Chi.square.compute.pvalues(Sn.PAM.sample6)
hn.PAM.sample6 <- Hypothesis.testing(dn.PAM.sample6, alpha = 0.01)

# 3. Perturbation by random projections
# The subspace dimension is passed explicitly: without dim = dim.projection
# the function's own default dimension is used and dim.projection has no effect.
Sp.PAM.sample6 <- do.similarity.projection(
  M,
  c = max.num.clust,
  nprojections = nsubsamples,
  dim = dim.projection,
  s = sFM,
  alg.clust.sim = PAM.sim.projection
)
dp.PAM.sample6 <- Chi.square.compute.pvalues(Sp.PAM.sample6)
hp.PAM.sample6 <- Hypothesis.testing(dp.PAM.sample6, alpha = 0.01)

# Save the similarity matrices (S*), the stability indices / p-values (d*)
# and the hypothesis testing results (h*) of the three PAM analyses
save(
  Sr.PAM.sample6, dr.PAM.sample6, hr.PAM.sample6,
  Sn.PAM.sample6, dn.PAM.sample6, hn.PAM.sample6,
  Sp.PAM.sample6, dp.PAM.sample6, hp.PAM.sample6,
  file = "sample6.PAM.objects"
)


##################################################
# Analysis with the k-means algorithm
#
# The same three steps are repeated for each perturbation scheme
# (resampling, noise, random projections):
#   a. compute the similarity matrix (similarity measure s = sFM) for up to
#      max.num.clust clusters, using nsubsamples pairs of clusterings;
#   b. compute the stability indices and the chi square-based p-values;
#   c. test the hypotheses at the 0.01 significance level to identify
#      the clustering solutions.
# Inputs read from the script environment: M, max.num.clust, nsubsamples,
# fract.resampled, dim.projection.

# 1. Perturbation by resampling
Sr.Kmeans.sample6 <- do.similarity.resampling(
  M,
  c = max.num.clust,
  nsub = nsubsamples,
  f = fract.resampled,
  s = sFM,
  alg.clust.sim = Kmeans.sim.resampling
)
dr.Kmeans.sample6 <- Chi.square.compute.pvalues(Sr.Kmeans.sample6)
hr.Kmeans.sample6 <- Hypothesis.testing(dr.Kmeans.sample6, alpha = 0.01)

# 2. Perturbation by noise
Sn.Kmeans.sample6 <- do.similarity.noise(
  M,
  c = max.num.clust,
  nnoisy = nsubsamples,
  perc = 0.5,
  s = sFM,
  alg.clust.sim = Kmeans.sim.noise
)
dn.Kmeans.sample6 <- Chi.square.compute.pvalues(Sn.Kmeans.sample6)
hn.Kmeans.sample6 <- Hypothesis.testing(dn.Kmeans.sample6, alpha = 0.01)

# 3. Perturbation by random projections
# The subspace dimension is passed explicitly: without dim = dim.projection
# the function's own default dimension is used and dim.projection has no effect.
Sp.Kmeans.sample6 <- do.similarity.projection(
  M,
  c = max.num.clust,
  nprojections = nsubsamples,
  dim = dim.projection,
  s = sFM,
  alg.clust.sim = Kmeans.sim.projection
)
dp.Kmeans.sample6 <- Chi.square.compute.pvalues(Sp.Kmeans.sample6)
hp.Kmeans.sample6 <- Hypothesis.testing(dp.Kmeans.sample6, alpha = 0.01)

# Save the similarity matrices (S*), the stability indices / p-values (d*)
# and the hypothesis testing results (h*) of the three k-means analyses
save(
  Sr.Kmeans.sample6, dr.Kmeans.sample6, hr.Kmeans.sample6,
  Sn.Kmeans.sample6, dn.Kmeans.sample6, hn.Kmeans.sample6,
  Sp.Kmeans.sample6, dp.Kmeans.sample6, hp.Kmeans.sample6,
  file = "sample6.Kmeans.objects"
)
#######################################################################################################################