###### PART 1: CLASSIFICATION ######

# LOAD THE IRIS DATASET AND CHECK IT ...
data(iris)

# ASSESS THE DATASET (HOW MANY ATTRIBUTES, THEIR TYPES, MISSING VALUES, MEAN, STDDEV, BOXPLOT) ...
# wiki on working with data frames in R:
# http://en.wikibooks.org/wiki/R_Programming/Working_with_data_frames
# some hints...
head(iris)
str(iris)
names(iris)
summary(iris)

# TASK: is it possible to predict the Species looking at the other attributes?
# --> CLASSIFICATION
# How to do it in R? The e1071 package
# http://cran.r-project.org/web/packages/e1071/e1071.pdf

# INSTALL AND LOAD THE e1071 PACKAGE
# http://cran.r-project.org/web/packages/e1071/index.html
install.packages('e1071', dependencies = TRUE)
library(e1071)

# CLASSIFICATION METHOD 1: NAIVE BAYES
# use naiveBayes(Data, Labels)

# TRAINING
iris_bayes = naiveBayes(iris[,1:4], iris[,5])
# explore the model
iris_bayes$apriori
iris_bayes$tables

# USING
# pick a flower (here #140) and use the model
f = iris[140,1:4]
predict(iris_bayes, f)

# *** A REAL CHECK: TRAIN + TEST
# randomly split the dataset:
iris_test = sample(1:nrow(iris), 50)
iris_train = setdiff(1:nrow(iris), iris_test)
iris_bayes_2 = naiveBayes(iris[iris_train,1:4], iris[iris_train,5])
res = predict(iris_bayes_2, iris[iris_test,1:4])
# display a confusion matrix
table(res, iris[iris_test,5])
# compute the accuracy
cm_iris = table(res, iris[iris_test,5])
# standard way to compute the sum of the diagonal values of a matrix
correct = 0
for(i in 1:ncol(cm_iris)) correct = correct + cm_iris[i,i]
# simplified R syntax for the same sum
correct = sum(diag(cm_iris))
accuracy = correct / sum(cm_iris)
# try to repeat from *** and observe how the internal structure (apriori & tables)
# and the accuracy change

# CLASSIFICATION METHOD 2: SUPPORT VECTOR MACHINES
iris_svm <- svm(iris[iris_train,1:4], iris[iris_train,5])
# alternative way of using the "svm" function:
# the 1st argument is a formula of the form "OutputAttribute ~ InputAttribute1 + InputAttribute2 + ..."
# the 2nd argument is a training set containing both the input attributes and the labels
iris_svm <- svm(Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width, iris[iris_train,])
# or the previous notation
iris_svm = svm(iris[iris_train,1:4], iris[iris_train,5])
# look at the internal structure of the SVM model (use autocompletion)
# iris_svm$...
res = predict(iris_svm, iris[iris_test,1:4])
cm_iris = table(res, iris[iris_test,5])
# again, try with different train/test splits and compute the accuracy
correct = sum(diag(cm_iris))
accuracy = correct / sum(cm_iris)

# CLASSIFICATION METHOD 3: DECISION TREES (INFORMATION-GAIN BASED MODEL)
# using the rpart package: http://cran.r-project.org/web/packages/rpart/rpart.pdf
install.packages("rpart", dependencies = TRUE)
library(rpart)
# the splitting index can be gini or information; it defaults to the gini index
iris_dt <- rpart(Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width,
                 data = iris[iris_train,], method = 'class')
# to use information gain:
# iris_dt <- rpart(Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width,
#                  data = iris, method = 'class', parms = list(split = "information"))

# explore the model
printcp(iris_dt)   # prints a table of optimal prunings based on a complexity parameter
summary(iris_dt)   # summary of the decision tree with the splitting decisions
plotcp(iris_dt)
x11(); plot(iris_dt)          # plot the decision tree graphically
text(iris_dt, use.n = TRUE)   # add labels to the decision tree
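# OPTIONAL (not part of the original exercise): a minimal pruning sketch.
# It assumes the standard rpart cptable columns ("CP", "xerror") and uses rpart's
# prune() to cut the tree back to the complexity parameter with the lowest
# cross-validated error reported by printcp() above; on iris the tree is already
# small, so the pruned tree may be identical.
best_cp = iris_dt$cptable[which.min(iris_dt$cptable[, "xerror"]), "CP"]
iris_dt_pruned = prune(iris_dt, cp = best_cp)
plot(iris_dt_pruned); text(iris_dt_pruned, use.n = TRUE)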
# USE THE MODEL
# (optionally draw a new random train/test split first)
# iris_test = sample(1:nrow(iris), 50)
# iris_train = setdiff(1:nrow(iris), iris_test)
# The "type" parameter of predict() defines the output format:
# with type = 'class' the output is a vector of predicted classes;
# with type = 'raw' the output contains, for each test item, a membership score for each class.
# example with type = 'raw'
# predict(iris_dt, iris[iris_test,1:4], type = 'raw')
#    setosa versicolor  virginica
# 23      1 0.00000000 0.00000000
# 6       1 0.00000000 0.00000000
# 56      0 0.90740741 0.09259259
# example with type = 'class'
# predict(iris_dt, iris[iris_test,1:4], type = 'class')
# [1] setosa setosa versicolor ...
res = predict(iris_dt, iris[iris_test,1:4], type = 'class')
cm_iris = table(res, iris[iris_test,5])
correct = sum(diag(cm_iris))
accuracy = correct / sum(cm_iris)

# LOAD THE ELECTIONS DATASET AND EXPERIMENT YOURSELF (we have already seen it) ...
# https://archive.ics.uci.edu/ml/datasets/Congressional+Voting+Records
install.packages("mlbench")
library(mlbench)
data(HouseVotes84)
str(HouseVotes84)   # warning: missing values!
HV_test = sample(1:nrow(HouseVotes84), 150)
HV_train = setdiff(1:nrow(HouseVotes84), HV_test)
# formula notation: the Class field contains the labels,
# and the attributes are the remaining fields of the data frame
HV_bayes = naiveBayes(Class ~ ., HouseVotes84[HV_train,])
# equivalent to the x/y notation used earlier
HV_bayes = naiveBayes(x = HouseVotes84[HV_train,2:17], y = HouseVotes84[HV_train,1])
res = predict(HV_bayes, HouseVotes84[HV_test,], type = "class")
cm_housevotes = table(res, HouseVotes84[HV_test,1])
correct = sum(diag(cm_housevotes))
accuracy = correct / sum(cm_housevotes)

###### PART 2: CLUSTERING ######

install.packages("fpc", dependencies = TRUE)
library(fpc)

# let's make a test: delete the class labels from iris
unsupervised_iris = iris[,1:4]

# CLUSTERING: k-means
# look at the options of the kmeans function
?kmeans
# try to use it
res = kmeans(unsupervised_iris, 3)
# res$...   (explore the model fields with autocompletion)
# let's check the "predictive" power
table(res$cluster, iris[,5])
# that's unfair: we knew from the beginning that there were 3 clusters in the dataset!
# try with different k values, and score the results:
# compute inter-cluster separation and intra-cluster cohesion for different k;
# the k with the best tradeoff is chosen (see the sketch at the end of the script)
res$withinss
res$betweenss
plotcluster(unsupervised_iris, res$cluster)

# CLUSTERING: density based
?dbscan
res = dbscan(unsupervised_iris, eps = 0.4)
# res$...   (explore the model fields with autocompletion)
table(res$cluster, iris[,5])
plotcluster(unsupervised_iris, res$cluster)
# try with different eps and MinPts values, and score the results
# eps = maximum radius of the neighborhood
# MinPts = minimum number of points in an eps-neighborhood of a point
# dbscan can automatically plot the produced clusters
x11(); res = dbscan(unsupervised_iris, MinPts = 5, eps = 0.4, showplot = 1)
# white points are outliers

# CLUSTERING: hierarchical
?hclust
?dist
d = dist(unsupervised_iris)
res = hclust(d, method = "complete")
# complete-link method: the distance between two clusters is the largest distance
# between an element in one cluster and an element in the other
# res$...   (explore the model fields with autocompletion)
plot(res)
cluster = cutree(res, h = 4)   # cut the dendrogram at a certain height
table(cluster, iris[,5])
cluster = cutree(res, k = 3)   # cut the dendrogram into a certain number of clusters
table(cluster, iris[,5])
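# OPTIONAL (not part of the original lab): a minimal sketch of scoring k-means
# over several k values, as suggested in the k-means section above.
# For each k it records the total within-cluster sum of squares (cohesion) and the
# between-cluster sum of squares (separation); the "elbow" of the within-cluster
# curve is a common heuristic for choosing k. The range 2:8 and nstart = 10 are
# just illustrative choices.
ks = 2:8
scores = sapply(ks, function(k) {
  fit = kmeans(unsupervised_iris, centers = k, nstart = 10)
  c(within = fit$tot.withinss, between = fit$betweenss)
})
plot(ks, scores["within",], type = "b", xlab = "k", ylab = "total within-cluster SS")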