###### PART 1: CLASSIFICATION ######

# LOAD THE IRIS DATASET AND CHECK IT ...
data(iris)

# ASSESS THE DATASET (HOW MANY ATTRIBUTES, THEIR TYPES, MISSING VALUES, MEAN, STDDEV, BOXPLOT) ...
# wiki on working with data frames in R:
# http://en.wikibooks.org/wiki/R_Programming/Working_with_data_frames
# some hints...
head(iris)
str(iris)
names(iris)
summary(iris)

# TASK: is it possible to predict the Species looking at the other attributes?
# --> CLASSIFICATION
# How to do it in R? The e1071 package
# http://cran.r-project.org/web/packages/e1071/e1071.pdf

# INSTALL AND LOAD THE e1071 PACKAGE
# http://cran.r-project.org/web/packages/e1071/index.html
install.packages('e1071', dependencies = TRUE)
library(e1071)

# CLASSIFICATION METHOD 1: NAIVE BAYES
# use naiveBayes(Data, Labels)

# TRAINING
iris_bayes = naiveBayes(iris[,1:4], iris[,5])
# explore the model
iris_bayes$apriori
iris_bayes$tables

# USING
# pick a flower (here #140) and use the model
f = iris[140,1:4]
predict(iris_bayes, f)

# *** A REAL CHECK: TRAIN + TEST
# randomly split the dataset:
iris_test = sample(1:nrow(iris), 50)
iris_train = setdiff(1:nrow(iris), iris_test)
iris_bayes_2 = naiveBayes(iris[iris_train,1:4], iris[iris_train,5])
res = predict(iris_bayes_2, iris[iris_test,1:4])
# display a confusion matrix
table(res, iris[iris_test,5])
# compute the accuracy
cm_iris = table(res, iris[iris_test,5])
# standard way to compute the sum of the diagonal values of a matrix
correct = 0
for(i in 1:ncol(cm_iris)) correct = correct + cm_iris[i,i]
# simplified R syntax for the same sum
correct = sum(diag(cm_iris))
accuracy = correct / sum(cm_iris)
# try to repeat from *** and observe how the internal structure (apriori & tables)
# and the accuracy change

# CLASSIFICATION METHOD 2: SUPPORT VECTOR MACHINES
iris_svm <- svm(iris[iris_train,1:4], iris[iris_train,5])
# alternative way of using the "svm" function:
# the 1st argument is a formula of the form "OutputAttribute ~ InputAttribute1 + InputAttribute2 + ..."
# the 2nd argument is a training set containing both the input attributes and the labels
iris_svm <- svm(Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width, iris[iris_train,])
# or the previous notation
iris_svm = svm(iris[iris_train,1:4], iris[iris_train,5])
# look at the internal structure of the SVM model (use autocompletion)
# iris_svm$...
res = predict(iris_svm, iris[iris_test,1:4])
cm_iris = table(res, iris[iris_test,5])
# again, try with different train/test splits and compute the accuracy
correct = sum(diag(cm_iris))
accuracy = correct / sum(cm_iris)

# CLASSIFICATION METHOD 3: DECISION TREES (INFORMATION-GAIN BASED MODEL)
# using the rpart package: http://cran.r-project.org/web/packages/rpart/rpart.pdf
install.packages("rpart", dependencies = TRUE)
library(rpart)
# the splitting index can be gini or information; it defaults to the gini index
iris_dt <- rpart(Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width,
                 data = iris[iris_train,], method = 'class')
# to use information gain:
# iris_dt <- rpart(Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width,
#                  data = iris, method = 'class', parms = list(split = "information"))

# explore the model
printcp(iris_dt)   # prints a table of optimal prunings based on a complexity parameter
summary(iris_dt)   # summary of the decision tree with the splitting decisions
plotcp(iris_dt)
x11(); plot(iris_dt)          # plot the decision tree graphically
text(iris_dt, use.n = TRUE)   # add labels to the decision tree
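# OPTIONAL (not part of the original exercise): a minimal pruning sketch.
# It assumes the standard rpart cptable columns ("CP", "xerror") and uses rpart's
# prune() to cut the tree back to the complexity parameter with the lowest
# cross-validated error reported by printcp() above; on iris the tree is already
# small, so the pruned tree may be identical.
best_cp = iris_dt$cptable[which.min(iris_dt$cptable[, "xerror"]), "CP"]
iris_dt_pruned = prune(iris_dt, cp = best_cp)
plot(iris_dt_pruned); text(iris_dt_pruned, use.n = TRUE)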
# USE THE MODEL
# (optionally draw a new random train/test split first)
# iris_test = sample(1:nrow(iris), 50)
# iris_train = setdiff(1:nrow(iris), iris_test)
# The "type" parameter of predict() defines the output format:
# with type = 'class' the output is a vector of predicted classes;
# with type = 'raw' the output contains, for each test item, a membership score for each class.
# example with type = 'raw'
# predict(iris_dt, iris[iris_test,1:4], type = 'raw')
#    setosa versicolor  virginica
# 23      1 0.00000000 0.00000000
# 6       1 0.00000000 0.00000000
# 56      0 0.90740741 0.09259259
# example with type = 'class'
# predict(iris_dt, iris[iris_test,1:4], type = 'class')
# [1] setosa setosa versicolor ...
res = predict(iris_dt, iris[iris_test,1:4], type = 'class')
cm_iris = table(res, iris[iris_test,5])
correct = sum(diag(cm_iris))
accuracy = correct / sum(cm_iris)

# LOAD THE ELECTIONS DATASET AND EXPERIMENT YOURSELF (we have already seen it) ...
# https://archive.ics.uci.edu/ml/datasets/Congressional+Voting+Records
install.packages("mlbench")
library(mlbench)
data(HouseVotes84)
str(HouseVotes84)   # warning: missing values!
HV_test = sample(1:nrow(HouseVotes84), 150)
HV_train = setdiff(1:nrow(HouseVotes84), HV_test)
# formula notation: the Class field contains the labels,
# and the attributes are the remaining fields of the data frame
HV_bayes = naiveBayes(Class ~ ., HouseVotes84[HV_train,])
# equivalent to the x/y notation used earlier
HV_bayes = naiveBayes(x = HouseVotes84[HV_train,2:17], y = HouseVotes84[HV_train,1])
res = predict(HV_bayes, HouseVotes84[HV_test,], type = "class")
cm_housevotes = table(res, HouseVotes84[HV_test,1])
correct = sum(diag(cm_housevotes))
accuracy = correct / sum(cm_housevotes)

###### PART 2: CLUSTERING ######

install.packages("fpc", dependencies = TRUE)
library(fpc)

# let's make a test: delete the class labels from iris
unsupervised_iris = iris[,1:4]

# CLUSTERING: k-means
# look at the options of the kmeans function
?kmeans
# try to use it
res = kmeans(unsupervised_iris, 3)
# res$...   (explore the model fields with autocompletion)
# let's check the "predictive" power
table(res$cluster, iris[,5])
# that's unfair: we knew from the beginning that there were 3 clusters in the dataset!
# try with different k values, and score the results:
# compute inter-cluster separation and intra-cluster cohesion for different k;
# the k with the best tradeoff is chosen (see the sketch at the end of the script)
res$withinss
res$betweenss
plotcluster(unsupervised_iris, res$cluster)

# CLUSTERING: density based
?dbscan
res = dbscan(unsupervised_iris, eps = 0.4)
# res$...   (explore the model fields with autocompletion)
table(res$cluster, iris[,5])
plotcluster(unsupervised_iris, res$cluster)
# try with different eps and MinPts values, and score the results
# eps = maximum radius of the neighborhood
# MinPts = minimum number of points in an eps-neighborhood of a point
# dbscan can automatically plot the produced clusters
x11(); res = dbscan(unsupervised_iris, MinPts = 5, eps = 0.4, showplot = 1)
# white points are outliers

# CLUSTERING: hierarchical
?hclust
?dist
d = dist(unsupervised_iris)
res = hclust(d, method = "complete")
# complete-link method: the distance between two clusters is the largest distance
# between an element in one cluster and an element in the other
# res$...   (explore the model fields with autocompletion)
plot(res)
cluster = cutree(res, h = 4)   # cut the dendrogram at a certain height
table(cluster, iris[,5])
cluster = cutree(res, k = 3)   # cut the dendrogram into a certain number of clusters
table(cluster, iris[,5])
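# OPTIONAL (not part of the original lab): a minimal sketch of scoring k-means
# over several k values, as suggested in the k-means section above.
# For each k it records the total within-cluster sum of squares (cohesion) and the
# between-cluster sum of squares (separation); the "elbow" of the within-cluster
# curve is a common heuristic for choosing k. The range 2:8 and nstart = 10 are
# just illustrative choices.
ks = 2:8
scores = sapply(ks, function(k) {
  fit = kmeans(unsupervised_iris, centers = k, nstart = 10)
  c(within = fit$tot.withinss, between = fit$betweenss)
})
plot(ks, scores["within",], type = "b", xlab = "k", ylab = "total within-cluster SS")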