# READ AND DISPLAY DATA ----
# header = FALSE: read.csv() defaults to header = TRUE, which would consume
# the first record as column names. The column names are assigned by hand
# below, so the file is read as pure data.
# NOTE(review): assumes the local file has no header row (as in the UCI
# distribution) -- confirm against the copy in BCW/.
bcw <- read.csv("BCW/breast-cancer-wisconsin.data", header = FALSE)
bcw[1, ]
bcw[, 1]
c(1, 2, 4)
names(bcw) <- c(
  "ID", "ClumpThick", "UCSize", "UCShape", "A5", "A6", "A7", "A8", "A9",
  "A10", "A11"
)
bcw[1, ]
bcw[, "A7"]
bcw[1:30, 1:10]

# BUILD FREQUENCY and CONTINGENCY TABLES ----
table(bcw[, 2])
table(bcw[, 2:3])

# Basic statistics with built-in functions (mean, median, min, max, range,
# variance, standard deviation, quartiles) ----
# NOTE(review): same header assumption as for bcw above -- confirm.
wdbc <- read.csv("BCW/wdbc.data", header = FALSE)
mean(wdbc[, 3])
median(wdbc[, 3])
min(wdbc[, 3])
max(wdbc[, 3])
max(wdbc[, 3]) - min(wdbc[, 3]) # range (max - min)
?var
var(wdbc[, 3])
sqrt(var(wdbc[, 3])) # standard deviation, two equivalent spellings
var(wdbc[, 3])^0.5
quantile(wdbc[, 3])
# Output recorded from an earlier run:
#     0%     25%     50%     75%    100%
# 6.9810 11.6975 13.3550 15.7800 28.1100
# NOTE(review): these values look like they were produced while the first
# record was being consumed as a header; re-run to refresh them.
quantile(wdbc[, 3], 0.11)
# Output recorded from an earlier run:
#     11%
# 10.4837

# Computation of the mode by linking built-in functions ----
# The name of the returned element is the most frequent value.
which.max(table(bcw[, 6]))

# Data visualization (histograms, boxplot, scatter plot) ----
boxplot(wdbc[, 2:ncol(wdbc)])
boxplot(bcw[, 2:ncol(bcw)])
plot(wdbc[, 3])
plot(wdbc[, 3:6])
hist(bcw[, 2])
qqplot(wdbc[, 3], wdbc[, 4])

# S programming: assignments, comparisons, conditional statements, loops ----
# Handling vectors
v <- wdbc[, 3]
v <- v - 10.0
v <- v / 10.0
w <- wdbc[, 4]
v * w # element-wise product
v %*% w # inner product (1 x 1 matrix)

# S programming: definition of functions ----
# - Example: re-implementing the mean() function
# - Homework: re-implement mean, variance, standard deviation, median

# SIMILARITY MEASURES ----
# NOTE(review): jaccard(), cat_d() and minkowski() are not defined in this
# part of the script; they are presumably provided elsewhere -- confirm
# before running this section.
a <- c(1, 0, 0, 1)
b <- c(1, 0, 0, 0)
table(a, b)
jaccard(a, b)
table(t(bcw[1, ]), t(bcw[2, ]))
cat_d(t(bcw[1, ]), t(bcw[2, ]))
minkowski(t(wdbc[1, 3:6]), t(wdbc[2, 3:6]), 2)

# PREPROCESSING ----

# Replace each occurrence of value `a` with value `b` in vector `X`.
#
# X: vector to process.
# a: value to look for.
# b: replacement value.
# Returns X with every element equal to `a` replaced by `b`. Elements that
# are NA are left untouched, and an empty X is returned unchanged.
myrepl <- function(X, a, b) {
  # which() drops NA comparisons, so NA entries never match.
  X[which(X == a)] <- b
  X
}

# Build a scalar replacement function.
#
# a: value to look for.
# b: replacement value.
# Returns a function f(x) that yields `b` when the single value x equals `a`
# and x itself otherwise (including when x is NA).
mysrepl <- function(a, b) {
  function(x) {
    if (isTRUE(x == a)) b else x
  }
}

# Apply `f` to every element of `X`, one element at a time.
#
# X: vector to process.
# f: function taking one element and returning its replacement.
# Returns X with each element replaced by f(element). Results are assigned
# back into X, so they are coerced to X's type where needed.
aapply <- function(X, f) {
  # seq_along() keeps the loop from running on an empty X.
  for (i in seq_along(X)) {
    X[i] <- f(X[i])
  }
  X
}

# SEARCH FOR MISSING VALUES IN bcw ----
# A column holding a "?" marker is read as non-numeric, so first report which
# columns are affected, then list the (row, col) positions of the markers.
# Testing is.numeric() cell by cell would flag every cell of such a column.
print(names(bcw)[!vapply(bcw, is.numeric, logical(1))])
print(which(bcw == "?", arr.ind = TRUE))
aapply(bcw[, 7], mysrepl("?", 0))

# Correlation / covariance ----

# Chi square test ----
# bt holds the observed counts; a and b are its row and column marginals
# (this reuses the names a and b from the similarity section above).
bt <- table(bcw[, 2:3])
a <- table(bcw[, 2])
b <- table(bcw[, 3])
N <- sum(a)

# Expected counts under independence: row total * column total / N.
e <- matrix(nrow = length(a), ncol = length(b))
for (i in seq_len(nrow(e))) {
  for (j in seq_len(ncol(e))) {
    e[i, j] <- a[i] * b[j] / N
  }
}

# The accumulator must start at zero before the summation loop.
chi <- 0
for (i in seq_len(nrow(e))) {
  for (j in seq_len(ncol(e))) {
    chi <- chi + (bt[i, j] - e[i, j])^2 / e[i, j]
  }
}

# Correlation and Covariance on Real Data ----