## SVM support for biomolecular diagnosis (Golub leukemia data).
##
## Loads the Golub training and test expression sets, windsorizes the
## expression values, filters uninformative genes, and trains a linear
## SVM classifier on the remaining genes.

########## Code Chunk: loadlibs ##########
library(golubEsets)
library(e1071)
library(Biobase)
# library(genefilter)

############################################################
############################################################
# 1. Data preparation for training and testing
############################################################
############################################################

# Load the training data and "windsorize" the expression matrix:
# clamp every value into the interval [Wlow, Whigh].
data(Golub_Train)
X <- exprs(Golub_Train)
Wlow <- 100
Whigh <- 16000
# NOTE(review): the extracted source was garbled here ("X[XWhigh] <- Whigh");
# restored as the standard floor/ceiling clamp implied by Wlow/Whigh above.
X[X < Wlow] <- Wlow
X[X > Whigh] <- Whigh

# Filter function: x holds the expression levels of one gene across the
# analyzed samples; the gene is kept (TRUE) when
# max(x)/min(x) > r AND max(x) - min(x) > d.
filter.fun <- function(x, r = 5, d = 500, na.rm = TRUE) {
  minval <- min(x, na.rm = na.rm)
  maxval <- max(x, na.rm = na.rm)
  (maxval / minval > r) && (maxval - minval > d)
}

# Gene selection: apply filter.fun to every row (gene) of the matrix.
sub <- apply(X, 1, filter.fun)

# Restrict the data set to the selected genes only.
X <- X[sub, ]
dim(X)
golubTrainSub <- Golub_Train[sub, ]

# Prepare the test set: same windsorizing and the SAME gene subset
# selected on the training data (no peeking at the test set).
data(Golub_Test)
Xt <- exprs(Golub_Test)
# NOTE(review): restored from garbled "Xt[XtWhigh] <- Whigh" (see above).
Xt[Xt < Wlow] <- Wlow
Xt[Xt > Whigh] <- Whigh
golubTestSub <- Golub_Test[sub, ]

############################################################
############################################################
# 2. SVM training and testing
############################################################
############################################################

## 1. Train a linear SVM: samples as rows, selected genes as features.
Xm <- t(exprs(golubTrainSub))
# $ALL partially matches the ALL/AML phenotype column (class labels)
# of the Golub ExpressionSet — TODO confirm exact column name.
resp <- golubTrainSub$ALL
svm1 <- svm(Xm, resp, type = "C-classification", kernel = "linear")

## 2. Training-set error: count and cross-tabulate misclassifications.
trpred <- predict(svm1, Xm)
sum(trpred != resp)
table(trpred, resp)
## 3. Test-set error: predict on the held-out Golub test samples
## and compare against their true labels.
Xmtr <- t(exprs(golubTestSub))
tepred <- predict(svm1, Xmtr)
sum(tepred != golubTestSub$ALL)
table(tepred, golubTestSub$ALL)

## 4. 10-fold cross-validation error on the training data.
trcv <- svm(Xm, resp, type = "C-classification", kernel = "linear",
            cross = 10)
summary(trcv)

## 5. Overfitting in high-dimensional spaces: with far more features
## than samples, even RANDOM labels (27 "A", 11 "B", shuffled over the
## 38 training samples) can be separated.
newlabs <- sample(c(rep("A", 27), rep("B", 11)), 38)
funnysvm <- svm(Xm, newlabs, type = "C-classification", kernel = "linear")

## 5.1 Excellent (near-zero) error on the training set.
fpred <- predict(funnysvm, Xm)
sum(fpred != newlabs)
table(fpred, newlabs)

## 5.2 Poor result under cross-validation, as expected for random labels.
trfcv <- svm(Xm, newlabs, type = "C-classification", kernel = "linear",
             cross = 10)
summary(trfcv)