# Utility functions to process and analyze graphs # July 2014: added levels.graph, compute.flipped.graph, get.parents.top.down, get.children.bottom.up, get.parents, build.children # August 2014: added build.descendants, build.descendants.per.level, build.ancestors, build.ancestors.per.level # September 2014: added get.parents.bottom.up, get.children.top.down, build.ancestors.bottom.up and build.descendants.bottom.up # added check.hirarchy.single.sample, constraints.matrix # February 2016: added weighted.adjacency.matrix, specific.annotation.list, specific.annotation.matrix, full.annotation.matrix, # check.annotation.matrix.integrity, check.DAG.integrity, do.submatrix, do.subgraph # April 2016: correct bug in weighted.adjacency.matrix, add do.unstratified.cv.data # April 2016: added transitive.closure.annotations and modified full.annotation.matrix: might happen that there are same HPO IDs that # are classified as "obsolete" in obo file, but that still exist in the annotation file (e.g. hpo-DAG Build #1700 and hpo-ann Build #110) # May 2016: added do.my.subgraph. N.B.: do.my.subgraph is 0.6 sec slower than do.subgraph ( Tested on 2016-04-01 HPO obo and annotation release; # see the note below the function). # May 2016: root.node and find.leaves up to date and new function root.or.leaf added. Briefly, leaves are nodes with outdegree equal to zero (i.e. nodes # having no out-coming edges); root is the node with indegree equal to zero (i.e. node having no in-coming edge). library("graph"); library("RBGL"); #library("Rgraphviz"); # Function to group a set of nodes in according to their maximum depth in the graph. It simply inverts the weights of the graph # and then applies the Bellman Ford algorithm to find the shortest path, achieving in this way the longest path # Input: # g: a graph of class graphNEL # root : name of # the root node (def. 00) # Output: # a list of the nodes grouped w.r.t. the distance from the root: the first element of the list corresponds to the root node (level 0), # the second to nodes at maximum distance 1 (level 1), the third to the node at maximum distance 3 (level 2) and so on. levels.graph <- function(g,root="00") { ed <- edges(g); ew <- edgeWeights(g); # si cambia il segno ai pesi degli archi senza considerare i nodi foglia (nodi che non hanno figli) # GV: il codice e' corretto ma semplicemente escludi i nodi senza archi, i nodi foglia hanno invece almeno 1 arco. for(i in 1:length(ed)){ l <- length(ew[[i]]); if(l!=0) ew[[i]][1:l] <- -1; } edL <- vector(mode="list", length=length(ed)); names(edL) <- names(ed); for(i in 1:length(ed)){ edL[[i]] <- list(edges=ed[[i]], weights=ew[[i]]); } G <- graphNEL(nodes=nodes(g), edgeL=edL, edgemode="directed"); # grafo con i pesi degli archi negativi depth.G <- bellman.ford.sp(G,root)$distance; depth.G <- -depth.G # si settano le distanze positive levels <- vector(mode="list", length=max(depth.G)+1); names(levels) <- paste(rep("level", max(depth.G)+1), 0:max(depth.G), sep="_"); for(i in 1:(max(depth.G)+1)){ levels[[i]] <- names(which(depth.G==i-1)); # il livello 0 corrisponde al nodo root } return(levels); } # It computes a directed graph with edges in the opposite direction. # Input: # g : a graphNEL directed graph # Output: # a graph with edges in the opposite direction w.r.t. to g. compute.flipped.graph <- function(g){ ed <- edges(g); ndL <- vector(mode="list", length=length(ed)); names(ndL) <- names(ed); for(i in 1:length(ed)){ children <- ed[[i]]; parent <- names(ed[i]); if(length(children)!=0){ for(j in 1:length(children)){ ndL[[children[j]]] <- c(ndL[[children[j]]],parent); # memorizzo per ogni nodo i rispettivi genitori } } } for (i in 1:length(ndL)){ ndL[[i]] <- list(edges=ndL[[i]]); } og <- graphNEL(nodes=nodes(g), edgeL=ndL, edgemode="directed"); return(og); } # N.B.: computa la stessa cosa di compute.flipped.graph: da valutare qual e' la versione piu' veloce ... # It computes a directed graph with edges in the opposite direction. # Input: # g : a graphNEL directed graph # Output: # a graph with edges in the opposite direction w.r.t. to g. compute.opposite.graph <- function(g){ # if ( !identical(g@edgemode,"directed")) # stop("compute.opposite.graph: the graph must be directed"); v <- nodes(g); edge.list <- vector("list", length = length(v)); names(edge.list) <- v; #browser(); l <- adj(g, v); for (i in 1:length(l)) { x <- l[[i]]; if (length(x)>0){ for (j in 1:length(x)){ edge.list[[x[j]]] <- c(edge.list[[x[j]]], names(l[i])); } } } for (i in 1:length(edge.list)){ edge.list[[i]] <- list(edges=edge.list[[i]]); } g.opposed <- graphNEL(nodes = v, edgeL = edge.list, edgemode = "directed"); return(g.opposed); } # Function to compute the parents of each node of a graph # Input: # g: a graph of class graphNEL. It represents the hierarchy of the classes. # root : name of the root node (def. 00) # Output: # a named list of character vectors. Each component corresponds to a node x of the graph (i.e. child node) and its vector is # the set of its parents (the root node is not included) get.parents <- function(g, root="00") { nd <- nodes(g) ndL <- vector(mode="list", length=length(nd)); names(ndL) <- nd; ed <- edges(g); for(i in 1:length(ed)){ children <- ed[[i]]; parent <- names(ed[i]); if(length(children)!=0){ for(j in 1:length(children)){ ndL[[children[j]]] <- c(ndL[[children[j]]],parent); # memorizzo per ogni nodo i rispettivi genitori } } } ndL <- ndL[-which(names(ndL)==root)]; # si elimina il nodo root return(ndL); } # Function to compute the parents of each node of a graph by a per-level visiting from top to bottom # Input: # g: a graph of class graphNEL. It represents the hierarchy of the classes. # levels: a list of character vectors. Each component rapresents a graph level and the elements of any component correspond to nodes. # The level 0 coincides with the root node. It can be obtained by calling levels.graph function. # root : name of the root node (def. 00) # Output: # a named list of character vectors. Each component corresponds to a node x of the graph (i.e. child node) and its vector is # the set of its parents. The nodes order follows the levels of the graph from root (exluded) to leaves. get.parents.top.down <- function(g,levels, root="00") { ord.nd <- unlist(levels); # ordered character vector con i nomi dei nodi ordinati dalla root (inclusa) ai nodi foglia ndL <- vector(mode="list", length=length(ord.nd)); names(ndL) <- ord.nd; ed <- edges(g); for(i in 1:length(ed)){ children <- ed[[i]]; parent <- names(ed[i]); if(length(children)!=0){ for(j in 1:length(children)){ ndL[[children[j]]] <- c(ndL[[children[j]]],parent); # memorizzo per ogni nodo i rispettivi genitori } } } ndL <- ndL[-which(names(ndL)==root)]; # si elimina il nodo root return(ndL); } # Function to compute the parents of each node of a graph by a per-level visiting from bottom to top # Input: # g: a graph of class graphNEL. It represents the hierarchy of the classes. # levels: a list of character vectors. Each component rapresents a graph level and the elements of any component correspond to nodes. # The level 0 coincides with the root node. It can be obtained by calling levels.graph function. # root : name of the root node (def. 00) # Output: # a named list of character vectors. Each component corresponds to a node x of the graph (i.e. child node) and its vector is # the set of its parents. The nodes are ordered from leaves to root (excluded). get.parents.bottom.up <- function(g,levels, root="00") { flip.ord.nd <- rev(unlist(levels)); # ordered character vector con i nomi dei nodi ordinati dai nodi foglia alla root (inclusa) ndL <- vector(mode="list", length=length(flip.ord.nd)); names(ndL) <- flip.ord.nd; ed <- edges(g); for(i in 1:length(ed)){ children <- ed[[i]]; parent <- names(ed[i]); if(length(children)!=0){ for(j in 1:length(children)){ ndL[[children[j]]] <- c(ndL[[children[j]]],parent); # memorizzo per ogni nodo i rispettivi genitori } } } ndL <- ndL[-which(names(ndL)==root)]; # si elimina il nodo root return(ndL); } # Function to compute the descendants of each node of a graph # Input : # g: a graph of class graphNEL. It represents the hierarchy of the classes. # Output: # a named list of vectors. Each component corresponds to a node x of the graph, and its vector is the set of its descendants including also x. build.descendants <- function(g) { name.nodes <- nodes(g); g2 <- transitive.closure(g); desc <- edges(g2); for(x in name.nodes){ desc[[x]] <- c(desc[[x]],x); } return(desc); } # Function to compute the descendants of each node of a graph by a per-level visiting, from top to bottom # Input: # g: a graph of class graphNEL. It represents the hierarchy of the classes. # levels: a list of character vectors. Each component rapresents a graph level and the elements of any component correspond to nodes. # The level 0 coincides with the root node. It can be obtained by calling levels.graph function. # Output: # a named list of vectors. Each component corresponds to a node x of the graph and its vector is the set of its descendants including also x. # The nodes are ordered from root (included) to leaves. build.descendants.per.level <- function(g,levels) { ord.nd <- unlist(levels); g2 <- transitive.closure(g); desc <- edges(g2)[ord.nd]; for(x in ord.nd){ desc[[x]] <- c(desc[[x]],x); } return(desc); } # Function to compute the descendants of each node of a graph by a per-level visiting, from bottom to top # Input: # g: a graph of class graphNEL. It represents the hierarchy of the classes. # levels: a list of character vectors. Each component rapresents a graph level and the elements of any component correspond to nodes. # The level 0 coincides with the root node. It can be obtained by calling levels.graph function. # Output: # a named list of vectors. Each component corresponds to a node x of the graph and its vector is the set of its descendants including also x. # The nodes are ordered from root (included) to leaves. build.descendants.bottom.up <- function(g,levels) { flip.ord.nd <- rev(unlist(levels)); g2 <- transitive.closure(g); desc <- edges(g2)[flip.ord.nd]; for(x in flip.ord.nd) desc[[x]] <- c(desc[[x]],x); return(desc); } # Function to compute the children of each node of a graph # Input : # g : a graph of class graphNEL. It represents the hierarchy of the classes. # Output: # a named list of vectors. Each component corresponds to a node x of the graph, and its vector is the set of its children build.children <- function(g) { return(edges(g)); } # Function to compute the children of each node of a graph by a per-level visiting, from top to bottom # g: a graph of class graphNEL. It represents the hierarchy of the classes. # levels: a list of character vectors. Each component rapresents a graph level and the elements of any component correspond to nodes. # The level 0 coincides with the root node. It can be obtained by calling levels.graph function. # Output: # a named list of character vectors. Each component corresponds to a node x of the graph (i.e. parent node) and its vector is # the set of its children. The nodes are ordered from root (included) to leaves. get.children.top.down <- function(g,levels){ child <- build.children(g) nd <- c(); for(i in 1:length(levels)){ level.nodes <- levels[[i]]; nd <- append(nd,child[level.nodes]); } return(nd); } # Function to compute the children of each node of a graph by a per-level visiting, from bottom to top # g: a graph of class graphNEL. It represents the hierarchy of the classes. # levels: a list of character vectors. Each component rapresents a graph level and the elements of any component correspond to nodes. # The level 0 coincides with the root node. It can be obtained by calling levels.graph function. # Output: # a named list of character vectors. Each component corresponds to a node x of the graph (i.e. parent node) and its vector is # the set of its children. The nodes are ordered from leaves (included) to root. get.children.bottom.up <- function(g,levels){ ed <- edges(g); # in alternativa si puo' usare la funzione build.children (vedere file "graph.utils" Valentini); GV: no e' meglio usare edges nd <- c(); n.levels <- length(levels); for(i in n.levels:1){ level.nodes <- levels[[i]]; nd <- append(nd,ed[level.nodes]); } return(nd); } # Function to compute the ancestor of each node of a graph # Input: # g: a graph of class graphNEL. It represents the hierarchy of the classes. # Output: # a named list of vectors. Each component corresponds to a node x of the graph, # and its vector is the set of its ancestors including also x. build.ancestors <- function(g){ og <- compute.flipped.graph(g); names.nodes <- nodes(og); og2 <- transitive.closure(og); anc <- edges(og2); for(x in names.nodes){ anc[[x]] <- c(anc[[x]],x); } return(anc); } # MN: the same of above, but the input parameter 'itself' is added. # It's a boolean value: if TRUE in the output list the node x is included in the ancestors of node x, FALSE otherwise build.ancestors2 <- function(g,itself=TRUE){ og <- compute.flipped.graph(g); names.nodes <- nodes(og); og2 <- transitive.closure(og); anc <- edges(og2); if(itself){ for(x in names.nodes){ anc[[x]] <- c(anc[[x]],x); } return(anc); }else{ return(anc); } } # Function to compute the ancestors of each node of a graph per level # Input : # g: a graph of class graphNEL. It represents the hierarchy of the classes. # levels: a list of character vectors. Each component rapresents a graph level and the elements of any component correspond to nodes. # The level 0 coincides with the root node. It can be obtained by calling levels.graph function. # root: name of the class that is on the top-level of the hierarchy (def:"00") # Output: # a named list of vectors. Each component corresponds to a node x of the graph and its vector is the set of its ancestors including also x. # The nodes are ordered from root (included) to leaves. build.ancestors.per.level <- function(g,levels, root="00"){ og <- compute.flipped.graph(g); ord.nd <- unlist(levels); og2 <- transitive.closure(og); anc <- edges(og2)[ord.nd]; for(x in ord.nd) anc[[x]] <- c(anc[[x]],x); return(anc); } # Function to compute the ancestors of each node of a graph from bottom to top # Input : # g: a graph of class graphNEL. It represents the hierarchy of the classes. # levels: a list of character vectors. Each component rapresents a graph level and the elements of any component correspond to nodes. # The level 0 coincides with the root node. It can be obtained by calling levels.graph function. # root: name of the class that is on the top-level of the hierarchy (def:"00") # Output: # a named list of vectors. Each component corresponds to a node x of the graph and its vector is the set of its ancestors including also x. # The nodes are ordered from leaves to root (included). build.ancestors.bottom.up <- function(g,levels, root="00"){ og <- compute.flipped.graph(g); flip.ord.nd <- rev(unlist(levels)); og2 <- transitive.closure(og); anc <- edges(og2)[flip.ord.nd]; for(x in flip.ord.nd) anc[[x]] <- c(anc[[x]],x); return(anc); } # Function to find the node that is on the top-level of hierarchy (i.e. root node) # Input: # g : a graph of class graphNEL. It represents the hierarchy of the classes. # Output: # name of the root node root.node <- function(g){ d <- degree(g); root <- names(which(d$inDegree==0)); return(root); } root.node.old <- function(g){ og <- compute.flipped.graph(g); root <- find.leaves(og); return(root); } # Function to compute the leaves of a graph # Input : # g : a graph of class graphNEL. It represents the hierarchy of the classes. # Output: # a vector with the names of the leaves of g find.leaves <- function(g){ d <- degree(g); leaves <- names(which(d$outDegree==0)); return(leaves); } find.leaves.old <- function(g){ child <- edges(g); leaves <- c(); for(i in 1:length(child)){ if(length(child[[i]])==0) leaves <- append(leaves,names(child[i])); } return(leaves); } # Function to find root and leaves using in/out degree. The graph must be direct. # Leaves are nodes with outdegree equal to zero; root is the node with indegree equal to zero. # Input : # g : a graph of class graphNEL. It represents the hierarchy of the classes. It must be a direct graph. # deg: can assume two values: 1. "indeg": nodes have no in coming edges are returned: root node is found. # 2. "outdeg": nodes have no out coming edges are returned: leaves nodes are found. # Output: # a vector with the names of the leaves or the root name of g root.or.leaf <- function(g,deg){ d <- degree(g); if(deg=="indeg"){ root <- names(which(d$inDegree==0)); return(root); } if(deg=="outdeg"){ leaves <- names(which(d$outDegree==0)); return(leaves); } } # Function to compute the distances of each node of the graph from the leaves # It returns the minimum distance of each node from one of the leaves of the graph # Input : # g : a graph of class graphNEL. It represents the hierarchy of the classes. # Output: # a named vector. The names are the names of the nodes of the graph g, and their values represent the distance from the leaves. # A value equal to 0 is assigned to the leaves, 1 to nodes with distance 1 from a leaf and so on. distances.from.leaves <- function(g){ leaves <- find.leaves(g); n.leaves <- length(leaves); og <- compute.flipped.graph(g); og <- addNode("root", og) # for (x in leaves) # og = addEdge("root", x, og, 1); og <- addEdge(rep("root",n.leaves), leaves, og, rep(1,n.leaves)); dist <- acc(og,"root")[[1]]-1; return(dist); } # It computes the pairwise constraints matrix which needs to have 2 columns and as many rows as constraints. Referring to DAG this matrix must # define a partial order. The specification is always as follow: element column 2 \leq element column 1, e.g the first row states x_1 \leq x_2. # It follows that the first column includes the set of the children nodes whereas the second column includes the set of parents nodes. # In the constraints matrix the children and the parents nodes are specified by a number and not by their names. # Input: # g: a graph of class graphNELL. Represent the hierarchy of the class # Output: # a constraints matrix w.r.t the graph g constraints.matrix <- function(g){ eM <- t(edgeMatrix(g)); eM <- cbind(eM[,2],eM[,1]); nd <- nodes(g); dimnames(eM) <- list(nd[eM[,2]], c("child","parent")) return(eM); } #GV # It is equivalent to the previous one but no transposition is needed ... This could be faster on big graphs. constraints.matrix2 <- function(g){ eM <- edgeMatrix(g); eM <- cbind(eM[2,],eM[1,]); nd <- nodes(g); dimnames(eM) <- list(nd[eM[,2]], c("child","parent")) return(eM); } constraints.matrix.old <- function(g){ edL <- edgeL(g); parent <- c(); for(i in 1:length(edL)) { x <- rep(i,length(edL[[i]][[1]])); parent <- append(parent,x); } child <-c(); for(i in 1:length(edL)){ child <- append(child,edL[[i]][[1]]); } eM <- cbind(child,parent); nd <- nodes(g) rownames(eM) <- nd[parent]; return(eM); } # Conctruction of Weighted Adjacency Matrix # Input: # file : name of the text file to be read. The format of the file is a sequence of rows. # Each row corresponds to an edge represented through a pair of vertices separated by blanks and the weight of the edge e.g.: # nodeX nodeY score # nodeZ nodeK score # and so on... # nodename: boolean value: TRUE if the names of nodes are gene symbol (i.e. characters) # FALSE if the names of the nodes are entrez gene ID (i.e. integer numbers) # Note: this boolean value is needed to get names of rows and columns of wadj matrix sorted in ascending order, both if they are # characters or numbers. Indeed if names of rows (or colums) are number but they are stored as characters, the default "sort" function in R # sorts the characters-like-numebers by considering only the first charachter (a kind of alphabethic order). For instances: # x <- as.character(c(10,1,9)) --> sort(x) = "1" "10" "9". # This can be easily solved: 1.transforming numbers-like-characters as integers # 2.sorting integers # 3.transforming sorted integers numbers again as characters # Output: # a named symmetric adjacency weighted matrix of the graph weighted.adjacency.matrix <- function(file="edges.txt", nodename=TRUE) { m <- as.matrix(read.table(file, colClasses="character")); if(nodename){ nodes <- sort(unique(as.vector(m[,1:2]))); #unique nodes }else{ nodes <- as.character(sort(as.numeric(unique(as.vector(m[,1:2]))))); } n.nodes <- length(nodes); # building the adjacency matrix W <- matrix(0, nrow=n.nodes, ncol=n.nodes); dimnames(W) <- list(nodes,nodes); W[cbind(m[,1], m[,2])] <- as.numeric(m[,3]); W[cbind(m[,2], m[,1])] <- as.numeric(m[,3]); return(W); } # NOTA MN: *BUG* della funzione: definita una una coppia di vertici come intA e intB (i.e. interattore A e B) la funzione sottostante considera # come nodi univoci solo quelli di intA. Questa funzione ritorna un output corretto solo se il numero univoco di intA eguaglia quello di intB. # Ad esempio nel database String (9.1) il numero univoco di intA = intB, mentre nel db Biogrid (3.2.106) il numero univoco di intA ≠ intB. # Di conseguenza si perderà informazione sul numero totale univoco dei nodi. # Conctruction of Weighted Adjacency Matrix # Input: # file : name of the text file to be read. The format of the file is a sequence of rows. # Each row corresponds to an edge represented through a pair of vertices separated by blanks and the weight of the edge e.g.: # nodeX nodeY score # nodeZ nodeK score # and so on... # nodename: boolean value: if TRUE the names of nodes are characters (i.e. gene symbol), # if FALSE the names of the nodes are integer number (i.e. entrez gene ID) # Output: # a named symmetric adjacency weighted matrix of the graph weighted.adjacency.matrix.BUG <- function(file="edges.txt", nodename=TRUE) { m <- read.table(file, colClasses=c("character","character","numeric")); if(nodename){ nodes <-sort(unique(m[,1])); # unique Nodes ##MN: BUG is here!! we consider only the unique node of intA!! }else{ nodes <-sort(unique(as.numeric(m[,1]))); # unique nodes nodes <- as.character(nodes); } n.nodes <- length(nodes); # building the adjacency matrix W <- matrix(0, nrow=n.nodes, ncol=n.nodes); dimnames(W) <- list(nodes,nodes); W[cbind(m[,1], m[,2])] <- m[,3]; W[cbind(m[,2], m[,1])] <- m[,3]; return(W); } # Function to construct the matrix of the most specific annotations # Input: # file: text file representing the most specific associations gene-HPO term (def: "gene2pheno.txt"). # file format: The file must be written as sequence of rows. Each row represent a gene and *all* its # associations with abnormal phenotype separated by blanks, e.g.: # gene_1phen1...phen_N # . . # . . # gene_Mphen1...phen_N # Note: this input file can be get parsing the file "ALL_SOURCES_ALL_FREQUENCIES_genes_to_phenotype.txt" with the perl sybroutine "Do.HPO.edges.pl" # Output: # the annotation matrix of the most specific annotations (0/1): rows are genes and colums are HPO terms. specific.annotation.matrix <- function(file="gene2pheno.txt"){ line <- readLines(file); tmp <- strsplit(line, split="[ +\t]"); gene.names <- c(); for(i in 1:length(tmp)) gene.names <- c(gene.names,tmp[[i]][1]); ann.list <- list(); for(i in 1:length(tmp)) ann.list[[i]] <- tmp[[i]][-1]; names(ann.list) <- gene.names; hpoID <- unique(unlist(ann.list)); n.genes <- length(gene.names); n.hpoID <- length(hpoID); m <- matrix(integer(n.genes * n.hpoID), nrow=n.genes); rownames(m) <- gene.names; colnames(m) <- hpoID; for (i in gene.names){ spec.ann <- ann.list[[i]]; m[i, spec.ann] <- 1; } m <- m[,sort(colnames(m))]; return(m); } ## older and slight slower version.. specific.annotation.matrix.OLD <- function(file="gene2pheno.txt"){ line <- readLines(file); tmp <- strsplit(line, split="[ +\t]"); gene <- c(); for(i in 1:length(tmp)) gene <- c(gene,tmp[[i]][1]); ann.list <- list(); for(i in 1:length(tmp)) ann.list[[i]] <- tmp[[i]][-1]; names(ann.list) <- gene; hpo <- c(); for(i in 1:length(ann.list)) hpo <- c(hpo,ann.list[[i]]); hpo <- unique(hpo); ann <- matrix(0,length(gene),length(hpo)); dimnames(ann) <- list(gene,hpo); for(gn in rownames(ann)) ann[gn,ann.list[[gn]]] <- 1; ann <- ann[,sort(colnames(ann))]; return(ann); } # Function to construct a list of the most specific annotations form the table of the most specific annotations # Input: # ann : annotation matrix. Rows are examples and columns are most specific terms. It must be a named matrix # Output: # a list : names correspond to examples (genes). Elements are the most specific classes associated to that genes. specific.annotation.list <- function(ann){ ann.list <- apply(ann, 1, function(gene){ terms <- which(gene==1); return(names(gene[terms])) }); return(ann.list); } # Function to perform the transitive closure of the annotations using the most specific annotations and its ancestors. # The annotations are propagated from bottom to top, enriching the most specific annotations table. # The rows of the matrix correspond to the genes of the most specific annotation table and the colums to terms/classes # Input: # ann.spec: the annotation matrix of the most specific annotations (0/1): rows are genes and colums are HPO terms. # It can be obtained by calling specific.annotation.matrix function # anc: list of the ancestors of the ontology. It can be obtained by calling build.ancestors function # Output: # an annotation table T: rows correspond to genes and columns to HPO terms. T[i,j]=1 means that gene i is annotated for the term j, # T[i,j]=0 means that gene i is not annotated for the term j. transitive.closure.annotations <- function(ann.spec, anc){ ## costructiion of annotation list ann.list <- specific.annotation.list(ann.spec); ## cotruction the full empty annotation matrix entrezIDs <- rownames(ann.spec); n.genes <- length(entrezIDs); HPOIDs <- names(anc); n.HPOID <- length(anc); hpo.ann <- matrix(numeric(n.HPOID * n.genes), nrow=n.genes, ncol=n.HPOID); #empty label matrix dimnames(hpo.ann) <- list(entrezIDs,HPOIDs); ## fill the full empty annotation matrix with the most specific annotation hpo.spec.term <- colnames(ann.spec); # the most specific hpo terms # might happen that there are same HPO IDs that are classified as "obsolete" in obo file, but that still exist in the annotation file hpo.spec.term.sel <- HPOIDs[HPOIDs %in% hpo.spec.term]; # removing obosolete HPO terms... hpo.ann[entrezIDs,hpo.spec.term.sel] <- ann.spec[,hpo.spec.term.sel]; ## transitive closure: annotation propagation from the most specific nodes to all its ancestors for (i in entrezIDs){ spec.ann <- ann.list[[i]]; all.anc <- lapply(spec.ann, function(x) return(anc[[x]])); all.anc <- unique(unlist(all.anc)); hpo.ann[i, all.anc] <- 1; # setting the annotations derived by transitive closure } ## remove HPO empty terms hpo.ann <- hpo.ann[,colSums(hpo.ann)!=0]; #x <- apply(hpo.ann,2,sum); #ind.hpo.id <- which(x==0); #hpo.ann <- hpo.ann[,-ind.hpo.id]; return(hpo.ann); } # Function to construct the full annotations table using the most specific annotations and its ancestors w.r.t. a given weighted adiacency matrix. # The rows of the full annotations matrix correspond to all the examples of the given weighted adiacency matrix and the columns to the class/terms. # The transitive closure of the annotations is performed. # VERY IMPORTANT NOTE: examples that are present in the annotation matrix (ann.spec) but not in the symmetric adjacency weighted matrix (W) are purged # Input: # W: symmetric adjacency weighted matrix of the graph # anc: list of the ancestors of the ontology. It can be obtained by calling build.ancestors function # ann.spec: the annotation matrix of the most specific annotations (0/1): rows are genes and colums are HPO terms. # It can be obtained by calling specific.annotation.matrix function # Output: # a full annotation table T: rows correspond to genes and columns to HPO terms. T[i,j]=1 means that gene i is annotated for the term j, # T[i,j]=0 means that gene i is not annotated for the term j. full.annotation.matrix <- function(W, anc, ann.spec){ ## costructiion of annotation list ann.list <- specific.annotation.list(ann.spec); ## cotruction the full empty annotation matrix entrezIDs <- rownames(W); n.genes <- length(entrezIDs); HPOIDs <- names(anc); n.HPOID <- length(anc); hpo.ann <- matrix(numeric(n.HPOID * n.genes), nrow=n.genes, ncol=n.HPOID); #empty label matrix dimnames(hpo.ann) <- list(entrezIDs,HPOIDs); ## fill the full empty annotation matrix with the most specific annotation entrezIDs2hpo <- rownames(ann.spec); # all genes that are associated with hpo terms entrezIDs.sel <- entrezIDs[entrezIDs %in% entrezIDs2hpo]; # genes 2 hpo terms 2 entrez id of wadj hpo.spec.term <- colnames(ann.spec); # the most specific hpo terms #might happen that there are same HPO IDs that are classified as "obsolete" in obo file, but that still exist in the annotation file (e.g. build 1233) hpo.spec.term.sel <- HPOIDs[HPOIDs %in% hpo.spec.term]; # removing obosolete HPO terms... hpo.ann[entrezIDs.sel,hpo.spec.term.sel] <- ann.spec[entrezIDs.sel,hpo.spec.term.sel]; # setting the most specific annotations ## transitive closure: annotation propagation from the most specific nodes to all its ancestors for (i in entrezIDs){ spec.ann <- ann.list[[i]]; all.anc <- lapply(spec.ann, function(x) return(anc[[x]])); all.anc <- unique(unlist(all.anc)); hpo.ann[i, all.anc] <- 1; # setting the annotations derived by transitive closure } ## remove HPO empty terms hpo.ann <- hpo.ann[,colSums(hpo.ann)!=0]; # faster way than below # x <- apply(hpo.ann,2,sum); # ind.hpo.id <- which(x==0); # hpo.ann <- hpo.ann[,-ind.hpo.id]; return(hpo.ann); } # Function to build an annotation matrix with only those terms having more than n annotations (n included). # In other words terms having less than n annotations are pruned. # Input: # hpo.ann: the full annotations matrix (0/1). It can be obtained by calling 'full.annotation.matrix' function # n: number of annotations to be pruned (n is not included) # Output: # Matrix of annotations having only those terms with more than n annotations (n included) do.submatrix <- function(hpo.ann,n){ hpo.ann.sel <- hpo.ann[,colSums(hpo.ann)>n]; return(hpo.ann.sel); } # Function to build a subgraph using only those classes present in an annotation matrix # Input: # hpo.ann: the full annotations matrix (0/1). It can be obtained by calling 'full.annotation.matrix' function # hpo.univ: a universal graph of class graphNELL. Represent the hierarchy of the whole class # Output: # a subgraph of the supplied classes and any edges between them do.subgraph <- function(hpo.ann, hpo.univ){ nodes.ann <- colnames(hpo.ann); hpo <- subGraph(nodes.ann,hpo.univ); return(hpo); } ##MN: computa la stessa cosa di do.subgraph, ma in modo leggermente più lento... ## Test: dal grafo universo si costruisce il sotto-grafo avente come nodi soli quelli che hanno almeno un associazione ad ## un fentipo patologico (termini della full annotation matrix). ## versione file obo: data-version: releases/2016-04-01 ## versione file annotazioni: Build #110 (Jan 25, 2016 7:00:22 PM) ## tempi di calcolo: ## system.time(do.subgraph(hpo.ann,hpo.univ)): 0.574 sec ## system.time(do.my.subgraph(hpo.ann,hpo.univ)): 1.154 sec do.my.subgraph <- function(hpo.ann, g){ nd <- colnames(hpo.ann); ed <- edges(g); ed.sel <- ed[nd]; ndL <- vector(mode="list", length=length(ed.sel)); names(ndL) <- names(ed.sel); for(i in 1:length(ed.sel)){ parent <- names(ed.sel[i]); children <- ed.sel[[i]]; if(length(children!=0)){ children.map <- children[children %in% nd] ndL[[i]] <- append(ndL[[i]],children.map); } } for (i in 1:length(ndL)){ ndL[[i]] <- list(edges=ndL[[i]]); } G <- graphNEL(nodes=nd, edgeL=ndL, edgemode="directed"); return(G); } # Function to assess the integrity of a full annotation table # Input: # anc: list of the ancestors of the ontology. It can be obtained by calling build.ancestors function # ann.spec: the annotation matrix of the most specific annotations (0/1): rows are genes and colums are HPO terms. # It can be obtained by calling 'specific.annotation.matrix' function # hpo.ann: the full annotations matrix (0/1). It can be obtained by calling 'full.annotation.matrix' function # Output: # If the transitive closure of the annotations is performed well (i.e. if the full annotation matrix is right) "OK" is printed, # otherwise a message error is printed check.annotation.matrix.integrity <- function(anc, ann.spec, hpo.ann){ ## costructiion of annotation list ann.list <- specific.annotation.list(ann.spec); genes <- rownames(hpo.ann); check <- c(); for (i in genes){ spec.ann <- which(hpo.ann[i,]==1); len.ann <- length(spec.ann); all.anc <- lapply(ann.list[[i]], function(x) return(anc[[x]])); all.anc <- unique(unlist(all.anc)); len.anc <- length(all.anc); cmp <- len.anc == len.ann; if(cmp==TRUE){ check <- c(check,"OK") } else { check <- c(check,"NOTOK") } } names(check) <- genes; violated <- any(check!="OK"); if(violated){ n <- names(check)[check=="NOTOK"]; cat("check.annotation.matrix: NOT_OK. Transitive closure NOT RESPECTED", "\n"); }else{ cat("check.annotation.matrix: OK", "\n"); } } # Function to assess the integrity of a DAG # If there are nodes not accessible from the root node a message error is displayed # Input: # g : a graph of class graphNEL. It represents the hierarchy of the classes. # root: name of the class that is on the top-level of the hierarchy (def:"00") # Output: # If there are no nodes not accessible from the root, then "OK" is printed, # otherwise a message error and the list of the not accessible nodes is printed. check.DAG.integrity <- function(g, root="00") { all.nodes <- nodes(g); acc.nodes <- names(acc(g,root)[[1]]); if((length(all.nodes) - length(acc.nodes)) > 1) { n <- setdiff(all.nodes,c(acc.nodes,root)); cat("check.GO.integrity: not all nodes accessible from root", "\n"); cat("Nodes not accessible from root: \n"); cat(n,"\n"); }else{ cat("OK \n") }; } # Function to check the hierarchical constraints of the prediction. It checkes whether the predictions verify the hierarchy constraints, that is # the score of a child node must not be greater than the score of its parents. # The algorithm can be applied to any DAG-structured hierarchy of classes # Input: # S.hier: the matrix with the scores of the classes corrected in according to hierarchy. # g: a graph of class graphNEL. It represents the hierarchy of the classes. # root: name of the class that is on the top-level of the hierarchy (def:"00") # Output: # a list of 3 elemets: # - Status: a single value: OK if none hierarchical constraints have bee broken, NOTOK if there is at least one hierarchical constraints broken; # - Hierarchy_Constraints_Broken: boolean values vector: for each terms/classes are possible two value: TRUE if there is some example did not # respect the hierarchical constraints; FALSE if none sample broke the hierarchical constraints. # - Hierarchy_costraints_satisfied: how many terms satisfied the hierarchical costraint. check.hierarchy <- function(S.hier,g, root="00"){ # si aggiunge una "dummy root column" se non presente nella matrice S.hier if(!(root %in% colnames(S.hier))){ max.score <- max(S.hier); z <- rep(max.score,nrow(S.hier)); S.hier <- cbind(z,S.hier); colnames(S.hier)[1] <- root; } par <- get.parents(g,root); v <- c() for(i in 1:length(par)){ child <- S.hier[,names(par[i])]; parents <- S.hier[,par[[i]]] x <- parents >= child # in alternativa: child <= parents y <- any(x==0) # check hierarchy constraints v <- append(v,y) } names(v) <- names(par) violated <- any(v==TRUE); if(violated) Status = "NOTOK" else Status = "OK"; h <- as.factor(v); k <- summary(h); l <- list(Status=Status, hierarchy.constraints.broken=v, hierarchy.costraints.satisfied=k); return(l); } #MN: The same function of above, but istead to get in input the whole hierarchical score matrix, receives in input # the vector of scores relative to a single example # Input: # y.hier: vector of scores relative to a single example. This must be a named numeric vector. # The names must correspond to the names of the classes of the hierarchy # g: a graph of class graphNEL. It represents the hierarchy of the classes. # root: name of the class that is on the top-level of the hierarchy (def:"00") # Output: # a list of 3 elemets: # - Status: a single value: OK if none hierarchical constraints have bee broken, NOTOK if there is at least one hierarchical constraints broken; # - Hierarchy_Constraints_Broken: boolean values vector: for each terms/classes are possible two value: TRUE if there is some example did not # respect the hierarchical constraints; FALSE if none sample broke the hierarchical constraints. # - Hierarchy_costraints_satisfied: how many terms satisfied the hierarchical costraint. check.hierarchy.single.sample <- function(y.hier,g, root="00"){ # si aggiunge il root node se non presente nel vettore y.hier if(!(root %in% names(y.hier))){ max.score <- max(y.hier); y.hier <- c(max.score,y.hier); names(y.hier)[1] <- root; } par <- get.parents(g,root); v <- c() for(i in 1:length(par)){ child <- y.hier[names(par[i])]; parents <- y.hier[par[[i]]] x <- parents >= child # in alternativa: child <= parents y <- any(x==0) # check hierarchy constraints v <- append(v,y) } names(v) <- names(par) violated <- any(v==TRUE); if(violated) Status = "NOTOK" else Status = "OK"; h <- as.factor(v); k <- summary(h); l <- list(Status=Status, hierarchy.constraints.broken=v, hierarchy.costraints.satisfied=k); return(l); } # Function to split a dataset randomly in k fold through an unstratified way. It is useful to perform experiment with k-fold cross-validation # in the contest of hierarchical correction. This function is called inside the high-level function Do.tpr.threshold.cv. # Input: # S: matrix of the flat scores. It must be a named matrix, where rows are example (e.g. genes) and columns are classes/terms. # kk: number of folds in which to split the dataset (def: 5) # seed : seed for the random generator. If NULL (default) no initialization is performed. # Output: # a list with k components. Each element is a vector with a random subset of the name of the row of a matrix S do.unstratified.cv.data <- function(S, kk=5, seed=NULL){ if (!is.null(seed)) set.seed(seed); samples <- rownames(S); n.samples <- nrow(S); size <- c(); folds <- vector(mode="list", length=kk) names(folds) <- paste0(rep("fold",kk), 1:kk) for (k in 1:kk) { first <- ((k - 1) * n.samples) %/% kk last <- (k * n.samples) %/% kk size <- c(size, last-first) x <- sample(samples,size[k]); folds[[k]] <- x; samples <- setdiff(samples,x); } return(folds); } # NOTA GV: non ha senso riportare nei fold delle matrici: si spreca inutilmente spazio: e' sufficiente riportare solo gli indici delle righe e stop # (o al piu' i nomi delle righe). Riscrivi la funzione in modo che ritorni in ogni fold solo gli indici (o i nomi) delle # righe di S (perche' chiamarla S.Flat?). Vedi come e' implementata do.stratified.cv.data in RANKS ed usa nei fold gli # indici (che sono piu' economici) e non le matrici (che sprecano un valanga di spazio) # Function to split a dataset randomly in k fold through an unstratified way. It is useful to perform experiment with k-fold cross-validation # in the contest of hierarchical correction. This function is called inside the high-level function Do.tpr.threshold.cv. # Input: # S.flat: matrix of the flat scores. It must be a named matrix, where rows are example (e.g. genes) and columns are classes/terms. # kk: number of folds in which to split the dataset (def: 5) # seed : seed for the random generator. If NULL (default) no initialization is performed. # Output: # a list with k components where each element is a submatrix: rows are a random subset of genes (i.e. row names of the S.flat matrix) # and colums all terms. do.unstratified.cv.data.old <- function(S.flat, kk=5, seed=NULL){ if (!is.null(seed)) set.seed(seed); samples <- rownames(S.flat); n.samples <- nrow(S.flat); size <- c(); folds <- vector(mode="list", length=kk) names(folds) <- paste0(rep("fold",kk), 1:kk) for (k in 1:kk) { first <- ((k - 1) * n.samples) %/% kk last <- (k * n.samples) %/% kk size <- c(size, last-first) x <- sample(samples,size[k]); folds[[k]] <- S.flat[x,]; samples <- setdiff(samples,x); } return(folds); }