# March 2016
# April 2016: modified Do.HTD: normalization methods added
# July 2016: added PRC computed by precrec package
# July 2016: added Do.HTD.holdout

#*****************************************************************************************#
# libraries and source files to be loaded: 
 library(PerfMeas)  			   ## compute AUROC and precision at fixed recall rate
 library(precrec)				   ## compute AUPRC
 library(preprocessCore)		   ## Qnorm
 source("flat.score.norm.R")  	   ## Maxnorm
 source("graph.utils.R")  		   ## graph utility functions 
 source("F-hier.R")				   ## compute Kiritchenko-like multi-label F-scores
 source("Do.flat.normalization.R") ## high level function to compute Maxnorm e Qnorm
#*****************************************************************************************#

# High level function to compute hierarchical correction according to HTD algorithm. 
# INPUT: 
# norm: boolean value: 1.TRUE means that the flat scores matrix has already been normalized according to a normalization method (def);
#       			   2.FALSE means that the flat scores matrix has NOT been normalized yet.
# norm.type: this variable can assume three values: MaxNorm, Qnorm, NONE. There are two cases depending on norm:
#			 1.if norm==FALSE, two kinds of normalization are possible:	1. MaxNorm: each score is divided by the max of its class 
#																		2. Qnorm: quantile normalization is applied. The preprocessCore library is used.
#			 2.if norm==TRUE, set norm.type="NONE" (def);
# flat.file: name of flat scores matrix already normalized or to be normalized in according to norm.type (without rda extension);
# ann.file: name of the target labels (without rda extension). It must be  an .rda file containing the label matrix of the examples (def: ann.file)
# dag.file: name of the graph that represents the hierarchy of the classes (def dag.file) 
# flat.dir: relative path to folder where flat scores matrix (already normalized or to normalize) is stored (def flat.dir)
# ann.dir: relative path to folder where annotation matrix is stored
# dag.dir: relative path to folder where graph is stored
# flat.norm.dir: 1.if norm=FALSE, relative path where the normalized flat scores matrix is stored;
#				 2.if norm=TRUE, the flat scores matrix is already normalized, then it is set to NULL (def)
# n.round: 	number of rounding digits to be applied to the hierarchical scores matrix (def. 3). 
#			It's used for choosing the best threshold on the basis of the best F.measure (see f.criterion parameter).
# f.criterion: character. Type of F-measure to be used to select the best F.measure. There are 2 possibilities: 
#              1. "F" (default) corresponds to the harmonic mean between the average precision and recall; 
#              2. "avF" corresponds to the per-example F-score averaged across all the examples.
# hierScore.dir: relative path to folder where the matrix with the scores of the classes corrected in according to HTD algorithm is stored 
# macro.dir: relative path to folder where the class-centric measures (i.e. AUC and PxR across classes) are stored
# Fmeas.dir: relative path to folder where example-centric measures (i.e. Precision, Recall, Specificity, F-measure, Accuracy across example) are stored
# OUTPUT:
# 5 rda files stored in the respective output directories:
# - Matrix with examples on rows and classes on columns representing the hierarchical scores of the classes computed with HTD algorithm.
#	Stored in hierScore.dir folder
# - Example-centric measures computed through find.best.f from F-hier.R file. Stored in Fmeas.dir folder
# - AUC (average and per classes) computed through AUC.single.over.classes from package PerfMeas. Stored in macro.dir
# - Precision at fixed recall levels (average and per classes) computed through precision.at.multiple.recall.level.over.classes from package PerfMeas.
# 	Stored in macro.dir folder
# - PRC (average and per.class) computed by precrec package. Stored in macro.dir
Do.HTD <- function	(	norm=TRUE, norm.type= "NONE", flat.file=flat.file, ann.file=ann.file, dag.file=dag.file,
						flat.dir=flat.dir, ann.dir=ann.dir, dag.dir=dag.dir, flat.norm.dir=NULL, 
						n.round=3, f.criterion ="F", hierScore.dir="hierScore.dir/", macro.dir="macro.dir/", Fmeas.dir="Fmeas.dir/"
					){
	## Purpose: loads the graph, the flat scores and the annotations, evaluates the flat scores,
	## applies the HTD correction, evaluates the corrected scores and writes five .rda files
	## (hierarchical scores, AUC, PXR, PCM and PRC). Called only for these side effects.

	## Local helpers ############
	## Loads an .rda file into a private environment and returns the first object stored in it.
	## A private environment is used so that the loaded objects cannot overwrite local variables.
	load.rda <- function(path){
		env <- new.env();
		obj.names <- load(path, envir=env);
		return(get(obj.names[1], envir=env));
	}

	## Removes the column(s) named 'node' from matrix M, if any. The check is needed because
	## M[,-which(...)] would select NO columns at all when 'node' is not a column of M.
	## drop=FALSE keeps the result a matrix even if a single column is left.
	drop.node.column <- function(M, node){
		if(node %in% colnames(M)){
			M <- M[, -which(colnames(M)==node), drop=FALSE];
		}
		return(M);
	}

	## PRC (average and per class) computed by precrec and returned in the same format of package PerfMeas.
	## N.B. (from the original author): if there are terms with NO annotations precrec stops with an error.
	prc.over.classes <- function(target, scores){
		labels <- join_labels(target);
		res <- evalmod(scores=join_scores(scores), labels=labels, dsids=seq_len(ncol(scores)), modnames=colnames(scores));
		prc <- subset(auc(res), curvetypes == "PRC");
		per.class <- prc$aucs;
		names(per.class) <- prc$modnames;
		return(list(average=mean(prc$aucs), per.class=per.class));
	}

	## Loading Data ############
	## loading hpo dag and getting its root node
	hpo <- load.rda(paste0(dag.dir, dag.file, ".rda"));
	root <- root.node(hpo);

	## loading flat scores matrix: either the already normalized one or the one normalized here
	if(norm==TRUE){
		flat.path <- paste0(flat.dir, flat.file, ".rda");
	}else{
		Do.FLAT.scores.normalization(	norm.type=norm.type, flat.file=flat.file, ann.file=ann.file,
										dag.file=dag.file, flat.dir=flat.dir, ann.dir=ann.dir,
										dag.dir=dag.dir, flat.norm.dir=flat.norm.dir
									);
		flat.path <- paste0(flat.norm.dir, norm.type, ".", flat.file, ".rda");
	}
	S.flat <- load.rda(flat.path);
	gc();	##in order to save ram memory..

	## removing root node from flat scores matrix if it exists (done for both values of norm)
	S.flat <- drop.node.column(S.flat, root);

	## loading annotation matrix and removing root node from it (if it exists)
	hpo.ann <- load.rda(paste0(ann.dir, ann.file, ".rda"));
	gc();
	ann.no.root <- drop.node.column(hpo.ann, root);

	## Computing FLAT Performances ############
	## AUC and PxR by PerfMeas, example-centric F-measures by find.best.f, PRC by precrec
	AUC.flat <- AUC.single.over.classes(ann.no.root, S.flat, hpo, root=root)[c("average","per.class")];
	PXR.flat <- precision.at.multiple.recall.level.over.classes(ann.no.root, S.flat);
	FMM.flat <- find.best.f(ann.no.root, S.flat, n.round=n.round, f.criterion=f.criterion, verbose=FALSE);
	PRC.flat <- prc.over.classes(ann.no.root, S.flat);

	## Hierarchical Top Down Correction ####################
	S.htd <- htd(S.flat, hpo, root);

	## Computing Hier Performances ############
	AUC.htd <- AUC.single.over.classes(ann.no.root, S.htd, hpo, root=root)[c("average","per.class")];
	PXR.htd <- precision.at.multiple.recall.level.over.classes(ann.no.root, S.htd);
	FMM.htd <- find.best.f(ann.no.root, S.htd, n.round=n.round, f.criterion=f.criterion, verbose=FALSE);
	PRC.htd <- prc.over.classes(ann.no.root, S.htd);

	## Storing Results #########
	## when the normalization is performed here, norm.type is prepended to the flat file name.
	## N.B.: save() looks the objects up by name in this function, so their names must not change.
	out.name <- paste0(if(norm==TRUE) "" else paste0(norm.type, "."), flat.file, ".hierScores.htd.rda");
	save(S.htd, file=paste0(hierScore.dir, out.name), compress=TRUE);
	save(AUC.flat, AUC.htd, file=paste0(macro.dir, "AUC.", out.name), compress=TRUE);
	save(PXR.flat, PXR.htd, file=paste0(macro.dir, "PXR.", out.name), compress=TRUE);
	save(FMM.flat, FMM.htd, file=paste0(Fmeas.dir, "PCM.", out.name), compress=TRUE);
	save(PRC.flat, PRC.htd, file=paste0(macro.dir, "PRC.", out.name), compress=TRUE);
}

# High level function to correct the scores with a hierarchy according to HTD algorithm performing a classical holdout procedure
# All input parameters are the same as those of the function above, except the following:
# ind.test.set: name of the file (without rda extension) containing the indices of the examples to be used as test set.
#               The indices are used to select the rows of both the flat scores matrix and the annotation matrix.
# ind.dir: relative path to folder where ind.test.set is stored
Do.HTD.holdout <- function	(	norm=TRUE, norm.type= "NONE", flat.file=flat.file, ann.file=ann.file, dag.file=dag.file,
								ind.test.set=ind.test.set, ind.dir=ind.dir, flat.dir=flat.dir, ann.dir=ann.dir,  
								dag.dir=dag.dir, flat.norm.dir=NULL, n.round=3, f.criterion ="F", 
								hierScore.dir="hierScore.dir/", macro.dir="macro.dir/", Fmeas.dir="Fmeas.dir/"
							){
	## Purpose: same pipeline of Do.HTD, but flat scores and annotations are first restricted to the
	## examples (rows) of the test set, whose indices are loaded from paste0(ind.dir, ind.test.set, ".rda").
	## Writes five .rda files (hierarchical scores, AUC, PXR, PCM and PRC). Called only for these side effects.

	## Local helpers ############
	## Loads an .rda file into a private environment and returns the first object stored in it.
	## A private environment is used so that the loaded objects cannot overwrite local variables.
	load.rda <- function(path){
		env <- new.env();
		obj.names <- load(path, envir=env);
		return(get(obj.names[1], envir=env));
	}

	## Removes the column(s) named 'node' from matrix M, if any. The check is needed because
	## M[,-which(...)] would select NO columns at all when 'node' is not a column of M.
	## drop=FALSE keeps the result a matrix even if a single column is left.
	drop.node.column <- function(M, node){
		if(node %in% colnames(M)){
			M <- M[, -which(colnames(M)==node), drop=FALSE];
		}
		return(M);
	}

	## PRC (average and per class) computed by precrec and returned in the same format of package PerfMeas.
	## N.B. (from the original author): if there are terms with NO annotations precrec stops with an error.
	prc.over.classes <- function(target, scores){
		labels <- join_labels(target);
		res <- evalmod(scores=join_scores(scores), labels=labels, dsids=seq_len(ncol(scores)), modnames=colnames(scores));
		prc <- subset(auc(res), curvetypes == "PRC");
		per.class <- prc$aucs;
		names(per.class) <- prc$modnames;
		return(list(average=mean(prc$aucs), per.class=per.class));
	}

	## Loading Data ############
	## loading examples indices of the test set
	ind.test <- load.rda(paste0(ind.dir, ind.test.set, ".rda"));

	## loading hpo dag and getting its root node
	hpo <- load.rda(paste0(dag.dir, dag.file, ".rda"));
	root <- root.node(hpo);

	## loading flat scores matrix: either the already normalized one or the one normalized here
	if(norm==TRUE){
		flat.path <- paste0(flat.dir, flat.file, ".rda");
	}else{
		Do.FLAT.scores.normalization(	norm.type=norm.type, flat.file=flat.file, ann.file=ann.file,
										dag.file=dag.file, flat.dir=flat.dir, ann.dir=ann.dir,
										dag.dir=dag.dir, flat.norm.dir=flat.norm.dir
									);
		flat.path <- paste0(flat.norm.dir, norm.type, ".", flat.file, ".rda");
	}
	S.flat <- load.rda(flat.path);
	gc();	##in order to save ram memory..

	## removing root node from flat scores matrix if it exists (done for both values of norm)
	S.flat <- drop.node.column(S.flat, root);

	## shrinking the size of S.flat to the examples of test set (drop=FALSE: a single example stays a matrix)
	S.flat <- S.flat[ind.test, , drop=FALSE];

	## loading annotation matrix, removing root node (if it exists) and shrinking it to the examples of test set
	hpo.ann <- load.rda(paste0(ann.dir, ann.file, ".rda"));
	gc();
	ann.no.root <- drop.node.column(hpo.ann, root)[ind.test, , drop=FALSE];

	## Computing FLAT Performances ############
	## AUC and PxR by PerfMeas, example-centric F-measures by find.best.f, PRC by precrec
	AUC.flat <- AUC.single.over.classes(ann.no.root, S.flat, hpo, root=root)[c("average","per.class")];
	PXR.flat <- precision.at.multiple.recall.level.over.classes(ann.no.root, S.flat);
	FMM.flat <- find.best.f(ann.no.root, S.flat, n.round=n.round, f.criterion=f.criterion, verbose=FALSE);
	PRC.flat <- prc.over.classes(ann.no.root, S.flat);

	## Hierarchical Top Down Correction ####################
	S.htd <- htd(S.flat, hpo, root);

	## Computing Hier Performances ############
	AUC.htd <- AUC.single.over.classes(ann.no.root, S.htd, hpo, root=root)[c("average","per.class")];
	PXR.htd <- precision.at.multiple.recall.level.over.classes(ann.no.root, S.htd);
	FMM.htd <- find.best.f(ann.no.root, S.htd, n.round=n.round, f.criterion=f.criterion, verbose=FALSE);
	PRC.htd <- prc.over.classes(ann.no.root, S.htd);

	## Storing Results #########
	## when the normalization is performed here, norm.type is prepended to the flat file name.
	## N.B.: save() looks the objects up by name in this function, so their names must not change.
	out.name <- paste0(if(norm==TRUE) "" else paste0(norm.type, "."), flat.file, ".hierScores.htd.rda");
	save(S.htd, file=paste0(hierScore.dir, out.name), compress=TRUE);
	save(AUC.flat, AUC.htd, file=paste0(macro.dir, "AUC.", out.name), compress=TRUE);
	save(PXR.flat, PXR.htd, file=paste0(macro.dir, "PXR.", out.name), compress=TRUE);
	save(FMM.flat, FMM.htd, file=paste0(Fmeas.dir, "PCM.", out.name), compress=TRUE);
	save(PRC.flat, PRC.htd, file=paste0(macro.dir, "PRC.", out.name), compress=TRUE);
}