#!/usr/bin/perl

## Copyright (C) 2019 Marco Notaro ## 
## License: Perl5 (https://dev.perl.org/licenses/)

## loading needed perl modules
use strict;
use warnings;
use Pod::Usage; ## https://metacpan.org/pod/Pod::Usage
use Getopt::Long; ## https://metacpan.org/pod/Getopt::Long
use Spreadsheet::Read; ## https://metacpan.org/pod/Spreadsheet::Read
use Time::HiRes qw(time); ## https://metacpan.org/pod/Time::HiRes

## command-line switches: --help (or -?) shows the brief usage, --man the full manual page
my %opt;
pod2usage(2) unless GetOptions(\%opt, 'help|?', 'man');
if ($opt{help}) {
	pod2usage(1);
}
if ($opt{man}) {
	## verbose level 2 renders the whole POD; exit status 0 because it was requested
	pod2usage(-exitval => 0, -verbose => 2);
}

## begin script: remember the start time so the elapsed time can be reported at the end
my $start= time;

## some checks
## exactly three positional arguments must be left after option parsing;
## this is verified before the arguments are unpacked and used
if(scalar(@ARGV)!=3){
	die "Number of input arguments required: *3*.\nGet more info by calling help: ./build_oncoppi_ensemble.pl --help\n";
}

## input files: mapping spreadsheet, xgmml network, output file (in this order)
my($oncoppigenes, $oncoppi, $outfile)= @ARGV;

## the dot is escaped and the alternatives are grouped, so only a real ".<ext>"
## suffix is accepted (an unescaped dot matches any character, which let names
## such as "genes_xlsx" slip through)
if($oncoppigenes!~/\.(?:xlsx|xls|ods|csv)$/){
	die "Wrong file extension.\nExtension of first file must be one of the following: .xlsx|.xls|.ods|.csv.\nPlease see help\n";
}
if($oncoppi!~/\.xgmml$/){
	die "Wrong file extension.\nExtension of the second file must be .xgmml.\nPlease see help\n";
}

## reading the mapping spreadsheet
my $spreadsheet= ReadData($oncoppigenes);
## stop with a clear message when the file could not be parsed or holds no first sheet,
## instead of silently iterating over an empty structure
if(!defined $spreadsheet || ref($spreadsheet) ne 'ARRAY' || ref($spreadsheet->[1]) ne 'HASH'){
	die "Cannot read the first sheet of mapping file '$oncoppigenes'\n";
}
my $sheet= $spreadsheet->[1];
## maxrow is the index of the last row of the sheet (cell indexes are 1-based)
my $last_row= $sheet->{maxrow} || 0;

## step1: map gene-symbol_2_ensemble-geneID from the spreadsheet provided by oncoppi
## NOTE: despite its name, the hash is keyed by gene symbol and stores the ensemble gene id
my %ensemble2genename= ();
foreach my $i (3..$last_row){ ## the first 2 rows are header, so skip them
	## column 2 is read as the gene symbol, column 4 as the ensemble gene id
	my $genesymbol= $sheet->{cell}[2][$i];
	my $ensembleid= $sheet->{cell}[4][$i];
	## blank cells are undefined: skip incomplete rows, so that no warning is raised
	## and a blank id never overwrites a valid one read from an earlier row
	next unless defined $genesymbol && $genesymbol ne "";
	next unless defined $ensembleid && $ensembleid ne "";
	$ensemble2genename{$genesymbol}=$ensembleid;
}

## step2: print oncoppi network in tuple format by using ensemble-geneID as entry
## lexical filehandles and 3-argument open: the file names are never interpreted as
## open modes, and a failure to open stops the script with the system error message
open(my $in, '<', $oncoppi) or die "Cannot open network file '$oncoppi' for reading: $!\n";
open(my $out, '>', $outfile) or die "Cannot open output file '$outfile' for writing: $!\n";
while(my $line= <$in>){
	## lines holding a <graph or <node tag are skipped
	next if $line=~/<graph/;
	next if $line=~/<node/;
	## NOTE(review): the two tests below are independent, so a line where only one of
	## the two names is mapped yields an incomplete row -- confirm this is intended
	## first name: the whitespace-free token that follows label="
	if($line=~/label="(\S+)/){
		my $id= $ensemble2genename{$1};
		## names without a mapping are undefined: test definedness before comparing
		if(defined $id && $id ne ""){
			print {$out} "$id\t";
		}
	}
	## second name: the token between "(pp)" and the closing double quote;
	## it ends the row together with the unitary score
	if($line=~/\(pp\)\s+(\S+)"/){
		my $id= $ensemble2genename{$1};
		if(defined $id && $id ne ""){
			print {$out} "$id\t1\n";
		}
	}
}
close($in);
## buffered write errors (e.g. disk full) only show up when the handle is closed
close($out) or die "Cannot close output file '$outfile': $!\n";

## elapsed time since $start, in seconds, rounded to three decimal places
my $elapsed= sprintf("%.3f", time() - $start);
print "\ntime elapsed: $elapsed\n\n";

__END__

=head1 NAME
 
 build oncoppi network 

=head1 SYNOPSIS

 perl build_oncoppi_ensemble.pl [options] [file ...]

 Usage:
   perl build_oncoppi_ensemble.pl oncoppi.genes.xlsx oncoppi.cytoscape.xgmml oncoppi.txt
  
=head1 OPTIONS

=over 3

=item B<--help (-h)>
 
 print a brief help message

=item B<--man (-m)>

 print the manual page

=back

=head1 ARGUMENTS

=over 3

=item B<mapping_file>
 
 file containing the mapping between identifiers (eg oncoppi.genes.xlsx)
 file format can be xlsx|xls|ods|csv (note: csv must be comma or semicolon separated);

=item B<oncoppi_network_file>

 file showing protein-protein interactions (PPIs) in oncoppi network
 oncoppi.cytoscape.xgmml: the file format must be xgmml 

=item B<output_file>

 name of the file to which the oncoppi network is written in tuple format (e.g. oncoppi.txt)

=back

=head1 DESCRIPTION

 Build the oncoppi network in tuple format: p1 <tab> p2 <tab> score, where p1/p2 are Ensembl gene IDs and the score is always 1.
 To this end, the script parses the mapping file and the network file provided by the oncoppi paper (doi: 10.1038/ncomms14356).
	
=head1 COPYRIGHT LICENSE AND DISCLAIMER
 
 Copyright (C) 2019 Marco Notaro, all rights reserved, 

 This program is free software; you can redistribute it and/or modify it under 
 the terms of the GNU General Public License version 3, GPLv3 (https://www.gnu.org/licenses/gpl-3.0.en.html)
 
 This program is distributed in the hope that it will be useful, but
 without any warranty; without even the implied warranty of
 merchantability or fitness for a particular purpose.

=head1 AUTHORS

 Marco Notaro (https://marconotaro.github.io)

=cut

# yowza yowza yowza.

