##
## sequence_analysis.pm
##
##   functions to perform several analyses on input DNA sequences
##   and providing all-frames translation.
##
package sequence_analysis;

use strict;
use warnings;

## Variables Declaration

use vars qw( @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS $VERSION
             %CODON_TABLE @DATA );

use Exporter;

$VERSION = 1.00;

@ISA = qw(Exporter);

@EXPORT = qw( &spacer &load_seq &seq_stats &revcomplement &translate );

@EXPORT_OK = qw( ); # to set functions/vars to be exported on demand
%EXPORT_TAGS = ();


###
### MODULE IMPLEMENTATION
###

## Variables Initialization

%CODON_TABLE = ( 'UNDEF' => 1 );


## Functions

sub spacer() { print STDOUT '=' x 80,"\n\n"; }

sub load_seq() { ## Loading Sequence

	my ($id, $seq);

	while (<>) {

		next if /^\#/o;
		next if /^\s*$/o;

		chomp;

		my $line = $_;

		$line =~ /^\>([^\s]+)\s*/o && do {

			$id = defined($1) ? $1 : 'unknown_dna_seq';
			$seq = '';
			next;

		};

		$seq .= $line;

	}; # while STDIN

	return ($id, $seq);

} # load_seq

sub seq_stats() { ## Basic Sequence Features

	my (%NUCLEOTIDES, $id, $rseq, $seqlen, $gc_content);

	($id, $rseq) = @_;

	%NUCLEOTIDES = ();

	$seqlen = length($$rseq);

	for (my $i = 0; $i < $seqlen; $i++) { # Get Nucleotide Counts

		my $char = uc(substr($$rseq, $i, 1));
		$NUCLEOTIDES{$char}++;

	};

	$gc_content = (($NUCLEOTIDES{G} + $NUCLEOTIDES{C}) / $seqlen) * 100;

	printf STDOUT "SEQUENCE: %s\n\n".
		          "--> STATS:\n".
		          "  Length: %dbp\n".
				  "     G+C: %5.2f%%\n\n",
				  $id, $seqlen, $gc_content;

	return $seqlen;

} # seq_stats

sub revcomplement() { ## Reverse Complement

	my ($seqref, $revseq);

	$seqref = shift;
	$revseq = shift;

	($$revseq = reverse $$seqref) =~ tr/acgtACGT/tgcaTGCA/;

} # revcomplement

@DATA = split /\n/o, <<'+++EOD+++';
#
# REMEMBER: Anyting following the '__END__'
#           or '__DATA__' tokens is ignored
#
# Standard Codon Table -- shorter version
TTT F  TTC F  TTA L  TTG L  CTT L  CTC L  CTA L  CTG L
ATT I  ATC I  ATA I  ATG M  GTT V  GTC V  GTA V  GTG V
TCT S  TCC S  TCA S  TCG S  CCT P  CCC P  CCA P  CCG P
ACT T  ACC T  ACA T  ACG T  GCT A  GCC A  GCA A  GCG A
TAT Y  TAC Y  TAA *  TAG *  CAT H  CAC H  CAA Q  CAG Q
AAT N  AAC N  AAA K  AAG K  GAT D  GAC D  GAA E  GAG E
TGT C  TGC C  TGA *  TGG W  CGT R  CGC R  CGA R  CGG R
AGT S  AGC S  AGA R  AGG R  GGT G  GGC G  GGA G  GGG G
+++EOD+++

sub init_codon_table() {

	%CODON_TABLE = ();

	foreach my $line (@DATA) { # Loading Translation Hash

		next if $line =~ m/^\#/o;

		my @F = split /\s+/o, $line;

		for (my $i = 0; $i < scalar(@F); $i+=2) {

			$CODON_TABLE{$F[$i]} = $F[($i + 1)];

		}; # for $i

	}; # while DATA

} # init_codon_table

sub translate() { ## Translate Sequence

	my ($seqref, $fr) = ($_[0], $_[1]); # @_

	(!exists($CODON_TABLE{UNDEF})) || &init_codon_table();

	printf STDOUT "--> %s Translations:\n",
                  (($fr == 0) ? "FORWARD" : "REVERSE");

	my $seqlen = length($$seqref);

	for (my $phase = 0; $phase < 3; $phase++) {

		my $protein_seq = '';

		for (my $i = $phase; $i < $seqlen; $i += 3) {

			my $codon = substr($$seqref,$i,3);

			last if length($codon) < 3;

			if (defined($CODON_TABLE{$codon})) {

				$protein_seq .= $CODON_TABLE{$codon};

			} else {

				$protein_seq .= 'X'; # Any wildcard provided by
		                             # the corresponding annotation standard...

			}; # 

		}; # for $i

		printf STDOUT "   Phase %d : %s\n", $phase, $protein_seq;

	}; # for $phase

	print STDOUT "\n";

}; # translate


### Here finishes your module implementation the following line
###    is mandatory and warns perl that the module has finished
1;
