#!/usr/bin/perl
use strict;
use Getopt::Std;

##################################
#
#  Constructor.pl
#
#  INPUT: abs.gff, tf.dat, PID
#
#  STDOUT: sequences and TFBSs
#  OUTPUT_FILE: TFBSs for image
##################################

### CONSTANTS
my $TRUE=1;
my $FALSE=0;
my $MIN_DISTANCE=50;
my $LFRAGMENT;
my ($A,$C,$G,$T) = ("a","c","g","t");
my $NO_MOTIF="-";
my $LFASTA=60;
my $GAME="CONSTRUCTOR";


### STEP 0. READING VALUES AND OPTIONS
my %opts;
my ($abs_file,$tf_file,$PID);
my ($LENGTH,$NSEQS,$NMOTIFS,$PPLANT,$SPECIES,$ACONTENT,$CCONTENT,$GCONTENT,$TCONTENT);
my $OUTPUT_FILE;                                                               

getopts('L:N:M:P:S:A:C:G:T:vh',\%opts);
prepare_opts();

print_help();

if (scalar(@ARGV) != 3)
{
    print_error("Wrong Number of Parameters (3 is OK)");
}

$abs_file = shift(@ARGV);
$tf_file = shift(@ARGV);
$PID = shift(@ARGV);
$OUTPUT_FILE ="/tmp/CONSTRUCTOR_$PID.gff";


### STEP 1. LOADING TFBSs AND TFs
print_mess("Step 1. Loading sites and tfs");

my ($line,@record);
my ($factor,$motif,$species);
my $i;

my %SITES;
my %SITESGFF;

my %TFS;
my @TFID;

open(SITES,$abs_file) or die (print_error("Can't open $abs_file"));
while($line=<SITES>)
{
    @record = split(/\s+/,$line);

    $factor = $record[2];
    $motif = $record[10];
    $species = $record[9];
    $species =~ s/\://g;
    
    if ($species eq $SPECIES)
    {
	push(@{$SITES{$factor}},$motif);
	chomp($line);
	push(@{$SITESGFF{$factor}},$line);
    }
}
close(SITES);

open(FACTORS,$tf_file) or die (print_error("Can't open $tf_file"));
$i = 0;
while($line=<FACTORS>)
{
    chomp($line); 
    $TFS{$line} = 1; 
    $TFID[$i] = $line;
    $i++;
}
close(FACTORS);


### STEP 2. Generate the sequences: fragment by fragment
print_mess("Step 2. Producing the sequences with the planted motifs");

my @SEQS;
my @FINAL_ABS;
my @FINAL;
my $j;
my $to_plant;
my $key;
my $position;
my $locus;

# initialize seed
srand( time() ^ ($$ + ($$ << 15)) );

# Initialize sequences
for($i=0; $i<$NSEQS; $i++)
{
    $SEQS[$i]="";
}

# Open OUTPUT_FILE to write the planted sites for a picture
open(OUT,">$OUTPUT_FILE") or die (print_error("Can't open $OUTPUT_FILE"));

# Starting the production
for($i=0; $i<$LENGTH; $i=$i+$LFRAGMENT)
{
    # Randomly Select a TF
    $factor = int(rand scalar(@TFID)); 

    print_mess("TF selected ($factor): ",$TFID[$factor]);

    # Plant a site of this TF in the sequences
    for($j=0; $j<$NSEQS; $j++)
    {
	$locus = "SEQUENCE_".($j+1);
	# To plant or not to plant
	$to_plant = rand(1);

	# Plant a motif of this TF in this sequence
	if ($to_plant < $PPLANT && defined($SITES{$TFID[$factor]}))
	{
	    # Randomly select the motif for this TF from this species to be planted
	    $key = int(rand scalar(@{$SITES{$TFID[$factor]}}));
	    $motif = ${$SITES{$TFID[$factor]}}[$key];

	    # record the motif to display afterwards
	    push(@FINAL_ABS,${$SITESGFF{$TFID[$factor]}}[$key]);
	    
	    # Randomly select the position to be planted on current fragment
	    $position = int(rand($LFRAGMENT-length($motif)));
         
            # record the motif in the generated sequence (positions +1)
            push(@FINAL,join("\t",$locus,$GAME,$TFID[$factor],$i+$position+1,$i+$position+length($motif)-1+1,".",".",".","# $SPECIES: $motif"));
            print OUT join("\t",$locus,$GAME,$TFID[$factor],$i+$position+1,$i+$position+length($motif)-1+1,".",".",".","# $SPECIES: $motif\n");

	    print_mess("PLANT motif <$key, $motif> in sequence ", $j+1 ," at position ", 
		       $i+$position+1);
	    
	    # Fill with the BG before the motif
	    $SEQS[$j] = generateBG($SEQS[$j],$position); 
	    
	    # Plant the motif
	    $SEQS[$j] = $SEQS[$j].$motif;

	    # Fill with the BG after the motif
	    $SEQS[$j] = generateBG($SEQS[$j],$LFRAGMENT-($position+length($motif)-1+1)); 
	}
        else
	{
	    # generate the whole fragment in background mode
	    print_mess("NO PLANTING motif <$i> in sequence ",$j+1);
	    $SEQS[$j] = generateBG($SEQS[$j],$LFRAGMENT);

	    # record the no-planted motif
	    push(@FINAL_ABS,$NO_MOTIF);
	   
	}
    }
}

close(OUT);


### STEP 3. Display results
print_mess("Step 3. Displaying results");

my @seq;

# DISPLAY sequences with planted motifs
print "<BR><BR><B><FONT COLOR=blue>Data set of artificial sequences:</FONT></B><BR><BR>";
print "<TABLE><TR><TD><PRE>";

for($i=0; $i<$NSEQS; $i++)
{
    print "\>SEQUENCE_",$i+1,"<BR>";
    
    @seq = split(//,$SEQS[$i]);

    for ($j=0; $j < $LENGTH; $j++)
    {
	print $seq[$j];
	if (($j+1) % $LFASTA == 0)
	{
	    print "<BR>";
	}
    }

    print "<BR>";
}

print "</PRE></TD></TABLE>";
print "<BR><BR>";

# DISPLAY planted sites info
print "<B><FONT COLOR=blue>Planted motifs in the data set:</FONT></B><BR><BR>";
print "<TABLE BORDER=0 BORDERCOLOR=red><TR><TD><PRE>";

while (@FINAL) 
{
    $line = shift(@FINAL);
    @record = split(/\s+/,$line);
    printf("%s\t%s\t%s\t%d\t%d\t.\t.\t.\t%s %s %s\n",$record[0],$record[1],$record[2],$record[3],$record[4],$record[8],$record[9],$record[10]);
}

print "</PRE></TD></TABLE>";

print "<BR><BR>";


# DISPLAY planted sites info in ABS
print "<B><FONT COLOR=blue>Planted motifs annotations in ABS:</FONT></B><BR><BR>";
print "<TABLE BORDER=1 BORDERCOLOR=red><TR><TD>";

$j = 1;
while (@FINAL_ABS) 
{
    print "<B><FONT COLOR=blue>MOTIF $j:</FONT></B><BR><BR>";
    print "<TABLE BORDER=0>";

    for($i=0; $i<$NSEQS; $i++)
    {
	$line = shift(@FINAL_ABS);
	@record = split(/\s+/,$line);
	print "<TR><TD><PRE>SEQUENCE_",$i+1,"</PRE></TD><TD>&nbsp;</TD><TD><PRE>$record[0]</PRE></TD><TD>&nbsp;</TD><TD><PRE>$record[1]</PRE></TD><TD>&nbsp;</TD><TD><PRE>$record[2]</PRE></TD><TD>&nbsp;</TD><TD><PRE>$record[3]</PRE></TD><TD>&nbsp;</TD><TD><PRE>$record[4]</PRE></TD><TD>&nbsp;</TD><TD><PRE>$record[5]</PRE></TD><TD>&nbsp;</TD><TD><PRE>$record[6]</PRE></TD><TD>&nbsp;</TD><TD><PRE>$record[7]</PRE></TD><TD>&nbsp;</TD><TD><PRE>$record[8]</PRE></TD><TD>&nbsp;</TD><TD><PRE>$record[9]</PRE></TD><TD>&nbsp;</TD><TD><PRE>$record[10]</PRE></TD><TD>&nbsp;</TD>";
    }
    
    print "</TABLE><BR><BR>";

    $j++;
}

print "</TD></TABLE>";

print "<BR><P>";



#######################################
### FUNCTIONS
#######################################

sub print_error
{
    my @mess = @_;

    print STDERR "!ERROR> CONSTRUCTOR\t",@mess,"\n";
    exit(0);
}

sub print_mess
{
    my @mess = @_;

    if (defined($opts{v}))
    {
	print STDERR ">",@mess,"\n";
    }
}

sub print_help
{
    if (defined($opts{h}))
    {
	print "#### Constructor of benchmarks 1.0\n";
	print "##################################\n";
	print "-L <integer>      length of sequences\n";
	print "-N <integer>      number of sequences\n";
	print "-M <integer>      number of planted motifs\n";
	print "-P [0..1]        probability to plant a motif\n";
	print "-A [0..1]        background A content\n";
	print "-C [0..1]        background C content\n";
	print "-G [0..1]        background G content\n";
	print "-T [0..1]        background T content\n";
	print "-S <HS,MM,RN,GG>  species\n\n";

	exit(0);
    }
}

sub prepare_opts
{
    # default values
    $LENGTH = 500;
    $NSEQS = 10;
    $NMOTIFS = 5;
    $PPLANT = 0.75;
    $SPECIES = "HS";

    $ACONTENT = 0.25;
    $CCONTENT = 0.25;
    $GCONTENT = 0.25;
    $TCONTENT = 0.25;

    if (defined($opts{L}))
    {
	$LENGTH = $opts{L};
    }
    if (defined($opts{N}))
    {
	$NSEQS = $opts{N};
    }
    if (defined($opts{M}))
    {
	$NMOTIFS = $opts{M};
    }
    if (defined($opts{P}))
    {
	$PPLANT = $opts{P};
    }
    if (defined($opts{S}))
    {
	$SPECIES = $opts{S};
    }
    if (defined($opts{A}))
    {
	$ACONTENT = $opts{A};
    }
    if (defined($opts{C}))
    {
	$CCONTENT = $opts{C};
    }
    if (defined($opts{G}))
    {
	$GCONTENT = $opts{G};
    }
    if (defined($opts{T}))
    {
	$TCONTENT = $opts{T};
    }


    # Restriction about density of elements
    $LFRAGMENT = $LENGTH/$NMOTIFS;
    if ($LFRAGMENT < $MIN_DISTANCE)
    {
	print_error("Too many sites for this length ($MIN_DISTANCE)");
    }
}

sub generateBG
{
    my $seq = shift(@_);
    my $N = shift(@_);

    my $i;
    my $GC;
    my $value;

    for($i=0; $i<$N; $i++)
    {
	# GC or AT rich
	$GC = rand(1);

	# C or G
	if ($GC < ($CCONTENT + $GCONTENT))
	{
	    $value = rand($CCONTENT + $GCONTENT);
	    if ($value < $CCONTENT)
	    {
		$seq = $seq.$C;
	    }
	    else
	    {
		$seq = $seq.$G;
	    }
	}
	else
	{
	    # A or T
	    $value = rand($ACONTENT + $TCONTENT);
	    if ($value < $ACONTENT)
	    {
		$seq = $seq.$A;
	    }
	    else
	    {
		$seq = $seq.$T;
	    } 
	}
    }

    return($seq);
}
