=head1 NAME

iPE::Estimator - The main workhorse object of iPE.

=head1 DESCRIPTION

This object handles the dispatching of input files and actions to the objects which estimate the parameters and output parameters in iParameterEstimation.

=head1 FUNCTIONS

=over 8

=cut

package iPE::Estimator;

use iPE;
use iPE::Globals;
use iPE::gHMM;
use iPE::Model::BNTree;
use iPE::Levels;
use iPE::Sequence::Region;
use iPE::FeatureMap;
use iPE::Annotation;
use iPE::Util::DNATools;
use iPE::XML::Wrappers;
use File::Basename;

our $null_bases = 0;

use strict;

sub ANALYZE     { 0 }
sub COUNT       { 1 }
sub NORMALIZE   { 2 }
sub SCORE       { 3 }

=item new (href)

Creates a new estimator object.  Requires a file which describes the model (parameter file or XML parameter template), a feature mapping and a set of input sequences.  These are described in the href passed in as follows:

=over 8

=item gHMMFile

The name of the file containing the gHMM definition.

=item featureMapFiles

An array reference to names of files created.

=item annotationFiles

Array reference of annotations to use for estimating parameters.

=item dnaFiles

Array reference of DNA files to use for training in FASTA format.

=item consFiles

Array reference of conservation sequence files for training in FASTA format.

=back

=cut

sub new {
	my ($class, $m) = @_;
	
    my $this = bless {}, $class;

    foreach my $member (keys %$m) {
        $this->{$member."_"} = $m->{$member};
    }

    #XXX: this should be _init method
    Msg("[ Reading $this->{gHMMFile_} ... ");

    if(defined $this->{gHMMFile_}) {
        #Assuming XML file--checking in gHMM...this okay? (I think so.)
        $this->{gHMM_} = new iPE::gHMM ();
        $this->{gHMM_}->parse_file($this->{gHMMFile_}, "gHMM", 
            "parameter_file", "parameter_file.dtd");
    }
    else { die ("new ".__PACKAGE__." with no gHMM definition.\n"); }

    # in the future, 
    # zoe file can be read here _in addition to_ the parameter template.
    # the parts of the file can be inferred from the names of the model, which
    # are unique.  The file can then initialize the parameters, which can be
    # cleared if options are set to clear and estimate parameters.

    # we can also use this for re-estimation by inputting counts from the zoe
    # file or XML file....


    # set some globals
    my $g = new iPE::Globals();
    $g->gHMM($this->{gHMM_});
    $g->states($this->{gHMM_}->states);

    if( (length($g->options->nscanModel) &&
         length($g->options->nscanTopology)))  {
        $g->bntree(new iPE::Model::BNTree(
            $g->options->nscanTopology,
            $g->options->nscanModel));
    }

    Msg("done ]\n[ Reading feature maps ... ");

    if(defined $this->{featureMapFiles_}) {
        for my $feature_map_filename (@{$this->{featureMapFiles_}}) {
            chomp $feature_map_filename;
            my $feature_map = new iPE::FeatureMap;
            $feature_map->parse_file($feature_map_filename, "feature_map");
            push (@{$this->{featureMaps_}}, $feature_map);
        }
    }
    else { die "new ".__PACKAGE__." with no featureMap list.\n"; }

    $this->{sequences_} = [];

    Msg("done ]\n");

    if(!defined $this->{annotationFiles_}) { 
        die "new ".__PACKAGE__." with no annotation list.\n"; 
    }

    # check for validity
    $this->_validate;

    Msg("Initialization successful.\n\n");

    if($g->options->performScore)      { $this->{mode_} = SCORE      }
    elsif($g->options->performNormalize)  { $this->{mode_} = NORMALIZE  }
    elsif($g->options->performCount)      { $this->{mode_} = COUNT      }
    else                                  { $this->{mode_} = ANALYZE    }

    if(defined($this->{dnaFiles_})) {
        $this->{levels_} = new iPE::Levels(
            {windowSize => $g->options->isochoreWindow});
        $g->levels($this->{levels_});
    }
    else {
        Warn("\nNo DNA sequence defined; cannot estimate durations or ".
             "transitions.\n");
    }

    # add all the comments we're interested in adding
    $this->{gHMM_}->comments->addComment("Input Files:");
    $this->{gHMM_}->comments->addComment("  Instance file:");
    $this->{gHMM_}->comments->addComment("    $this->{instanceFilename_}");
    $this->{gHMM_}->comments->addComment("  gHMM file:");
    $this->{gHMM_}->comments->addComment("    $this->{gHMMFile_}");
    $this->{gHMM_}->comments->addList("  Feature Map files", 
        $this->{featureMapFiles_});
    for my $type (@{$g->seqtypes}) {
        next if (!defined ($this->{$type."Files_"}));
        $this->{gHMM_}->comments->addList("  $type files", 
            $this->{$type."Files_"});
    }
    $this->{gHMM_}->comments->addList("  Annotation files", 
        $this->{annotationFiles_});
    $this->{gHMM_}->comments->addComment("");

    # initialize the random seed
    $this->{randomSeed_} = $g->options->randomSeed();
    $this->{randomSeed_} = time() if($g->options->randomSeed() eq "time");


	$this->{completed_} = \&new;   #keeps track of the last step in estimation
	$this->{nextStep_} = \&_step;  #keeps track of the next step in estimation

    return $this;
}

sub _init {
#FIXME: most of the code from new should be here.
}

sub _validate {
    my ($this) = @_;

    my $g = new iPE::Globals();
    # check to see if there are models that have no corresponding sequences
    for my $seqtype (@{$g->seqtypes}) {
        if(scalar(@{$this->gHMM->getEmissions($seqtype)}) != 0 &&
                !defined($this->{$seqtype."Files_"})) {
            die "gHMM file has emissions of type $seqtype, but no sequences ".
                "of this type were input.\n";
        }
    }
}

sub run {
    my ($this) = @_;

    srand($this->{randomSeed_});

    if($this->mode == NORMALIZE)        { $this->normalize  }
    elsif($this->mode == SCORE)         { $this->score      }
    else                                { $this->step       }
}

=item outputGhmmToFile (filename, format, mode)

Outputs the gHMM in its current state to a file.  If the filename is the empty string, no file is output.  Acceptable formats are zoe and xml.

=cut
sub outputGhmmToFile {
    my ($this, $filename, $format, $mode) = @_;

    my $g = new iPE::Globals();
    if(length($filename) > 0) {
        Msg("[ Outputting $filename ... ");
        my $out = new iPE::Util::Output();
        $this->{gHMM_}->outputPrepare($out, $mode);
        my $prefix = $g->options->outputBaseDir."/";

        if(open FH, ">", $prefix.$filename) {
            $out->setFH(\*FH);
            for($format) {
                if(/^zoe$/)    { $this->{gHMM_}->outputZoe($out, $mode) }
                elsif(/^xml$/) { $this->{gHMM_}->outputXML($out, $mode) }
            }
            close FH;
        }
        else {
            Warn("WARNING: Could not open $filename for writing.\n");
        }
        Msg("done ]\n");
    }
}

sub _getSequenceReaderClassname {
    my ($ext) = @_;
    my $sr_class;
    my $g = new iPE::Globals();
    if($g->options->loadSequences) {
        $sr_class = $g->sr_prefix("load").$g->seqreader($ext);
        eval "use $sr_class";
        if($@) {
            $sr_class = $g->sr_prefix("noload").$g->seqreader($ext);
            eval "use $sr_class";
            if($@) { die "Error: cannot read seq of type $ext\n$@"; }
        }
    }
    else {
        $sr_class = $g->sr_prefix("noload").$g->seqreader($ext);
        eval "use $sr_class";
        if($@) {
            $sr_class = $g->sr_prefix("load").$g->seqreader($ext);
            eval "use $sr_class";
            if($@) { die "Error: cannot read seq of type $ext\n$@"; }
        }
    }
    return $sr_class;
}

#item cleanup
#
#releases all the memory taken up by annotations and sequences

sub cleanup {
    my ($this) = @_;

    $this->{sequences_} = [];
    $this->{annotation_} = undef;  # clear up memory
}

#item next
#
# This function advances the estimator to the next tuple of annotations and sequences.  It cleans up the memory for the old sequences, reads in the new sequences, and finally converts the annotations in to our format.
#
#cut
sub next {
    my ($this) = @_;

    my $g = new iPE::Globals();
    my $dnaseq = undef;
    my $length = undef;
    # for each of the sequence types
    for my $type (@{$g->seqtypes}) {
        my $seq = undef;
        # check if the files are defined
        if(defined $this->{$type."Files_"}) {
            # if that didn't work, go to the next file
            my $seqfile = shift @{$this->{$type."Files_"}};

            if(!defined ($seqfile)) {
                die "Got fewer sequences for type $type. Quitting.\n"
                    if($type ne $g->seqtypes->[0]);
                return 0;
            }

            my ($pre,$file,$ext) = fileparse($seqfile,qr{\.[^.]*});
            $ext =~ s/\.//;
            if(!defined($g->seqreader($ext))) {
                die "Filename extension $ext not supported.";
            }
            
            my $sr_class = _getSequenceReaderClassname($ext);

            Msg("[ Reading $seqfile ... ");

            my $sr = $sr_class->new({filename => $seqfile}); 
            $seq = $g->seqclass($type)->new({sr => $sr});

            if(!defined ($seq)) {
                die "Empty sequence file $seqfile.\n";
            }

            Msg("done ]\n");

            $dnaseq = $seq if($type eq "dna");

            if (!defined $length) {
                $length = $seq->length;
                $this->{curLength_} = $length;
            }
            die "Sequence in file ".$seqfile." is of different length ".
                $seq->length."\n" if($length != $seq->length);

            # handle the special case with nscan alignment sequences
            if($seq->type eq "malign") {
                if(!defined $g->ssSeqNames) {
                    $g->ssSeqNames($seq->seqNames);
                }
                if(scalar(@{$g->ssSeqNames()}) != $g->bntree->numNodes+1) {
                    Warn("Number of phylo tree nodes ".$this->numPhyloTreeNodes.
                      " differs from number of sequences in\n$seqfile.  This ".
                      " might cause an error in the optimization step.\n");
                }
                if(scalar(@{$g->ssSeqNames()}) != $seq->numSeqs) {
                    die "Sequence in file ".$seqfile." has a different ".
                        "number of sequences.\n";
                }
                for(my $i = 0; $i < scalar(@{$seq->seqNames}); $i++) {
                    if($seq->seqNames->[$i] ne $g->ssSeqNames->[$i]) {
                        die "Alignment in file ".$seqfile." has different ".
                            "sequence names in the header.\n";
                    }
                }
            }

            push (@{$this->{sequences_}}, $seq);
        }
    }

    if(defined $g->levels) {
        Msg("[ Getting G+C% levels for DNA ... ");
        $g->levels->setLevels( $dnaseq );
        Msg("done ]\n");
    }

    my $annfile = shift @{$this->{annotationFiles_}};
    die "Fewer annotations than sequences\n" if(!defined $annfile);

    Msg("[ Converting $annfile ... ");

    $this->{annotation_} = new iPE::Annotation({
        filename => $annfile, 
        featureMaps => $this->featureMaps,
        nullDefinitions => $this->gHMM->nullDefinitions,
        seqLength => $this->{curLength_},
        levels => $this->{gcLevels_},
        altspliceStates => [ values (%{$this->gHMM->altspliceStates}) ] 
        });

    #output the annotations to the debug message file.
    msg($this->{annotation_}->format);

    if(length($g->options->blacklistOutputFile) > 0) {
        my $blacklistFile = 
            $g->options->outputBaseDir."/".$g->options->blacklistOutputFile;
        if(open FH, ">$blacklistFile") {
            print FH "$annfile:\n";
            print FH "$_\n" for (@{$this->{annotation_}->blacklist});
            close FH;
        }
        else {
            Msg "WARNING: Could no open file $blacklistFile for writing.\n";
        }
    }

    Msg("done ]\n");

    return 1;
}

sub gHMM        { shift->{gHMM_} }
sub featureMaps { shift->{featureMaps_} }
sub mode        { shift->{mode_}        }

sub sequences   { shift->{sequences_} }
sub annotation  { shift->{annotation_} }

sub step  { shift->_get_to(\&_step); }
sub normalize { shift->_get_to(\&_normalize); } 
sub score { shift->_get_to(\&_score); }
sub optimize { shift->_get_to(\&_optimize); }

# -- private functions

# _get_to (goalsub)
#
# Runs the current routine (step, normalize, or score) until the desired routine is reached.  These are advanced in the routines themselves _step, _normalize, and _score.
sub _get_to {
	my ($this, $goal) = @_;

	while($this->{completed_} ne $goal) {
		die "Undefined routine in get to" if (!defined $this->{nextStep_});
		&{$this->{nextStep_}}($this);
	}
}

# _update_state (completed, nextStep)
#
# Updates the state of the Estimator.  Sets the next step to the second argument, sets the completed step to first argument.  Should be called in all routines along the chain of estimation
sub _update_state {
	my $this = shift;
	$this->{completed_} = shift;
	$this->{nextStep_} = shift;
}



sub _step {
    my $this = shift;

    my ($featureDir, $modelDir);

    my $g = new iPE::Globals();
    # check to see if we have anything to do at all.
    if($g->options->featureOutputDir =~ /\w/) {
        $featureDir = $g->options->outputBaseDir."/".
            $g->options->featureOutputDir;
        if (-d $featureDir) {
            system("rm -r $featureDir");
        }
        mkdir $featureDir or die "Could not create directory $featureDir\n";
    }
    else { $featureDir = 0 }

    if($g->options->modelOutputDir =~ /\w/) {
        $modelDir = $g->options->outputBaseDir."/".$g->options->modelOutputDir;
        if (-d $modelDir) {
            system("rm -r $modelDir");
        }
        mkdir $modelDir or die "Could not create directory $modelDir\n";
    }
    else { $modelDir = 0 }

    unless($this->mode >= COUNT || $featureDir || $modelDir) {
        die __PACKAGE__.": I have nothing to do.  Specify that you wish ".
            "to count, normalize, score or output data.\n";
    }

    while($this->next) {
        #do all the special outputting 
        if($featureDir || $modelDir) {
            Msg("[ Outputting fasta files ... ");
            my $thresh = $this->_progress(0, -1, $this->{curLength_});
            for my $segAnn (@{$this->annotation->segmentedAnnotations}) {
                for my $transcript (@{$segAnn->transcripts}) {
                    for my $feature (@{$transcript->features}) {
                        $thresh = $this->_progress($feature->start, $thresh, 
                            $this->{curLength_});
                        if($featureDir) {
                            for my $sequence (@{$this->sequences}) {
                                my $dir = $featureDir."/".$sequence->type;
                                my $filename = $feature->state->name.".fa";
                                unless(-d $dir) {
                                    mkdir $dir or die 
                                        "Could not create $dir.\n";
                                }
                                open FH, ">>$dir/$filename" or die 
                                    "could not open $filename for appending.\n";
                                print FH ">".$transcript->id."\n";
                                $sequence->writeFeatureToFH(\*FH, $feature);
                                close FH;
                            }
                        }
                        if($modelDir) {
                            for my $sequence(@{$this->sequences}) {
                                my $dir = $modelDir."/".$sequence->type;
                                unless(-d $dir) {
                                    mkdir $dir or die 
                                        "Could not create $dir.\n";
                                }
                                my $chain = 
                                    $feature->state->getModels($sequence->type);
                                $chain->outputModels($dir, $sequence, $feature,
                                    $transcript->id);
                            }
                        }
                    }
                }
            }
            $this->_progress(-1);
        }
        if($this->mode >= COUNT) {
            $this->_count;
        }
        $this->cleanup;
    }


    if($this->mode >= COUNT) { 
        Msg("Counting complete.\n\n\n");

        $this->outputGhmmToFile($g->options->countOutputFile, "xml", "count");

        Msg("[ Smoothing counts ... ");
        $this->gHMM->smooth;
        Msg("done ]\n");

        msg("Total NULL BASES: $null_bases\n");
        $this->outputGhmmToFile($g->options->smoothedCountOutputFile, "xml", 
            "count");
    }

    $this->gHMM->comments->addComment("Completed on ".scalar(localtime(time())));

	$this->_update_state(\&_step, \&_normalize);
}

# _normalize ()
#
# Normalize all the models into probabilities, but not scores.
sub _normalize {
    my $this = shift;

    # We normalize the totals to get a vague idea of the base composition of
    # all the sequence.  This is used for a null model against the 
    # bases that are given in the positive level
    my $g = new iPE::Globals();
    Msg("[ Maximizing parameters ... ");
    $g->levels->normalizeTotals;

    $this->gHMM->normalize;
    Msg("done ]\n");

    $this->outputGhmmToFile($g->options->probOutputFile, "xml", "prob");

    $this->_update_state(\&_normalize, \&_score);
}

# _score ()
#
# Convert all the models to log-odds scores, incorporating the null model.
sub _score {
    my $this = shift;

    $this->gHMM->score;

    my $g = new iPE::Globals();
    $this->outputGhmmToFile($g->options->xmlOutputFile, "xml", "score");
    $this->outputGhmmToFile($g->options->zoeOutputFile, "zoe", "score");

	$this->_update_state(\&_score, undef);
}

# _count ()
#
# Count all the models.
sub _count {
    my $this = shift;

    die "Fewer sequences than annotations\n" 
        if (scalar(@{$this->sequences}) == 0);

    my $g = new iPE::Globals();
    if(defined($g->levels)) {
        Msg("[ Counting durations and transitions ... ");
        for my $segAnn (@{$this->annotation->segmentedAnnotations}) {
            for my $transcript (@{$segAnn->transcripts}) {
                # no weighting
                my $prev_feature = undef;
                for my $feature (@{$transcript->features}) {
                    #transition model
                      # here we could add a weight to the transition, 
                      # but we don't
                    if(defined($prev_feature) &&
                            $prev_feature->end == $feature->start-1) {
                        msg("transition from ".$prev_feature->state->name." to ".
                            $feature->state->name."\n");
                        $prev_feature->state->transitions->count($feature, 1) 
                    }
                    $prev_feature = $feature;

                    # duration distribution
                      # here we could weight the distributions, but the memory
                      # blows up.  (Using floats for counts is expensive.)
                    msg("Counting state ".$feature->state->name."'s duration\n");
                    if(defined $feature->state->durModel) {
                       $feature->state->durModel->countFeature($feature, 1);
                    }
                }
            }
        }
        Msg("done ]\n");
    }

    Msg("[ Counting sequence models ... ");
    my $thresh = $this->_progress(0, -1, $this->{curLength_});
    for my $segAnn (@{$this->annotation->segmentedAnnotations}) {
        for my $segment (@{$segAnn->segments}) {
            $thresh = 
                $this->_progress($segment->start, $thresh, $this->{curLength_});
            $this->_count_segment($segment);
        }
    }
    $this->_progress(-1);
    Msg("\n");
}

sub _count_segment {
    my ($this, $segment) = @_;

    # the weight of the counts is based on the number of valid layers
    # in the current segment.  That is if the picture looks like this
    # EEEEEEEEE
    # ---------
    # IIIIIIIII
    # Then each layer gets .5 weight for each segment.
    # This currently recounts layers which are the same.

    msg("("); msg($_->id.", ") for (@{$segment->transcripts}); msg(")\n");

    my $g = new iPE::Globals();
    my $weight = 1;
    return if($segment->validLayers == 0);
    if($g->options->weightCounts) { $weight = 1/$segment->validLayers; }

    for my $feature (@{$segment->features}) {
        next if (!defined $feature);

        msg("Counting ".$feature->state->name."\n") ;

        # Here we iterate over all different kinds of sequences
        # that have been declared to the estimator.
        # (DNA, ConSeq, ESTSeq, ...)
        for my $seq (@{$this->sequences}) {

            my ($start, $end, $parentStart, $parentEnd);

            if($feature->strand eq '-') { 
                ($start, $end) = 
                    rcCoords($segment->start, $segment->end, $seq->length);
                ($parentStart, $parentEnd) = 
                    rcCoords($feature->start, $feature->end, $seq->length);
            }
            else {  
                ($start, $end) = ($segment->start, $segment->end);
                ($parentStart, $parentEnd) = ($feature->start, $feature->end);
            }

            my $members = { parentStart => $parentStart,
                            parentEnd   => $parentEnd,
                            start       => $start,
                            end         => $end,
                            seq         => $seq,
                            feature     => $feature,
                            weight      => $weight          
                          };

            my $region = new iPE::Sequence::Region($members);

            msg("WEIGHT: ".$region->weight."\n") if($region->weight < 1.);

            # submit region
            $feature->state->countRegion($region);

            # if this feature has a null region, 
            # submit it to the gHMM.
            for my $nullRegion (@{$feature->nullRegions}) {
                next if($region->seq->type ne $nullRegion->seqtype);
                my ($nullStart, $nullEnd);
                if($feature->strand eq '-') {
                    ($nullStart, $nullEnd) =
                        rcCoords($nullRegion->start, $nullRegion->end,
                            $seq->length);
                }
                else {
                    ($nullStart, $nullEnd) = 
                        ($nullRegion->start, $nullRegion->end)
                }
                my $null = $region->subregion($nullStart, $nullEnd);
                $this->gHMM->countNullRegion($null) if(defined($null));

                #debugging
                $null_bases += ($null->end - $null->start + 1)*$null->weight
                    if (defined($null));
            }
        }
    }

}

sub _progress {
    my ($this, $val, $thresh, $total) = @_;

    if($val == 0 && $thresh < 0) {
        Msg("  0%");
        $thresh = $total/100;
    }
    elsif($val == -1) {
        Msg("\b\b\b\bdone ]\n");
    }
    elsif($val > $thresh) {
        Msg(sprintf("\b\b\b\b%3d%%", 100*($val/$total)));
        $thresh = $val + $total/100;
    }

    return $thresh;
}

=back

=head1 SEE ALSO

L<iPE>

=head1 AUTHOR

Bob Zimmermann (rpz@cse.wustl.edu)

=cut

1;
