#!/usr/local/bin/perl
##---------------------------------------------------------------------------##
##  File:
##      @(#) combineRMFiles
##  Author:
##      Robert Hubley
##  Description:
##      Combine several *.out *.align RepeatMasker annotation files
##
#******************************************************************************
#*  This software is provided ``AS IS'' and any express or implied            *
#*  warranties, including, but not limited to, the implied warranties of      *
#*  merchantability and fitness for a particular purpose, are disclaimed.     *
#*  In no event shall the authors or the Institute for Systems Biology        *
#*  liable for any direct, indirect, incidental, special, exemplary, or       *
#*  consequential damages (including, but not limited to, procurement of      *
#*  substitute goods or services; loss of use, data, or profits; or           *
#*  business interruption) however caused and on any theory of liability,     *
#*  whether in contract, strict liability, or tort (including negligence      *
#*  or otherwise) arising in any way out of the use of this software, even    *
#*  if advised of the possibility of such damage.                             *
#*                                                                            *
#******************************************************************************

=head1 NAME

combineRMFiles - Combine several *.out, *.align RepeatMasker files

=head1 SYNOPSIS

  combineRMFiles.pl <file1Prefix> <file2Prefix> <newFilesPrefix>

  e.g. ./combineRMFiles.pl foo bar final
            combines foo.out and foo.align with bar.out and bar.align
            to create final.out and final.align

=head1 DESCRIPTION

  Combines two sets of RepeatMasker *.out and *.align files into one
  set.  The assumption is that the entries in each do not overlap because
  the second set was generated by pre-masking the genome with the first set.
  IDs are renumbered and the results are placed in seq/pos sorted order.
 
=head1 SEE ALSO

=head1 COPYRIGHT

Copyright 2023 Robert Hubley, Institute for Systems Biology

=head1 AUTHOR

Robert Hubley <rhubley@systemsbiology.org>

=cut

#
# Module Dependence
#
use strict;
use FindBin;
use lib $FindBin::Bin;
use lib "$FindBin::Bin/..";
use FileHandle;

# This module has the file parser for crossmatch search
# result formats (like RM)
use CrossmatchSearchEngine;
# A module/object to hold a single *.out line or *.align record
use SearchResult;
# A module/object to hold a collection of SearchResults 
# ( e.g. a whole *.out file )
use SearchResultCollection;
use Data::Dumper;


# Print this script's POD as plain-text usage help and terminate.
#
# Takes no arguments and never returns.  On success exec() replaces the
# current process with pod2text, so the statements after it only run when
# pod2text could not be started.
sub usage {
  # List form: $0 is handed to pod2text as one argument and is never
  # interpreted by a shell, whatever characters the script path contains.
  exec( 'pod2text', $0 )
      or print STDERR "Could not run pod2text on $0: $!\n";
  exit 1;
}

# Simplistic argument parsing for a one-off script.  All three prefixes
# are required, so show the usage text unless at least three were given
# (any additional arguments are ignored).
if ( @ARGV < 3 )
{
 usage();
}

my ( $file1, $file2, $final ) = @ARGV;

# Each input prefix must provide both a *.out and a *.align file.  A path
# passes if -e sees it or if it is a symbolic link (-l).
foreach my $prefix ( $file1, $file2 ) {
  my $haveOut   = ( -e "$prefix.out"   || -l "$prefix.out" );
  my $haveAlign = ( -e "$prefix.align" || -l "$prefix.align" );
  if ( !( $haveOut && $haveAlign ) ) {
    die "Could not find $prefix.out or $prefix.align check that *both* exist!\n";
  }
}

# Open file1.out.  Three-argument open keeps the file name from being
# interpreted as part of the open mode.
my $IN = FileHandle->new;
open( $IN, '<', "$file1.out" )
    or die "Could not open $file1.out: $!\n";

# Read entire file into SearchResult objects contained in a
# SearchResultCollection
print "Reading $file1.out...\n";
my $outfileSearchResCol = CrossmatchSearchEngine::parseOutput( searchOutput => $IN );
close $IN;

# Open file2.out.  The $IN variable declared above is reused with a fresh
# handle rather than being declared a second time.
$IN = FileHandle->new;
open( $IN, '<', "$file2.out" )
    or die "Could not open $file2.out: $!\n";

# Read entire file into SearchResult objects contained in a
# SearchResultCollection
print "Reading $file2.out...\n";
my $tmpResCol = CrossmatchSearchEngine::parseOutput( searchOutput => $IN );
close $IN;

# Tag every file2 *.out record by putting a "t" in front of its ID,
# e.g. "1" -> "t1", "2" -> "t2"...  The collection is walked from the
# last record to the first.
foreach my $idx ( reverse 0 .. $tmpResCol->size() - 1 )
{
  my $file2Rec = $tmpResCol->get($idx);
  # Insert removal operations here if filtering
  $file2Rec->setId( "t" . $file2Rec->getId() );
}

# With the file2 IDs prefixed they should no longer clash with the file1
# IDs, so the file2 records can be added to the file1 collection.
$outfileSearchResCol->addAll($tmpResCol);
 

# Sort the combined collection by query (sequence) name, then by query
# start; records with the same start are ordered with the larger query end
# first.  The comparator keeps its ($$) prototype and receives the two
# records being compared in @_.
my $outFileOrder = sub ($$) {
  my ( $left, $right ) = @_;
  my $order = $left->getQueryName() cmp $right->getQueryName();
  $order = $left->getQueryStart() <=> $right->getQueryStart() if ( $order == 0 );
  $order = $right->getQueryEnd() <=> $left->getQueryEnd() if ( $order == 0 );
  return $order;
};
$outfileSearchResCol->sort($outFileOrder);

# Hand out fresh sequential IDs (1, 2, 3, ...) in sorted order.  Records
# that carried the same ID before this step receive the same new ID.  The
# old-to-new mappings are remembered per input file so that the *.align
# records can be given matching IDs later.
my %IDTransTable = ();          # pre-renumbering ID -> new ID
my %file1AlignTransTable = ();  # original file1 ID -> new ID
my %file2AlignTransTable = ();  # digits following the "t" marker -> new ID
my $IDIdx = 0;                  # most recently assigned new ID
foreach my $pos ( 0 .. $outfileSearchResCol->size() - 1 )
{
  my $record = $outfileSearchResCol->get($pos);
  my $oldID  = $record->getId();

  # The first time an old ID shows up it is given the next free number;
  # afterwards the stored translation is reused.
  if ( !exists $IDTransTable{$oldID} ) {
    $IDIdx++;
    $IDTransTable{$oldID} = $IDIdx;
  }
  my $newID = $IDTransTable{$oldID};

  # Save the new ID in the record
  $record->setId($newID);

  # A "t" marker followed by digits identifies a file2 record; everything
  # else is treated as a file1 record.
  if ( my ($file2ID) = $oldID =~ /t(\d+)/ ) {
    $file2AlignTransTable{$file2ID} = $newID;
  }
  else {
    $file1AlignTransTable{$oldID} = $newID;
  }
}

print "Writing combined *.out...\n";
# Write out the combined *.out file with the new IDs.  A lexical handle and
# three-argument open are used, and failures report the file and reason.
open( my $outFH, '>', "$final.out" )
    or die "Could not open $final.out for writing: $!\n";
for ( my $i = 0; $i < $outfileSearchResCol->size(); $i++ )
{
  my $result = $outfileSearchResCol->get($i);
  # The "" concatenation keeps the toStringFormatted() call in scalar
  # context instead of print's list context.
  print {$outFH} "" . $result->toStringFormatted( SearchResult::OutFileFormat );
}
# Buffered write errors only surface when the handle is closed.
close($outFH)
    or die "Could not finish writing $final.out: $!\n";
# Drop the reference to the *.out collection; it is not used again.
$outfileSearchResCol = undef;


##
### Now let's combine the *.align files
##
# Open file1.align
$IN = FileHandle->new;
open( $IN, '<', "$file1.align" )
    or die "Could not open $file1.align: $!\n";

# Read entire file into SearchResult objects contained in a
# SearchResultCollection
my $alignfileSearchResCol = CrossmatchSearchEngine::parseOutput( searchOutput => $IN );
close $IN;
for ( my $i = 0; $i < $alignfileSearchResCol->size(); $i++ )
{
  my $result = $alignfileSearchResCol->get($i);
  # Insert removal operations here if filtering
  my $ID = $result->getId();
  # Translate the IDs to match the new combined *.out
  my $newID = $file1AlignTransTable{$ID};
  # The table only holds IDs that occurred in the *.out records, so an
  # alignment with any other ID has no translation.  Report it instead of
  # silently dropping the ID.
  if ( defined $ID && !defined $newID ) {
    warn "Warning: $file1.align record with ID $ID has no matching ID in $file1.out\n";
  }
  $result->setId($newID);
}

# Open file2.align
$IN = FileHandle->new;
open( $IN, '<', "$file2.align" )
    or die "Could not open $file2.align: $!\n";

# Read entire file into SearchResult objects contained in a
# SearchResultCollection.  The existing $tmpResCol variable is reused
# rather than declared again, so no second variable of the same name
# masks the first.
$tmpResCol = CrossmatchSearchEngine::parseOutput( searchOutput => $IN );
close $IN;
for ( my $i = 0; $i < $tmpResCol->size(); $i++ )
{
  my $result = $tmpResCol->get($i);
  # Insert removal operations here if filtering
  my $ID = $result->getId();
  # Translate the IDs to match the new combined *.out
  my $newID = $file2AlignTransTable{$ID};
  # The table only holds IDs that occurred in the *.out records, so an
  # alignment with any other ID has no translation.  Report it instead of
  # silently dropping the ID.
  if ( defined $ID && !defined $newID ) {
    warn "Warning: $file2.align record with ID $ID has no matching ID in $file2.out\n";
  }
  $result->setId($newID);
}
# Add the translated file2 alignments to the file1 alignment collection.
$alignfileSearchResCol->addAll($tmpResCol);

# Sort the combined alignment collection by query (sequence) name, then by
# query start; records with the same start are ordered with the larger
# query end first.  The comparator keeps its ($$) prototype and receives
# the two records being compared in @_.
my $alignFileOrder = sub ($$) {
  my ( $left, $right ) = @_;
  my $order = $left->getQueryName() cmp $right->getQueryName();
  $order = $left->getQueryStart() <=> $right->getQueryStart() if ( $order == 0 );
  $order = $right->getQueryEnd() <=> $left->getQueryEnd() if ( $order == 0 );
  return $order;
};
$alignfileSearchResCol->sort($alignFileOrder);


# Finally save the results to the combined *.align.  A lexical handle and
# three-argument open are used, and failures report the file and reason.
open( my $alignFH, '>', "$final.align" )
    or die "Could not open $final.align for writing: $!\n";
for ( my $i = 0; $i < $alignfileSearchResCol->size(); $i++ )
{
  my $result = $alignfileSearchResCol->get($i);
  # The "" concatenation keeps the toStringFormatted() call in scalar
  # context; each record is followed by a newline.
  print {$alignFH} "" . $result->toStringFormatted( SearchResult::N_AlignWithQuerySeq ) . "\n";
}
# Buffered write errors only surface when the handle is closed.
close($alignFH)
    or die "Could not finish writing $final.align: $!\n";
# Drop the references to the collections; they are not used again.
$alignfileSearchResCol = undef;
$tmpResCol = undef;



1;
