#!/usr/local/bin/perl
##---------------------------------------------------------------------------##
##  File:
##      @(#) combineRMFiles
##  Author:
##      Robert Hubley
##  Description:
##      Combine several *.out *.align RepeatMasker annotation files
##
#******************************************************************************
#* This software is provided ``AS IS'' and any express or implied             *
#* warranties, including, but not limited to, the implied warranties of       *
#* merchantability and fitness for a particular purpose, are disclaimed.      *
#* In no event shall the authors or the Institute for Systems Biology be      *
#* liable for any direct, indirect, incidental, special, exemplary, or        *
#* consequential damages (including, but not limited to, procurement of       *
#* substitute goods or services; loss of use, data, or profits; or            *
#* business interruption) however caused and on any theory of liability,      *
#* whether in contract, strict liability, or tort (including negligence       *
#* or otherwise) arising in any way out of the use of this software, even     *
#* if advised of the possibility of such damage.                              *
#*                                                                            *
#******************************************************************************

=head1 NAME

combineRMFiles - Combine several *.out, *.align RepeatMasker files

=head1 SYNOPSIS

  combineRMFiles.pl <file1_prefix> <file2_prefix> <combined_prefix>

  e.g.

    ./combineRMFiles.pl foo bar final

  combines foo.out and foo.align with bar.out and bar.align to create
  final.out and final.align

=head1 DESCRIPTION

Combines two sets of RepeatMasker *.out and *.align files into one set.
The assumption is that the entries in each do not overlap because the
second set was generated by pre-masking the genome with the first set.
IDs are renumbered so that they are unique across the combined set, and
the results are placed in seq/pos sorted order.

=head1 SEE ALSO

=head1 COPYRIGHT

Copyright 2023 Robert Hubley, Institute for Systems Biology

=head1 AUTHOR

Robert Hubley

=cut

#
# Module Dependence
#
use strict;
use warnings;
use FindBin;
use lib $FindBin::Bin;
use lib "$FindBin::Bin/..";
use FileHandle;

# This module has the file parser for crossmatch search
# result formats (like RM)
use CrossmatchSearchEngine;

# A module/object to hold a single *.out line or *.align record
use SearchResult;

# A module/object to hold a collection of SearchResults
# ( e.g. a whole *.out file )
use SearchResultCollection;
use Data::Dumper;

# Display this script's POD documentation via pod2text.  On success the
# current process is replaced and control never returns.
sub usage {

    # List-form exec hands $0 to pod2text directly instead of through a
    # shell, so unusual characters in the script path are harmless.
    exec( 'pod2text', $0 )
      or die "Could not run pod2text on $0: $!\n";
}

# Simplistic argument parsing for a one-off script.  All three prefixes
# (file1, file2, final) are required; with fewer, $final would be undefined
# and the output would be written to ".out"/".align".
if ( @ARGV < 3 ) {
    usage();
}
my ( $file1, $file2, $final ) = @ARGV;

# Both the *.out and the *.align must be present for each input prefix.
# The -l test additionally accepts a path that is a symbolic link.
foreach my $prefix ( $file1, $file2 ) {
    my $haveOut   = ( -e "$prefix.out" )   || ( -l "$prefix.out" );
    my $haveAlign = ( -e "$prefix.align" ) || ( -l "$prefix.align" );
    if ( !( $haveOut && $haveAlign ) ) {
        die "Could not find $prefix.out or $prefix.align check that *both* exist!\n";
    }
}

# Open file1.out
my $IN = FileHandle->new;
open( $IN, '<', "$file1.out" ) or die "Could not open $file1.out: $!\n";

# Read entire file into SearchResult objects contained in a
# SearchResultCollection
print "Reading $file1.out...\n";
my $outfileSearchResCol =
  CrossmatchSearchEngine::parseOutput( searchOutput => $IN );
close $IN;

# Open file2.out (reusing the $IN variable rather than redeclaring it)
$IN = FileHandle->new;
open( $IN, '<', "$file2.out" ) or die "Could not open $file2.out: $!\n";

# Read entire file into SearchResult objects contained in a
# SearchResultCollection
print "Reading $file2.out...\n";
my $tmpResCol = CrossmatchSearchEngine::parseOutput( searchOutput => $IN );
close $IN;

# Go through file2 out lines and transform numeric IDs by prefixing them
# with "t".
#   e.g "1" -> "t1", "2" -> "t2"...
for ( my $i = $tmpResCol->size() - 1 ; $i >= 0 ; $i-- ) {
    my $result = $tmpResCol->get($i);

    # Insert removal operations here if filtering

    # The "t" prefix keeps file2 IDs distinct from file1 IDs once both
    # sets share a single collection.
    my $ID = $result->getId();
    $result->setId( "t" . $ID );
}

# Now that we have two result collections each with a unique set of IDs we can
# safely combine them into one collection.
$outfileSearchResCol->addAll($tmpResCol);

# Comparator shared by the *.out and *.align sorts: order by query (chr)
# name, then start position, then longer end position first.
# The ($$) prototype is deliberate: the comparator reads its operands from
# @_, which is how Perl's sort passes them to a ($$)-prototyped sub
# (presumably SearchResultCollection::sort hands it to sort -- confirm).
my $byQueryPosition = sub ($$) {
    return (
             ( $_[ 0 ]->getQueryName() cmp $_[ 1 ]->getQueryName() )
          || ( $_[ 0 ]->getQueryStart() <=> $_[ 1 ]->getQueryStart() )
          || ( $_[ 1 ]->getQueryEnd() <=> $_[ 0 ]->getQueryEnd() )
    );
};

# Sort the combined result collection by chr, start, and longer end position
$outfileSearchResCol->sort($byQueryPosition);

# Renumber IDs in combined outfile and keep old/new ID translations for
# later align file fixing.
my %IDTransTable         = ();
my %file1AlignTransTable = ();
my %file2AlignTransTable = ();
my $IDIdx                = 0;
for ( my $i = 0 ; $i < $outfileSearchResCol->size() ; $i++ ) {
    my $result = $outfileSearchResCol->get($i);
    my $origID = $result->getId();

    # The first time an original ID is seen it is given the next new
    # index; later records carrying the same original ID reuse it.
    if ( !exists $IDTransTable{$origID} ) {
        $IDIdx++;
        $IDTransTable{$origID} = $IDIdx;
    }
    my $newID = $IDTransTable{$origID};

    # Save the new ID in the record
    $result->setId($newID);

    # Was this record from file1 or file2?  The pattern is anchored so that
    # only an ID consisting of "t" plus digits counts as file2; an ID that
    # merely contains such a substring is not misclassified.
    if ( $origID =~ /^t(\d+)\z/ ) {

        # "t" prefix indicates file2. Save translation for fixing file2 align
        $file2AlignTransTable{$1} = $newID;
    }
    else {

        # file1 record. Save translation for fixing file1 align
        $file1AlignTransTable{$origID} = $newID;
    }
}

print "Writing combined *.out...\n";

# write out combined *.out file with new IDs
open( my $outFH, '>', "$final.out" )
  or die "Could not open $final.out for writing: $!\n";
for ( my $i = 0 ; $i < $outfileSearchResCol->size() ; $i++ ) {
    my $result = $outfileSearchResCol->get($i);
    print {$outFH} $result->toStringFormatted( SearchResult::OutFileFormat );
}

# Buffered write errors only surface at close, so check it.
close($outFH) or die "Could not close $final.out: $!\n";
$outfileSearchResCol = undef;

##
### Now let's combine the *.align files
##

# Read one *.align file into a SearchResultCollection and rewrite each
# record's ID using the given translation table.
#
#   $alignFile  - path of the *.align file to read
#   $transTable - hash reference mapping an original ID to its new ID in
#                 the combined *.out
#
# Returns the collection.  Dies if the file cannot be opened.  A record
# whose ID has no entry in the table gets an undefined ID; the number of
# such records is reported once on STDERR.
sub _readAlignWithNewIDs {
    my ( $alignFile, $transTable ) = @_;

    my $fh = FileHandle->new;
    open( $fh, '<', $alignFile ) or die "Could not open $alignFile: $!\n";

    # Read entire file into SearchResult objects contained in a
    # SearchResultCollection
    my $resCol = CrossmatchSearchEngine::parseOutput( searchOutput => $fh );
    close $fh;

    my $untranslated = 0;
    for ( my $i = 0 ; $i < $resCol->size() ; $i++ ) {
        my $result = $resCol->get($i);

        # Insert removal operations here if filtering
        my $ID = $result->getId();

        # Translate the IDs to match the new combined *.out.  An undefined
        # ID is looked up under the empty-string key, which is the key Perl
        # would use for it anyway.
        my $newID = $transTable->{ defined $ID ? $ID : '' };
        $untranslated++ if !defined $newID;
        $result->setId($newID);
    }
    if ($untranslated) {
        warn "Warning: $untranslated record(s) in $alignFile have no "
          . "matching ID in the combined *.out\n";
    }
    return $resCol;
}

# Load file1.align and file2.align with their IDs translated.
my $alignfileSearchResCol =
  _readAlignWithNewIDs( "$file1.align", \%file1AlignTransTable );

# $tmpResCol is reused here rather than redeclared.
$tmpResCol = _readAlignWithNewIDs( "$file2.align", \%file2AlignTransTable );
$alignfileSearchResCol->addAll($tmpResCol);

# Sort the result collection
$alignfileSearchResCol->sort($byQueryPosition);

# Finally save the results to the combined *.align
open( my $alignFH, '>', "$final.align" )
  or die "Could not open $final.align for writing: $!\n";
for ( my $i = 0 ; $i < $alignfileSearchResCol->size() ; $i++ ) {
    my $result = $alignfileSearchResCol->get($i);
    print {$alignFH}
      $result->toStringFormatted( SearchResult::N_AlignWithQuerySeq ) . "\n";
}
close($alignFH) or die "Could not close $final.align: $!\n";
$alignfileSearchResCol = undef;
$tmpResCol             = undef;

1;