# Demotic parser for H2 hmmsearch output
#
# Does not parse alignment section yet. But suitable for profmark use.
# H2's hmmsearch can only search a single query model, not multiples.
#
# SRE, Fri Apr 16 11:07:34 2010

package demotic_h2;

sub parse (*) {
    my $fh = shift;
    my $parsing_header  = 1;
    my $parsing_seqs    = 0;
    my $parsing_domains = 0;
    my $parsing_alis    = 0;

    # Initialize everything... so we can call the parser
    # repeatedly, one call per hmmsearch output.
    #
    # This section also documents what's available in
    # variables within the package.
    # 
    # Arrays are indexed [0..nhits-1] or [0..nali-1]
    #
    $queryname      = "";	# Name of the query sequence
    $querydesc      = "";	# Description line for the query (or "")
    $querylen       = 0;	# Length of the query in residues
    $db             = "";	# Name of the target database file
    $db_nseq        = 0;	# Number of sequences in the target database
    $db_nletters    = "";	# Number of residues in the target database
                                # (stored as a string so we can have large #'s)

				# The top hit list (still in rank order)
    $nhits          = 0;	# Number of entries in the hit list
    @hit_target     = ();	# Target sequence name (by rank)
    %target_desc    = ();	# Target sequence description (by targ name)
    @hit_bitscore   = ();	# Raw score (by rank)
    @hit_Eval       = ();	# P-val or E-val (by rank)

				# The alignment output (in order it came in)
				# all indexed by # of alignment [0..nali-1]
    $nali           = 0;	# Number of alignments
    @ali_target     = ();	# Target sequence name
    @ali_bitscore   = ();	# bit score
    @ali_evalue     = ();	# E-value
    @ali_qstart     = ();	# Start position on query
    @ali_qend       = ();	# End position on query
    @ali_tstart     = ();	# Start position on target
    @ali_tend       = ();	# End position on target
    @ali_qali       = ();       # Aligned string from query
    @ali_tali       = ();       # Aligned string from target (subject)

    # Now, loop over the lines of our input, and parse 'em.
    #
    while (<$fh>) {

	if ($parsing_header) {
	    if (/^Scores for complete sequences /) { # seq section starts with this line
		$parsing_header  = 0;
		$parsing_seqs    = 1;
		<$fh>;		# This discards the next line:   Sequence              Description                       Score    E-value  N
		<$fh>;		# and discard another line:      --------              -----------                       -----    ------- ---
		next;
	    } 
	    elsif (/Query HMM:\s*(\S+)/)    { $queryname = $1; }
	    elsif (/Description:\s*(\.*)$/) { $querydesc = $1; }
	}

	elsif ($parsing_seqs) {
	    if (/^\s*$/) { 	# seq section ends with a blank line
		$parsing_seqs    = 0;
		$parsing_domains = 1;
		<$fh>;   # Discard:   Parsed for domains:
		<$fh>;   # Discard:   Sequence              Domain  seq-f seq-t    hmm-f hmm-t      score  E-value
		<$fh>;   # Discard:   --------              ------- ----- -----    ----- -----      -----  -------
		next;
	    } 
	    elsif (/^(\S+)\s+(.+)\s+(-?\d+\.\d*)\s+(\S+)\s+\d+\s*$/) {
		#        Target   Description  Score Evalue   N
                #           
		$hit_target[$nhits]    = $1;
		$target_desc{$1}       = $2;
		$hit_bitscore[$nhits]  = $3;
		$hit_Eval[$nhits]      = $4;
		$nhits++;
	    }
	}

	elsif ($parsing_domains) {
	    if (/^\s*$/) { 	# domain section ends with a blank line
		$parsing_domains    = 0;
		$parsing_alis       = 1;
		<$fh>;  # Discard: Alignments of top-scoring domains:
	    }

	    if (/^(\S+)\s+\d+\/\d+\s+(\d+)\s+(\d+)\s+\S\S\s+(\d+)\s+(\d+)\s+\S\S\s+(-?\d+\.\d*)\s+(\S+)\s*$/)
               #  Sequence  Domain   seq-f   seq-t           hmm-f  hmm-t            score      E-value
               #     1                2        3              4       5                6           7
	    {
		$ali_target[$nali]   = $1;
		$ali_bitscore[$nali] = $6;
		$ali_evalue[$nali]   = $7;
		$ali_qstart[$nali]   = $4;
		$ali_qend[$nali]     = $5;
		$ali_tstart[$nali]   = $2;
		$ali_tend[$nali]     = $3;
		$nali++;
	    }
	}
    }

    if ($parsing_alis) { return 1; } else { return 0;  }
}

sub exblxout {
    my $ofh     = shift;
    my $i;
    
    for ($i = 0; $i <= $nali-1; $i++) {
	printf $ofh "%s\t%d\t%d\t%d\t%s\t%d\t%d\t%s\n",
	$ali_evalue[$i],
	$ali_identity[$i],
	$ali_tstart[$i],
	$ali_tend[$i],
	$ali_target[$i],
	$ali_qstart[$i],
	$ali_qend[$i],
	$queryname;
    }
    
}


sub tblout {
    my $ofh     = shift;
    my $i;
    
    for ($i = 0; $i <= $nali-1; $i++) {
	printf $ofh "%s\t%d\t%d\t%d\t%s\t%d\t%d\t%s\t%s\n",
	$ali_evalue[$i],
	$ali_identity[$i],
	$ali_qstart[$i],
	$ali_qend[$i],
	$queryname,
	$ali_tstart[$i],
	$ali_tend[$i],
	$ali_target[$i],
	$target_desc{$ali_target[$i]};
    }
}    


sub gffout {
    my $ofh     = shift;
    my $source  = shift;
    my $feature = shift;
    my $i;
    my $strand;
    
    for ($i = 0; $i <= $nali-1; $i++) {
	if ($ali_qstart[$i] > $ali_qend[$i]) { $strand = "-"; }
	else { $strand = "+"; } 

	printf $ofh "%s\t%s\t%s\t%d\t%d\t%.1f\t%s\t.\tgene \"%s\"\n",
	$ali_target[$i],
	$source,
	$feature,
	$ali_tstart[$i],
	$ali_tend[$i],
	$ali_bitscore[$i],
	$strand,
	$queryname;
    }
}


sub profmarkout {
    my $ofh = shift;
    my $i;

    for ($i = 0; $i < $nhits; $i++) {
	printf $ofh "%g\t%.1f\t%s\t%s\n", $hit_Eval[$i], $hit_bitscore[$i], $hit_target[$i], $queryname;
    }
}
1;
__END__

  |more

