
    XfIA                         d Z ddlZddlZddlmZ ddlmZ  ej        de            G d d          Z G d	 d
          Z	 G d de	          Z
 G d de	          ZdS )a  Provide trainers which estimate parameters based on training sequences.

These should be used to 'train' a Markov Model prior to actually using
it to decode state paths. When supplied training sequences and a model
to work from, these classes will estimate parameters of the model.

This aims to estimate two parameters:

- a_{kl} -- the number of times there is a transition from k to l in the
  training data.
- e_{k}(b) -- the number of emissions of the state b from the letter k
  in the training data.

    N)BiopythonDeprecationWarning   )ScaledDPAlgorithmszThe 'Bio.HMM.Trainer' module is deprecated and will be removed in a future release of Biopython. Consider using the hmmlearn package instead.c                       e Zd ZdZd ZdS )TrainingSequencezEHold a training sequence with emissions and optionally, a state path.c                     t          |          dk    r/t          |          t          |          k    rt          d          || _        || _        dS )a  Initialize a training sequence.

        Arguments:
         - emissions - An iterable (e.g., a tuple, list, or Seq object)
           containing the sequence of emissions in the training sequence.
         - state_path - An iterable (e.g., a tuple or list) containing the
           sequence of states. If there is no known state path, then the
           sequence of states should be an empty iterable.

        r   z/State path does not match associated emissions.N)len
ValueError	emissionsstates)selfr   
state_paths      /lib/python3.11/site-packages/Bio/HMM/Trainer.py__init__zTrainingSequence.__init__)   sL     z??Q3y>>S__#D#DNOOO"     N)__name__
__module____qualname____doc__r    r   r   r   r   &   s)        OO! ! ! ! !r   r   c                   *    e Zd ZdZd Zd Zd Zd ZdS )AbstractTrainerz5Provide generic functionality needed in all trainers.c                     || _         dS zInitialize the class.N)_markov_modelr   markov_models     r   r   zAbstractTrainer.__init__=   s    )r   c                 B    d}|D ]}|t          j        |          z  }|S )zCalculate the log likelihood of the training seqs.

        Arguments:
         - probabilities -- A list of the probabilities of each training
           sequence under the current parameters, calculated using the
           forward algorithm.

        r   )mathlog)r   probabilitiestotal_likelihoodprobabilitys       r   log_likelihoodzAbstractTrainer.log_likelihoodA   s7     ( 	6 	6K 5 55r   c                 ^    |                      |          }|                      |          }||fS )a  Get a maximum likelihood estimation of transition and emission.

        Arguments:
         - transition_counts -- A dictionary with the total number of counts
           of transitions between two states.
         - emissions_counts -- A dictionary with the total number of counts
           of emissions of a particular emission letter by a state letter.

        This then returns the maximum likelihood estimators for the
        transitions and emissions, estimated by formulas 3.18 in
        Durbin et al::

            a_{kl} = A_{kl} / sum(A_{kl'})
            e_{k}(b) = E_{k}(b) / sum(E_{k}(b'))

        Returns:
        Transition and emission dictionaries containing the maximum
        likelihood estimators.

        )ml_estimator)r   transition_countsemission_countsml_transitionsml_emissionss        r   estimate_paramszAbstractTrainer.estimate_paramsP   s6    , **+<==((99|++r   c                    t          |          }i }d}d}|D ]}|d         |k    r|d         }||         }|                    |          dz   }|t          |          k     rY||         d         |d         k    rA||||                  z  }|dz  }|t          |          k     r||         d         |d         k    An	 ||         |z  }|||<   |S )a@  Calculate the maximum likelihood estimator.

        This can calculate maximum likelihoods for both transitions
        and emissions.

        Arguments:
         - counts -- A dictionary of the counts for each item.

        See estimate_params for a description of the formula used for
        calculation.

        Nr   r   )sortedindexr	   )	r   countsall_orderedml_estimation
cur_lettercur_letter_countscur_itemcur_positioncur_mls	            r   r&   zAbstractTrainer.ml_estimatork   s    Vnn 
# 	- 	-H{j((%a[
 %+8$4!  +00::Q>
 !3{#3#333#L1!4CC%L0I)JJ% A%L	 !3{#3#333#L1!4CC  H%(99F&,M(##r   N)r   r   r   r   r   r$   r+   r&   r   r   r   r   r   :   sV        ??* * *     , , ,62 2 2 2 2r   r   c                   .    e Zd ZdZd ZefdZd Zd ZdS )BaumWelchTrainera`  Trainer that uses the Baum-Welch algorithm to estimate parameters.

    These should be used when a training sequence for an HMM has unknown
    paths for the actual states, and you need to make an estimation of the
    model parameters from the observed emissions.

    This uses the Baum-Welch algorithm, first described in
    Baum, L.E. 1972. Inequalities. 3:1-8
    This is based on the description in 'Biological Sequence Analysis' by
    Durbin et al. in section 3.3

    This algorithm is guaranteed to converge to a local maximum, but not
    necessarily to the global maxima, so use with care!
    c                 <    t                               | |           dS )zInitialize the trainer.

        Arguments:
         - markov_model - The model we are going to estimate parameters for.
           This should have the parameters with some initial estimates, that
           we can build from.

        Nr   r   r   s     r   r   zBaumWelchTrainer.__init__   s      	  |44444r   c                    d}d}	 | j                                         }| j                                         }g }|D ]}	 || j         |	          }
|
                                \  }}|
                                }|                    |           |                     ||	|||          }|                     ||	|||          }|                     ||          \  }}|| j         _	        || j         _
        |                     |          }|9t          t          |          t          |          z
            } |||          rn	|}|dz  }F| j         S )a  Estimate the parameters using training sequences.

        The algorithm for this is taken from Durbin et al. p64, so this
        is a good place to go for a reference on what is going on.

        Arguments:
         - training_seqs -- A list of TrainingSequence objects to be used
           for estimating the parameters.
         - stopping_criteria -- A function, that when passed the change
           in log likelihood and threshold, will indicate if we should stop
           the estimation iterations.
         - dp_method -- A class instance specifying the dynamic programming
           implementation we should use to calculate the forward and
           backward variables. By default, we use the scaling method.

        Nr   )r   get_blank_transitionsget_blank_emissionsforward_algorithmbackward_algorithmappendupdate_transitionsupdate_emissionsr+   transition_probemission_probr$   abs)r   training_seqsstopping_criteria	dp_methodprev_log_likelihoodnum_iterationstransition_countemission_countall_probabilitiestraining_seqDPforward_varseq_probbackward_varr)   r*   cur_log_likelihoodlog_likelihood_changes                     r   trainzBaumWelchTrainer.train   s   " #1	 #1GGII!/CCEEN !# -  Yt1<@@(*(<(<(>(>%X!4466!((222 $(#:#:$lKx$ $  "&!6!6"L+|X" "
 ,0+?+? ., ,(NL 2@D./;D,!%!4!45F!G!G #. ),*++c2E.F.FF) )% %$%:NKK  #5aNc1	 f !!r   c                    | j         j        }| j         j        }| j         j        D ]}| j                             |          D ]}	d}
t          t          |j                  dz
            D ]I}|||f         }||	|dz   f         }|||	f         }||	|j        |dz            f         }|
||z  |z  |z  z  }
J|||	fxx         |
|z  z  cc<   |S )a  Add the contribution of a new training sequence to the transitions.

        Arguments:
         - transition_counts -- A dictionary of the current counts for the
           transitions
         - training_seq -- The training sequence we are working with
         - forward_vars -- Probabilities calculated using the forward
           algorithm.
         - backward_vars -- Probabilities calculated using the backwards
           algorithm.
         - training_seq_prob - The probability of the current sequence.

        This calculates A_{kl} (the estimated transition counts from state
        k to state l) using formula 3.20 in Durbin et al.

        r   r   )r   rC   rD   state_alphabettransitions_fromranger	   r   )r   r'   rN   forward_varsbackward_varstraining_seq_probtransitionsr   klestimated_countsiforward_valuebackward_valuetrans_value	emm_values                   r   rA   z#BaumWelchTrainer.update_transitions  s+   2 (8&4	 #2 	R 	RA'88;; R R#$ s<#9::Q>??  A$0!Q$8M &3Aq1u:%>N #.q!f"5K !*1l.DQU.K*L MI$%3i?.P$$
 "1a&)))-=@Q-QQ))))+R. ! r   c                 
   | j         j        D ]u}| j         j        D ]f}d}t          t	          |j                            D ]+}	|j        |	         |k    r||||	f         |||	f         z  z  },|||fxx         ||z  z  cc<   gv|S )a  Add the contribution of a new training sequence to the emissions.

        Arguments:
         - emission_counts -- A dictionary of the current counts for the
           emissions
         - training_seq -- The training sequence we are working with
         - forward_vars -- Probabilities calculated using the forward
           algorithm.
         - backward_vars -- Probabilities calculated using the backwards
           algorithm.
         - training_seq_prob - The probability of the current sequence.

        This calculates E_{k}(b) (the estimated emission probability for
        emission letter b from state k) using formula 3.21 in Durbin et al.

        r   )r   rW   emission_alphabetrY   r	   r   )
r   r(   rN   rZ   r[   r\   r^   bexpected_timesra   s
             r   rB   z!BaumWelchTrainer.update_emissions;  s    2 #2 	N 	NA'9 N N!"s<#9::;; W WA $-a0A55&,1v*>PQSTvAV*VV  A'''><M+MM''''N r   N)	r   r   r   r   r   r   rU   rA   rB   r   r   r   r8   r8      sl         	5 	5 	5 AS G" G" G" G"R5! 5! 5!n( ( ( ( (r   r8   c                   *    e Zd ZdZd Zd Zd Zd ZdS )KnownStateTrainerzEstimate probabilities with known state sequences.

    This should be used for direct estimation of emission and transition
    probabilities when both the state path and emission sequence are
    known for the training examples.
    c                 <    t                               | |           dS r   r:   r   s     r   r   zKnownStateTrainer.__init__n  s      |44444r   c                 B   | j                                         }| j                                         }|D ]3}|                     ||          }|                     |j        |          }4|                     ||          \  }}|| j         _        || j         _        | j         S )ay  Estimate the Markov Model parameters with known state paths.

        This trainer requires that both the state and the emissions are
        known for all of the training sequences in the list of
        TrainingSequence objects.
        This training will then count all of the transitions and emissions,
        and use this to estimate the parameters of the model.
        )	r   r<   r=   _count_emissions_count_transitionsr   r+   rC   rD   )r   rF   r'   r(   rN   r)   r*   s          r   rU   zKnownStateTrainer.trainr  s     !.DDFF,@@BB) 	 	L"33L/RRO $ 7 7#%6! !
 (,';';(
 (
$ .<*+7(!!r   c           	          t          t          |j                            D ]S}|j        |         }|j        |         }	 |||fxx         dz  cc<   0# t          $ r t	          d| d| d          w xY w|S )a  Add emissions from the training sequence to the current counts (PRIVATE).

        Arguments:
         - training_seq -- A TrainingSequence with states and emissions
           to get the counts from
         - emission_counts -- The current emission counts to add to.

        r   zUnexpected emission (, ))rY   r	   r   r   KeyError)r   rN   r(   r.   	cur_statecur_emissions         r   rn   z"KnownStateTrainer._count_emissions  s     3|56677 	U 	UE$+E2I'1%8LUL 9:::a?:::: U U USySSLSSSTTTUs   A!A3c           	          t          t          |          dz
            D ]L}||         }||dz            }	 |||fxx         dz  cc<   )# t          $ r t          d| d| d          w xY w|S )a  Add transitions from the training sequence to the current counts (PRIVATE).

        Arguments:
         - state_seq -- A Seq object with the states of the current training
           sequence.
         - transition_counts -- The current transition counts to add to.

        r   zUnexpected transition (rq   rr   )rY   r	   rs   )r   	state_seqr'   cur_posrt   
next_states         r   ro   z$KnownStateTrainer._count_transitions  s     S^^a/00 	U 	UG!'*I"7Q;/JU!9j"9:::a?:::: U U USSSjSSSTTTU ! s   A		!A*N)r   r   r   r   r   rU   rn   ro   r   r   r   rk   rk   f  sZ         5 5 5" " "8  &! ! ! ! !r   rk   )r   r   warningsBior   DynamicProgrammingr   warnr   r   r8   rk   r   r   r   <module>r~      s1      + + + + + + 2 2 2 2 2 2   	  ! ! ! ! ! ! ! !(c c c c c c c cLC C C C C C C CLM! M! M! M! M! M! M! M! M! M!r   