
"""Optimized `Latent Dirichlet Allocation (LDA) <https://en.wikipedia.org/wiki/Latent_Dirichlet_allocation>`_ in Python.

For a faster implementation of LDA (parallelized for multicore machines), see also :mod:`gensim.models.ldamulticore`.

This module allows both LDA model estimation from a training corpus and inference of topic
distribution on new, unseen documents. The model can also be updated with new documents
for online training.

The core estimation code is based on the `onlineldavb.py script
<https://github.com/blei-lab/onlineldavb/blob/master/onlineldavb.py>`_, by
Matthew D. Hoffman, David M. Blei, Francis Bach:
`'Online Learning for Latent Dirichlet Allocation', NIPS 2010`_.

.. _'Online Learning for Latent Dirichlet Allocation', NIPS 2010: online-lda_
.. _'Online Learning for LDA' by Hoffman et al.: online-lda_
.. _online-lda: https://papers.neurips.cc/paper/2010/file/71f6278d140af599e06ad9bf1ba03cb0-Paper.pdf

The algorithm:

#. Is **streamed**: training documents may come in sequentially, no random access required.
#. Runs in **constant memory** w.r.t. the number of documents: size of the training corpus does not affect memory
   footprint, can process corpora larger than RAM.
#. Is **distributed**: makes use of a cluster of machines, if available, to speed up model estimation.


Usage examples
--------------

Train an LDA model using a Gensim corpus

.. sourcecode:: pycon

    >>> from gensim.test.utils import common_texts
    >>> from gensim.corpora.dictionary import Dictionary
    >>>
    >>> # Create a corpus from a list of texts
    >>> common_dictionary = Dictionary(common_texts)
    >>> common_corpus = [common_dictionary.doc2bow(text) for text in common_texts]
    >>>
    >>> # Train the model on the corpus.
    >>> lda = LdaModel(common_corpus, num_topics=10)

Save a model to disk, or reload a pre-trained model

.. sourcecode:: pycon

    >>> from gensim.test.utils import datapath
    >>>
    >>> # Save model to disk.
    >>> temp_file = datapath("model")
    >>> lda.save(temp_file)
    >>>
    >>> # Load a potentially pretrained model from disk.
    >>> lda = LdaModel.load(temp_file)

Query the model using new, unseen documents

.. sourcecode:: pycon

    >>> # Create a new corpus, made of previously unseen documents.
    >>> other_texts = [
    ...     ['computer', 'time', 'graph'],
    ...     ['survey', 'response', 'eps'],
    ...     ['human', 'system', 'computer']
    ... ]
    >>> other_corpus = [common_dictionary.doc2bow(text) for text in other_texts]
    >>>
    >>> unseen_doc = other_corpus[0]
    >>> vector = lda[unseen_doc]  # get topic probability distribution for a document

Update the model by incrementally training on the new corpus

.. sourcecode:: pycon

    >>> lda.update(other_corpus)
    >>> vector = lda[unseen_doc]

Many parameters can be tuned to optimize training for your specific use case

.. sourcecode:: pycon

    >>> lda = LdaModel(common_corpus, num_topics=50, alpha='auto', eval_every=5)  # learn asymmetric alpha from data
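
Inspect the learned topics; the exact words per topic depend on the corpus and the random seed,
so the output is illustrative only

.. sourcecode:: pycon

    >>> topics = lda.show_topics(num_topics=3, num_words=5, formatted=True)
    >>> for topic_id, topic in topics:
    ...     print(topic_id, topic)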

    N)defaultdict)gammalnpsi)	polygamma)
interfacesutilsmatutils)kullback_leibler	hellingerjaccard_distancejensen_shannondirichlet_expectation	logsumexpmean_absolute_difference)	basemodelCoherenceModel)Callbackc                    |t          t          j        |                     t          |           z
  |z   z  }|t          dt          j        |                     z  }| t          d|           z  }t          j        ||z            d|z  t          j        d|z            z   z  }||z
   |z  }||z  | z   }	t	          |	dk              r|	} nt
                              d           | S )a  Update a given prior using Newton's method, described in
    `J. Huang: "Maximum Likelihood Estimation of Dirichlet Distribution Parameters"
    <http://jonathan-huang.org/research/dirichlet/dirichlet.pdf>`_.

    Parameters
    ----------
    prior : list of float
        The prior for each possible outcome at the previous iteration (to be updated).
    N : int
        Number of observations.
    logphat : list of float
        Log probabilities for the current estimation, also called "observed sufficient statistics".
    rho : float
        Learning rate.

    Returns
    -------
    list of float
        The updated prior.
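
    Example
    -------
    A minimal sketch of a single update step; the numbers are illustrative only:

    .. sourcecode:: pycon

        >>> import numpy as np
        >>> prior = np.full(4, 0.25)  # symmetric Dirichlet prior over 4 outcomes
        >>> logphat = np.log([0.4, 0.3, 0.2, 0.1])  # observed sufficient statistics
        >>> new_prior = update_dir_prior(prior, N=100, logphat=logphat, rho=0.05)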

    """
    # gradient of the Dirichlet log likelihood w.r.t. the prior (see Huang)
    gradf = N * (psi(np.sum(prior)) - psi(prior) + logphat)

    # Newton step, exploiting the diagonal-plus-rank-one structure of the Hessian
    c = N * polygamma(1, np.sum(prior))
    q = -N * polygamma(1, prior)

    b = np.sum(gradf / q) / (1 / c + np.sum(1 / q))

    dprior = -(gradf - b) / q

    # only accept the step if it keeps the prior strictly positive
    updated_prior = rho * dprior + prior
    if all(updated_prior > 0):
        prior = updated_prior
    else:
        logger.warning("updated prior is not positive")

    return prior


class LdaState(utils.SaveLoad):
    """Encapsulate information for distributed computation of :class:`~gensim.models.ldamodel.LdaModel` objects.

    Objects of this class are sent over the network, so try to keep them lean to
    reduce traffic.
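
    A state can also be constructed directly, e.g. for testing; here with a symmetric
    `eta` prior over a hypothetical 500-term vocabulary and 10 topics (shapes are illustrative):

    .. sourcecode:: pycon

        >>> import numpy as np
        >>> state = LdaState(eta=np.full(500, 0.1), shape=(10, 500))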

    """

    def __init__(self, eta, shape, dtype=np.float32):
        """

        Parameters
        ----------
        eta : numpy.ndarray
            The prior probabilities assigned to each term.
        shape : tuple of (int, int)
            Shape of the sufficient statistics: (number of topics to be found, number of terms in the vocabulary).
        dtype : type
            Overrides the numpy array default types.

        Fcopydtyper   N)astypeetar   zerossstatsnumdocsr.   )selfr0   shaper.   s       r%   __init__zLdaState.__init__   sC     ::e%:00huE222


r'   c                 ,    d| j         dd<   d| _        dS )zBPrepare the state for a new EM iteration (reset sufficient stats).        Nr   r2   r3   r4   s    r%   resetzLdaState.reset   s    AAAr'   c                 b    |J | xj         |j         z  c_         | xj        |j        z  c_        dS )a  Merge the result of an E step from one node with that of another node (summing up sufficient statistics).

        The merging is trivial and after merging all cluster nodes, we have the
        exact same result as if the computation was run on a single node (no
        approximation).

        Parameters
        ----------
        other : :class:`~gensim.models.ldamodel.LdaState`
            The state object with which the current one will be merged.

        """
        assert other is not None
        self.sstats += other.sstats
        self.numdocs += other.numdocs

    def blend(self, rhot, other, targetsize=None):
        """Merge the current state with another one using a weighted average for the sufficient statistics.

        The number of documents is stretched in both state objects, so that they are of comparable magnitude.
        This procedure corresponds to the stochastic gradient update from
        `'Online Learning for LDA' by Hoffman et al.`_, see equations (5) and (9).

        Parameters
        ----------
        rhot : float
            Weight of the `other` state in the computed average. A value of 0.0 means that `other`
            is completely ignored. A value of 1.0 means `self` is completely ignored.
        other : :class:`~gensim.models.ldamodel.LdaState`
            The state object with which the current one will be merged.
        targetsize : int, optional
            The number of documents to stretch both states to.

        Nr         ?z>merging changes from %i documents into a model of %i documents)r3   r2   r   info)r4   rhotr=   
targetsizescales        r%   blendzLdaState.blend   s    $     	&J <1 	4
dl : 	4EE*$t|3Ed
e++ =A 	5u}!< 	5EEKKXZ_Zgisttt*$u}4Ete|el22!r'   c                 X    |J || j         }| xj        |j        z  c_        || _         dS )a)  Merge the current state with another one using a weighted sum for the sufficient statistics.

        In contrast to :meth:`~gensim.models.ldamodel.LdaState.blend`, the sufficient statistics are not scaled
        prior to aggregation.

        Parameters
        ----------
        rhot : float
            Unused.
        other : :class:`~gensim.models.ldamodel.LdaState`
            The state object with which the current one will be merged.
        targetsize : int, optional
            The number of documents to stretch both states to.

        N)r3   r2   )r4   rB   r=   rC   s       r%   blend2zLdaState.blend2   s>          	&J 	u|#!r'   c                      | j         | j        z   S )zGet the parameters of the posterior over the topics, also referred to as "the topics".

        Returns
        -------
        numpy.ndarray
            Parameters of the posterior probability over topics.

        """
        return self.eta + self.sstats

    def get_Elogbeta(self):
        """Get the log (posterior) probabilities for each topic.

        Returns
        -------
        numpy.ndarray
            Posterior probabilities for each topic.
        """
        return dirichlet_expectation(self.get_lambda())

    @classmethod
    def load(cls, fname, *args, **kwargs):
        """Load a previously stored state from disk.

        Overrides :class:`~gensim.utils.SaveLoad.load` by enforcing the `dtype` parameter
        to ensure backwards compatibility.

        Parameters
        ----------
        fname : str
            Path to file that contains the needed object.
        args : object
            Positional parameters to be propagated to class:`~gensim.utils.SaveLoad.load`
        kwargs : object
            Key-word parameters to be propagated to class:`~gensim.utils.SaveLoad.load`

        Returns
        -------
        :class:`~gensim.models.ldamodel.LdaState`
            The state loaded from the given file.
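
        Examples
        --------
        A minimal sketch; the file name is hypothetical:

        .. sourcecode:: pycon

            >>> state = LdaState.load("lda_state.bin")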

        """
        result = super(LdaState, cls).load(fname, *args, **kwargs)

        # dtype could be absent in old gensim state files
        if not hasattr(result, 'dtype'):
            result.dtype = np.float64  # float64 was implicitly used before (it is the numpy default)
            logging.info("dtype was not set in saved %s file %s, assuming np.float64", result.__class__.__name__, fname)

        return result


class LdaModel(interfaces.TransformationABC, basemodel.BaseTopicModel):
    """Train and use Online Latent Dirichlet Allocation model as presented in
    `'Online Learning for LDA' by Hoffman et al.`_

    Examples
    --------
    Initialize a model using a Gensim corpus

    .. sourcecode:: pycon

        >>> from gensim.test.utils import common_corpus
        >>>
        >>> lda = LdaModel(common_corpus, num_topics=10)

    You can then infer topic distributions on new, unseen documents.

    .. sourcecode:: pycon

        >>> doc_bow = [(1, 0.3), (2, 0.1), (0, 0.09)]
        >>> doc_lda = lda[doc_bow]

    The model can be updated (trained) with new documents.

    .. sourcecode:: pycon

        >>> # In practice (corpus =/= initial training corpus), but we use the same here for simplicity.
        >>> other_corpus = common_corpus
        >>>
        >>> lda.update(other_corpus)

    Model persistency is achieved through :meth:`~gensim.models.ldamodel.LdaModel.load` and
    :meth:`~gensim.models.ldamodel.LdaModel.save` methods.
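
    For example (the file name is illustrative):

    .. sourcecode:: pycon

        >>> from gensim.test.utils import datapath
        >>>
        >>> lda.save(datapath("lda_model"))
        >>> lda = LdaModel.load(datapath("lda_model"))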

    def __init__(self, corpus=None, num_topics=100, id2word=None,
                 distributed=False, chunksize=2000, passes=1, update_every=1,
                 alpha='symmetric', eta=None, decay=0.5, offset=1.0, eval_every=10,
                 iterations=50, gamma_threshold=0.001, minimum_probability=0.01,
                 random_state=None, ns_conf=None, minimum_phi_value=0.01,
                 per_word_topics=False, callbacks=None, dtype=np.float32):
        """

        Parameters
        ----------
        corpus : iterable of list of (int, float), optional
            Stream of document vectors or sparse matrix of shape (`num_documents`, `num_terms`).
            If you have a CSC in-memory matrix, you can convert it to a
            streamed corpus with the help of gensim.matutils.Sparse2Corpus.
            If not given, the model is left untrained (presumably because you want to call
            :meth:`~gensim.models.ldamodel.LdaModel.update` manually).
        num_topics : int, optional
            The number of requested latent topics to be extracted from the training corpus.
        id2word : {dict of (int, str), :class:`gensim.corpora.dictionary.Dictionary`}
            Mapping from word IDs to words. It is used to determine the vocabulary size, as well as for
            debugging and topic printing.
        distributed : bool, optional
            Whether distributed computing should be used to accelerate training.
        chunksize :  int, optional
            Number of documents to be used in each training chunk.
        passes : int, optional
            Number of passes through the corpus during training.
        update_every : int, optional
            Number of documents to be iterated through for each update.
            Set to 0 for batch learning, > 1 for online iterative learning.
        alpha : {float, numpy.ndarray of float, list of float, str}, optional
            A-priori belief on document-topic distribution, this can be:
                * scalar for a symmetric prior over document-topic distribution,
                * 1D array of length equal to num_topics to denote an asymmetric user defined prior for each topic.

            Alternatively default prior selecting strategies can be employed by supplying a string:
                * 'symmetric': (default) Uses a fixed symmetric prior of `1.0 / num_topics`,
                * 'asymmetric': Uses a fixed normalized asymmetric prior of `1.0 / (topic_index + sqrt(num_topics))`,
                * 'auto': Learns an asymmetric prior from the corpus (not available if `distributed==True`).
        eta : {float, numpy.ndarray of float, list of float, str}, optional
            A-priori belief on topic-word distribution, this can be:
                * scalar for a symmetric prior over topic-word distribution,
                * 1D array of length equal to num_words to denote an asymmetric user defined prior for each word,
                * matrix of shape (num_topics, num_words) to assign a probability for each word-topic combination.

            Alternatively default prior selecting strategies can be employed by supplying a string:
                * 'symmetric': (default) Uses a fixed symmetric prior of `1.0 / num_topics`,
                * 'auto': Learns an asymmetric prior from the corpus.
        decay : float, optional
            A number between (0.5, 1] to weight what percentage of the previous lambda value is forgotten
            when each new document is examined.
            Corresponds to :math:`\kappa` from `'Online Learning for LDA' by Hoffman et al.`_
        offset : float, optional
            Hyper-parameter that controls how much we will slow down the first steps the first few iterations.
            Corresponds to :math:`\tau_0` from `'Online Learning for LDA' by Hoffman et al.`_
        eval_every : int, optional
            Log perplexity is estimated every that many updates. Setting this to one slows down training by ~2x.
        iterations : int, optional
            Maximum number of iterations through the corpus when inferring the topic distribution of a corpus.
        gamma_threshold : float, optional
            Minimum change in the value of the gamma parameters to continue iterating.
        minimum_probability : float, optional
            Topics with a probability lower than this threshold will be filtered out.
        random_state : {np.random.RandomState, int}, optional
            Either a randomState object or a seed to generate one. Useful for reproducibility.
        ns_conf : dict of (str, object), optional
            Key word parameters propagated to :func:`gensim.utils.getNS` to get a Pyro4 nameserver.
            Only used if `distributed` is set to True.
        minimum_phi_value : float, optional
            if `per_word_topics` is True, this represents a lower bound on the term probabilities.
        per_word_topics : bool
            If True, the model also computes a list of topics, sorted in descending order of most likely topics for
            each word, along with their phi values multiplied by the feature length (i.e. word count).
        callbacks : list of :class:`~gensim.models.callbacks.Callback`
            Metric callbacks to log and visualize evaluation metrics of the model during training.
        dtype : {numpy.float16, numpy.float32, numpy.float64}, optional
            Data-type to use during calculations inside model. All inputs are also converted.

        """
        self.dtype = np.finfo(dtype).dtype

        # store user-supplied parameters
        self.id2word = id2word
        if corpus is None and self.id2word is None:
            raise ValueError(
                'at least one of corpus/id2word must be specified, to establish input space dimensionality'
            )

        if self.id2word is None:
            logger.warning("no word id mapping provided; initializing from corpus, assuming identity")
            self.id2word = utils.dict_from_corpus(corpus)
            self.num_terms = len(self.id2word)
        elif len(self.id2word) > 0:
            self.num_terms = 1 + max(self.id2word.keys())
        else:
            self.num_terms = 0

        if self.num_terms == 0:
            raise ValueError("cannot compute LDA over an empty collection (no terms)")

        self.distributed = bool(distributed)
        self.num_topics = int(num_topics)
        self.chunksize = chunksize
        self.decay = decay
        self.offset = offset
        self.minimum_probability = minimum_probability
        self.num_updates = 0

        self.passes = passes
        self.update_every = update_every
        self.eval_every = eval_every
        self.minimum_phi_value = minimum_phi_value
        self.per_word_topics = per_word_topics
        self.callbacks = callbacks

        self.alpha, self.optimize_alpha = self.init_dir_prior(alpha, 'alpha')
        assert self.alpha.shape == (self.num_topics,), \
            "Invalid alpha shape. Got shape %s, but expected (%d, )" % (str(self.alpha.shape), self.num_topics)

        self.eta, self.optimize_eta = self.init_dir_prior(eta, 'eta')
        assert self.eta.shape == (self.num_terms,) or self.eta.shape == (self.num_topics, self.num_terms), (
            "Invalid eta shape. Got shape %s, but expected (%d, 1) or (%d, %d)" %
            (str(self.eta.shape), self.num_terms, self.num_topics, self.num_terms)
        )

        self.random_state = utils.get_random_state(random_state)

        # VB constants
        self.iterations = iterations
        self.gamma_threshold = gamma_threshold

        # set up distributed environment if necessary
        if not distributed:
            logger.info("using serial LDA version on this node")
            self.dispatcher = None
            self.numworkers = 1
        else:
            if self.optimize_alpha:
                raise NotImplementedError("auto-optimizing alpha not implemented in distributed LDA")
            # set up distributed version
            try:
                import Pyro4
                if ns_conf is None:
                    ns_conf = {}

                with utils.getNS(**ns_conf) as ns:
                    from gensim.models.lda_dispatcher import LDA_DISPATCHER_PREFIX
                    self.dispatcher = Pyro4.Proxy(ns.list(prefix=LDA_DISPATCHER_PREFIX)[LDA_DISPATCHER_PREFIX])
                    logger.debug("looking for dispatcher at %s" % str(self.dispatcher._pyroUri))
                    self.dispatcher.initialize(
                        id2word=self.id2word, num_topics=self.num_topics, chunksize=chunksize,
                        alpha=alpha, eta=eta, distributed=False
                    )
                    self.numworkers = len(self.dispatcher.getworkers())
                    logger.info("using distributed version with %i workers", self.numworkers)
            except Exception as err:
                logger.error("failed to initialize distributed LDA (%s)", err)
                raise RuntimeError("failed to initialize distributed LDA (%s)" % err)

        # Initialize the variational distribution q(beta|lambda)
        self.state = LdaState(self.eta, (self.num_topics, self.num_terms), dtype=self.dtype)
        self.state.sstats[...] = self.random_state.gamma(100., 1. / 100., (self.num_topics, self.num_terms))
        self.expElogbeta = np.exp(dirichlet_expectation(self.state.sstats))

        # if a training corpus was provided, start estimating the model right away
        if corpus is not None:
            use_numpy = self.dispatcher is not None
            start = time.time()
            self.update(corpus, chunks_as_numpy=use_numpy)
            self.add_lifecycle_event("created", msg=f"trained {self} in {time.time() - start:.2f}s")
P(-PPc                 4    d|dk    r j         n|dk    r j        nt          d          d}t          t                    rvdk    r[t
                              d|d j         z             t          j         fd	t                    D              j
        
          }ndk    r|dk    rt          d          t          j        fdt                    D              j
        
          }||                                z  }t
                              d|t          |                     nXdk    rgd}t          j         fdt                    D              j
        
          }|dk    r)t
                              d|t          |                     nt          d|dd          t          t                    rt          j         j
                  }nt          t          j                  r                     j
        d          }nmt          t          j        t"          j        f          r5t          j        fdt                    D              j
                  }nt          d|z            ||fS )aQ  Initialize priors for the Dirichlet distribution.

        Parameters
        ----------
        prior : {float, numpy.ndarray of float, list of float, str}
            A-priori belief on document-topic distribution. If `name` == 'alpha', then the prior can be:
                * scalar for a symmetric prior over document-topic distribution,
                * 1D array of length equal to num_topics to denote an asymmetric user defined prior for each topic.

            Alternatively default prior selecting strategies can be employed by supplying a string:
                * 'symmetric': (default) Uses a fixed symmetric prior of `1.0 / num_topics`,
                * 'asymmetric': Uses a fixed normalized asymmetric prior of `1.0 / (topic_index + sqrt(num_topics))`,
                * 'auto': Learns an asymmetric prior from the corpus (not available if `distributed==True`).

            A-priori belief on topic-word distribution. If `name` == 'eta' then the prior can be:
                * scalar for a symmetric prior over topic-word distribution,
                * 1D array of length equal to num_words to denote an asymmetric user defined prior for each word,
                * matrix of shape (num_topics, num_words) to assign a probability for each word-topic combination.

            Alternatively default prior selecting strategies can be employed by supplying a string:
                * 'symmetric': (default) Uses a fixed symmetric prior of `1.0 / num_topics`,
                * 'auto': Learns an asymmetric prior from the corpus.
        name : {'alpha', 'eta'}
            Whether the `prior` is parameterized by the alpha vector (1 parameter per topic)
            or by the eta (1 parameter per unique term in the vocabulary).

        Returns
        -------
        init_prior: numpy.ndarray
            Initialized Dirichlet prior:
            If 'alpha' was provided as `name` the shape is (self.num_topics, ).
            If 'eta' was provided as `name` the shape is (len(self.id2word), ).
        is_auto: bool
            Flag that shows if hyperparameter optimization should be used or not.
        Nre   rj   r0   z'name' must be 'alpha' or 'eta'Fzusing symmetric %s at %sr@   c              3   ,   K   | ]}d j         z  V  dS r@   Nrn   .0ir4   s     r%   	<genexpr>z*LdaModel.init_dir_prior.<locals>.<genexpr>C  s*      GGqS4?*GGGGGGr'   r.   count
asymmetricz.The 'asymmetric' option cannot be used for etac              3   L   K   | ]}d |t          j                  z   z  V  dS r   )r   sqrt)r   r   prior_shapes     r%   r   z*LdaModel.init_dir_prior.<locals>.<genexpr>J  s7      RR!SA 4 445RRRRRRr'   zusing asymmetric %s %sautoTc              3   ,   K   | ]}d j         z  V  dS r   r   r   s     r%   r   z*LdaModel.init_dir_prior.<locals>.<genexpr>Q  s*      )\)\A#*?)\)\)\)\)\)\r'   z$using autotuned %s, starting with %szUnable to determine proper z value given ''r-   r+   c              3      K   | ]}V  d S rZ   rv   )r   r   r   s     r%   r   z*LdaModel.init_dir_prior.<locals>.<genexpr>\  s#      %H%He%H%H%H%H%H%Hr'   zC%s must be either a np array of scalars, list of scalars, or scalar)rn   r{   rx   
isinstancer   r   rA   r   fromiterranger.   r   r   asarrayndarrayr/   numbernumbersReal)r4   r   nameis_auto
init_priorr   s   ``   @r%   r   zLdaModel.init_dir_prior  s   H  	 E7? 	@/KKU] 	@.KK>???eS!! 	k# c6cDO>STTT[GGGGE+4F4FGGG*K  

 ,& c5= W$%UVVV[RRRRu[?Q?QRRR*K  
 jnn...
4dD<L<LMMMM& c[)\)\)\)\{I[I[)\)\)\*K9 9 9
7? `KK FdS]N^N^___ jUYUYUY[`[`[`!abbbt$$ 	kE<<<JJrz** 	kdju==JJ	7<899 	k%H%H%H%HU;5G5G%H%H%HPTPZ[[[JJbeiijjj7""r'   c           
      `    | j         j        d| j        d| j        d| j        d| j        d
S )zGet a string representation of the current object.

        Returns
        -------
        str
            Human readable representation of the most important model parameters.

        z<num_terms=z, num_topics=z, decay=z, chunksize=>)rS   rT   r{   rn   r   ro   r:   s    r%   __str__zLdaModel.__str__b  s@     N###T^^^T___djjjRVR`R`R`
 	
r'   c                     || j                                         }t          j        |          | _        | j        j        | j        k    sJ dS )a  Propagate the states topic probabilities to the inner object's attribute.

        Parameters
        ----------
        current_Elogbeta: numpy.ndarray
            Posterior probabilities for each topic, optional.
            If omitted, it will get Elogbeta from state.

        N)r   rK   r   r   r   r.   )r4   current_Elogbetas     r%   
sync_statezLdaModel.sync_stateo  sR      	9#z66886"233%333333r'   c                 "    d| _         d| _        dS )zTClear the model's state to free some memory. Used in the distributed implementation.N)r   Elogbetar:   s    r%   clearzLdaModel.clear~  s    
r'   c           	      (   	 t          |           n# t          $ r t          |          }Y nw xY wt          |          dk    r(t                              dt          |                     | j                            ddt          |          | j        f                              | j	        d          }t          |          }t          j        |          }|j	        | j	        k    sJ |j	        | j	        k    sJ |r!t          j        | j        | j	                  }nd}d	}t          t          j        f}t          j        | j	                  j        }	t'          |          D ]\  }
}t          |          d	k    r)t)          |d	         d	         |          sd
 |D             }nd |D             }t          j        d |D             | j	        t          |                    }||
ddf         }||
ddf         }||
ddf         }| j        dd|f         }t          j        ||          |	z   }t/          | j                  D ]}|}| j        |t          j        ||z  |j                  z  z   }t          |          }t          j        |          }t          j        ||          |	z   }t7          ||          }|| j        k     r|dz  } n|||
ddf<   |j	        | j	        k    sJ |r/|dd|fxx         t          j        |j        ||z            z  cc<   t          |          dk    r/t                              d|t          |          | j                   |r|| j        z  }|j	        | j	        k    sJ |j	        | j	        k    sJ ||fS )a  Given a chunk of sparse document vectors, estimate gamma (parameters controlling the topic weights)
        for each document in the chunk.

        This function does not modify the model. The whole input chunk of document is assumed to fit in RAM;
        chunking of a large corpus must be done earlier in the pipeline. Avoids computing the `phi` variational
        parameter directly using the optimization presented in
        `Lee, Seung: Algorithms for non-negative matrix factorization"
        <https://papers.nips.cc/paper/1861-algorithms-for-non-negative-matrix-factorization.pdf>`_.

        Parameters
        ----------
        chunk : list of list of (int, float)
            The corpus chunk on which the inference step will be performed.
        collect_sstats : bool, optional
            If set to True, also collect (and return) sufficient statistics needed to update the model's topic-word
            distributions.

        Returns
        -------
        (numpy.ndarray, {numpy.ndarray, None})
            The first element is always returned and it corresponds to the states gamma matrix. The second element is
            only returned if `collect_sstats` == True and corresponds to the sufficient statistics for the M step.

        r   z/performing inference on a chunk of %i documentsrq   rh   Fr+   r-   Nr   c                 2    g | ]\  }}t          |          S rv   )r   r   idx_s      r%   
<listcomp>z&LdaModel.inference.<locals>.<listcomp>  s"    222FCs3xx222r'   c                     g | ]\  }}|S rv   rv   r   s      r%   r   z&LdaModel.inference.<locals>.<listcomp>  s    ---vsAs---r'   c              3       K   | ]	\  }}|V  
d S rZ   rv   )r   r   cnts      r%   r   z%LdaModel.inference.<locals>.<genexpr>  s&      55vq#s555555r'   r   z.%i/%i documents converged within %i iterations)rz   	TypeErrorr   r   r   r   r   rn   r/   r.   r   r   r   
zeros_liker   r   integerrw   eps	enumerater   r   dotr   r   rj   Tr   r   outer)r4   chunkcollect_sstatsr   	ElogthetaexpElogthetar2   	convergedinteger_typesepsilonddocidsctsgammad
ElogthetadexpElogthetadexpElogbetadphinormr   	lastgamma
meanchanges                         r%   	inferencezLdaModel.inference  s   2	 JJJJ 	  	  	 KKEEE	  u::> 	XLLJCPUJJWWW !''i#e**do9VWW^^_c_ipu^vv)%00	vi(($*,,,,!TZ//// 	]4#34:FFFFFF	 bj*(4:&&*&& %	K %	KFAs3xx!| .Js1vay-$H$H .22c222-----+55555TZsSVxxXXXC1aaa4[F"1aaa4J(AAA.M+AAAsF3L
 f]L99GCG 4?++  "	 mbfS7]LN6[6[&[[26::
 "z 2 2&==G5fiHH
 44 NIE !E!QQQ$K<4:---- K qqq#v"(=?C'M"J"JJu::> 	sLLI9VYZ_V`V`bfbqrrr 	.
 d&&F<4:----{dj((((f}s    ..c                     || j         }|                     |d          \  }}|xj        |z  c_        |xj        |j        d         z  c_        |j        | j        k    sJ |S )aw  Perform inference on a chunk of documents, and accumulate the collected sufficient statistics.

        Parameters
        ----------
        chunk : list of list of (int, float)
            The corpus chunk on which the inference step will be performed.
        state : :class:`~gensim.models.ldamodel.LdaState`, optional
            The state to be updated with the newly accumulated sufficient statistics. If none, the models
            `self.state` is updated.

        Returns
        -------
        numpy.ndarray
            Gamma parameters controlling the topic weights, shape (`len(chunk)`, `self.num_topics`).

        NTr   r   )r   r   r2   r3   r5   r.   )r4   r   r   r   r2   s        r%   do_estepzLdaModel.do_estep  sl    "  	JEuTBBvQ'{dj((((r'   c                 d   t          t          |                    }t          d |D                       |z  }|j        | j        k    sJ t	          | j        |||          | _        t                              dt          | j                             | j        j        | j        k    sJ | j        S )aZ  Update parameters for the Dirichlet prior on the per-document topic weights.

        Parameters
        ----------
        gammat : numpy.ndarray
            Previous topic weight parameters.
        rho : float
            Learning rate.

        Returns
        -------
        numpy.ndarray
            Sequence of alpha parameters.

        c              3   4   K   | ]}t          |          V  d S rZ   r   )r   r   s     r%   r   z(LdaModel.update_alpha.<locals>.<genexpr>  s+      GGu+E22GGGGGGr'   zoptimized alpha %s)	floatrz   r   r.   r&   rj   r   rA   r   )r4   gammatr   r   r   s        r%   update_alphazLdaModel.update_alpha  s      #f++GGGGGGG!K}
****%dj!WcBB
($tz*:*:;;;z4:----zr'   c                 8   t          |j        d                   }t          d |D                       |z                      | j        f          }|j        | j        k    sJ t          | j        |||          | _        | j        j        | j        k    sJ | j        S )aO  Update parameters for the Dirichlet prior on the per-topic word weights.

        Parameters
        ----------
        lambdat : numpy.ndarray
            Previous lambda parameters.
        rho : float
            Learning rate.

        Returns
        -------
        numpy.ndarray
            The updated eta parameters.

        r   c              3   4   K   | ]}t          |          V  d S rZ   r   )r   lambda_s     r%   r   z&LdaModel.update_eta.<locals>.<genexpr>1  s+      MM',W55MMMMMMr'   )r   r5   r   reshaper{   r.   r&   r0   )r4   lambdatr   r   r   s        r%   
update_etazLdaModel.update_eta   s      '-"##MMWMMMMMPQQZZ\`\j[lmm}
****#DHa#>>x~++++xr'   c                 :   |t          |          }t          d |D                       }d|z  t          |          z  }|                     ||          ||z  z  }t                              d|t          j        |           t          |          |           |S )aS  Calculate and return per-word likelihood bound, using a chunk of documents as evaluation corpus.

        Also output the calculated statistics, including the perplexity=2^(-bound), to log at INFO level.

        Parameters
        ----------
        chunk : list of list of (int, float)
            The corpus chunk on which the inference step will be performed.
        total_docs : int, optional
            Number of docs used for evaluation of the perplexity.

        Returns
        -------
        numpy.ndarray
            The variational bound score calculated for each word.

        Nc              3   *   K   | ]}|D ]	\  }}|V  
d S rZ   rv   )r   documentr   r   s       r%   r   z*LdaModel.log_perplexity.<locals>.<genexpr>M  s3      LL88LLC3LLLLLLLr'   r@   )subsample_ratiozf%.3f per-word bound, %.1f perplexity estimate based on a held-out corpus of %i documents with %i words)rz   r   boundr   rA   r   exp2)r4   r   
total_docscorpus_wordsr
  perwordbounds         r%   log_perplexityzLdaModel.log_perplexity9  s    $  	$UJLLuLLLLL
*SZZ7zz%zII__kMklt"'<-00#e**l	
 	
 	
 r'   c                 
     j          j        | j        }| j        }| j        }| j        }|	 j        }		 t          |          }nC# t          $ r6 t          
                    d           t          d |D                       }Y nw xY w|dk    rt          
                    d           dS t          | j                   j        xj        |z  c_        |r/d}|dk    r|dz  }n|d	z  }t          || j        z  z            }nd
}|}t          ||pd j        z  z            }t#          d||z            }t                              d| j        ||||||		  	         ||z  dk     rt          
                    d            fd} j        rBt+           j                  }|                                t/          t0                     _        t5          |          D ]_ j        r@t                              d j                    j                             j                   n*t;           j         j        j        j          j!                  }d}d}tE          j#        ||
 j!                  }tI          |          D ]\  }}|t          |          z  }|r1||k    s|dz   | j        z  z  dk    r %                    ||            j        rKt                              d|z  t          |          z   |            j        &                    |           nkt                              d|z  t          |          z   |            '                    ||          } j(        r )                    | |                       d}~|r|dz   | j        z  z  dk    rˉ j        r3t                              d            j        *                                } +                     |            |dk               ~ j        r:t                              d            j                             j                   n*t;           j         j        j        j          j!                  }d}||k    rtY          d           j        rO|-                              }|.                                D ]%\  }} j        |         /                    |           &|r` j        r3t                              d            j        *                                } +                     |            |dk               ~d}adS )ac  Train the model with new documents, by EM-iterating over the corpus until the topics converge, or until
        the maximum number of allowed iterations is reached. `corpus` must be an iterable.

        In distributed mode, the E step is distributed over a cluster of machines.

        Notes
        -----
        This update also supports updating an already trained model (`self`) with new documents from `corpus`;
        the two models are then merged in proportion to the number of old vs. new documents.
        This feature is still experimental for non-stationary input streams.

        For stationary input (no topic drift in new documents), on the other hand,
        this equals the online update of `'Online Learning for LDA' by Hoffman et al.`_
        and is guaranteed to converge for any `decay` in (0.5, 1].
        Additionally, for smaller corpus sizes,
        an increasing `offset` may be beneficial (see Table 1 in the same paper).

        Parameters
        ----------
        corpus : iterable of list of (int, float), optional
            Stream of document vectors or sparse matrix of shape (`num_documents`, `num_terms`) used to update the
            model.
        chunksize :  int, optional
            Number of documents to be used in each training chunk.
        decay : float, optional
            A number between (0.5, 1] to weight what percentage of the previous lambda value is forgotten
            when each new document is examined. Corresponds to :math:`\kappa` from
            `'Online Learning for LDA' by Hoffman et al.`_
        offset : float, optional
            Hyper-parameter that controls how much we will slow down the first steps the first few iterations.
            Corresponds to :math:`\tau_0` from `'Online Learning for LDA' by Hoffman et al.`_
        passes : int, optional
            Number of passes through the corpus during training.
        update_every : int, optional
            Number of documents to be iterated through for each update.
            Set to 0 for batch learning, > 1 for online iterative learning.
        eval_every : int, optional
            Log perplexity is estimated every that many updates. Setting this to one slows down training by ~2x.
        iterations : int, optional
            Maximum number of iterations through the corpus when inferring the topic distribution of a corpus.
        gamma_threshold : float, optional
            Minimum change in the value of the gamma parameters to continue iterating.
        chunks_as_numpy : bool, optional
            Whether each chunk passed to the inference step should be a numpy.ndarray or not. Numpy can in some settings
            turn the term IDs into floats, these will be converted back into integers in inference, which incurs a
            performance hit. For distributed computing it may be desirable to keep the chunks as `numpy.ndarray`.

        Nz4input corpus stream has no len(); counting documentsc              3      K   | ]}d V  dS )r   Nrv   )r   r   s     r%   r   z"LdaModel.update.<locals>.<genexpr>  s"      ..!A......r'   r   z-LdaModel.update() called with an empty corpusonliner   z (single-pass)z (multi-pass)batchzrunning %s LDA training, %s topics, %i passes over the supplied corpus of %i documents, updating model once every %i documents, evaluating perplexity every %i documents, iterating %ix with a convergence threshold of %frf   zxtoo few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracyc                  B    t          z   j         z  z              S rZ   )powr   )ro   r   r   pass_r4   s   r%   r   zLdaModel.update.<locals>.rho  s&    v~)9I)EFOOOr'   zinitializing %s workersF)as_numpyr.   )r  z5PROGRESS: pass %i, dispatching documents up to #%i/%iz%PROGRESS: pass %i, at document #%i/%iTzFreached the end of input; now waiting for all remaining jobs to finishzinitializing workerszIinput corpus size changed during training (don't use generators as input))0r   r   r   r   r   r   r   rz   r   r   r   r   minro   r   r3   r   r|   rA   rn   r   r   	set_modelr   r   metricsr   r   r;   r)   r0   r2   r5   r.   r   grouperr   r  putjobr   r   r   getstatedo_mstepr   on_epoch_enditemsappend)r4   r   ro   r   r   r   r   r   r   r   rr   	lencorpus
updatetypeupdateafter	evalafterupdates_per_passr   callbackr=   dirtyreallenchunkschunk_nor   r   current_metricsmetricvaluer  s   ` ```                       @r%   r   zLdaModel.updateV  s#   h  	JE 	![F 	![F 	-,L 	)J 	)J 	3"2O	/FII 	/ 	/ 	/NNQRRR..v.....III	/ > 	NNJKKKF 	7It~66I
i' 		$!J{ ...

o-
i)G))STTKK J#K	JO!t#F#RSS	q)k"9::? J	
 	
 	
 f$r) 	NN]  	P 	P 	P 	P 	P 	P 	P 	P 	P > 	-//Ht$$$&t,,DL6]] F	 F	E P5tGGG%%dj1111 4:+<+BDJOOEG]69VZV`aaaF#,V#4#4 )" )"%3u::% EGy$8 Ex!|PZ]a]lPl>mqr>r E'')'DDD? 9KKOx)3c%jj@)  
 O**51111KK?x)3c%jj@)   "]]5%88F* 9))&##%%888   "X\lT_6T$UYZ$Z " ;$lmmm $ 8 8 : :MM##%%	::: X$:;;;--dj9999 (4:3D3JDJ W W!E)# p"#nooo ~ 7"*"7"7">">%4%:%:%<%< 7 7MFEL(//6666 ? 7KK hiii O4466EcceeUEAI666MF	 F	s   A =BBc                 b   t                               d           | j                                        }| j                            ||           | j                                        }|                     |           |                     d           t          |                                |                                          }t           	                    d||           | j
        r-|                     | j                                        |           |s| xj        |j        z  c_        dS dS )a  Maximization step: use linear interpolation between the existing topics and
        collected sufficient statistics in `other` to update the topics.

        Parameters
        ----------
        rho : float
            Learning rate.
        other : :class:`~gensim.models.ldamodel.LdaModel`
            The model whose sufficient statistics will be used to update the topics.
        extra_pass : bool, optional
            Whether this step required an additional pass over the corpus.

        zupdating topics   ztopic diff=%f, rho=%fN)r   r   r   rK   rE   r   print_topicsr   ravelrA   r   r  rI   r   r3   )r4   r   r=   
extra_passprevious_Elogbetar   diffs          r%   r  zLdaModel.do_mstep  s    	&''' !J3355
e$$$:2244())) 	!'(9(?(?(A(ACSCYCYC[C[\\+T3777 	:OODJ1133S999 	.-	. 	.r'   c           
         d}| j                                         }t          |          t          |          D ]K\  }}|| j        z  dk    rt
                              d|           ||                     |g          \  }}	n||         }t          |          |j        | j        k    sJ j        | j        k    sJ |t          fd|D                       z  }|t          j	        | j        |z
  z            z  }|t          j	        t          |          t          | j                  z
            z  }|t          t          j	        | j                            t          t          j	        |                    z
  z  }M||z  }|t          j	        | j        |z
  z            z  }|t          j	        t          |          t          | j                  z
            z  }t          j        | j                  dk    r| j        | j        z  }
nt          j	        | j                  }
|t          j	        t          |
          t          t          j	        |d                    z
            z  }|S )a  Estimate the variational bound of documents from the corpus as E_q[log p(corpus)] - E_q[log q(corpus)].

        Parameters
        ----------
        corpus : iterable of list of (int, float), optional
            Stream of document vectors or sparse matrix of shape (`num_documents`, `num_terms`) used to estimate the
            variational bounds.
        gamma : numpy.ndarray, optional
            Topic weight variational parameters for each document. If not supplied, it will be inferred from the model.
        subsample_ratio : float, optional
            Percentage of the whole corpus represented by the passed `corpus` argument (in case this was a sample).
            Set to 1.0 if the whole corpus was passed.This is used as a multiplicative factor to scale the likelihood
            appropriately.

        Returns
        -------
        numpy.ndarray
            The variational bound score calculated for each document.

        r8   r   zbound: at document #%iNc           
   3   v   K   | ]3\  }}|t          d d t          |          f         z             z  V  4d S rZ   )r   r   )r   idr   r   r   s      r%   r   z!LdaModel.bound.<locals>.<genexpr>b  sO      __PWPRTWyhqqq#b''z6J)JKKK______r'   r   )r   rI   r   r   ro   r   r   r   r.   r   r   rj   r   r0   ndimr{   )r4   r   r   r
  score_lambdar   r   r   r   sum_etar   r   s              @@r%   r  zLdaModel.bound<  sM   * *''))(11'' 	K 	KFAs4>!Q& :5q999 " NNC511	q.v66J<4:----#tz1111 S_____[^______E RVTZ&0J>???ERVGFOOgdj.A.AABBBEWRVDJ//00726&>>3J3JJJEE 	  	G+x7888((748+<+<<===748! 	'h/GGfTX&&G((726'13E3E+F+FFGGGr'   Tc                     |dk     s| j         k    r j         }t          |          }nt          | j                   } j        d j                            t           j                            z  z   }t          t          j	        |                    }|d|dz           || dz  d         z   }g } j
                                        }	|D ]}
|	|
                                         z  t          j	        |d          } fd|D             |rd                    d	 D                       |                    |
f           |r(t                              d
|
 j        |
                    |S )a*  Get a representation for selected topics.

        Parameters
        ----------
        num_topics : int, optional
            Number of topics to be returned. Unlike LSA, there is no natural ordering between the topics in LDA.
            The returned topics subset of all topics is therefore arbitrary and may change between two LDA
            training runs.
        num_words : int, optional
            Number of words to be presented for each topic. These will be the most relevant words (assigned the highest
            probability for each topic).
        log : bool, optional
            Whether the output is also logged, besides being returned.
        formatted : bool, optional
            Whether the topic representations should be formatted as strings. If False, they are returned as
            2 tuples of (word, probability).

        Returns
        -------
        list of {str, tuple of (str, float)}
            a list of topics, each represented either as a string (when `formatted` == True) or word-probability
            pairs.

        r   g-C6?N   Treversec                 :    g | ]}j         |         |         fS rv   rm   )r   r9  r4   topic_s     r%   r   z(LdaModel.show_topics.<locals>.<listcomp>  s)    EEEt|B'4EEEr'   z + c              3   *   K   | ]\  }}d ||fz  V  dS )z	%.3f*"%s"Nrv   )r   kvs      r%   r   z'LdaModel.show_topics.<locals>.<genexpr>  s/      #L#LTQK1a&$8#L#L#L#L#L#Lr'   ztopic #%i (%.3f): %s)rn   r   r  rj   r   randrz   r   r	   argsortr   rI   r   joinr"  r   rA   )r4   rn   	num_wordslog	formattedchosen_topics
sort_alphasorted_topicsshowntopicr   bestnrD  s   `           @r%   show_topicszLdaModel.show_topicsz  s   2 > 	_Z4?: 	_J!*--MMZ99J ft/@/E/Ec$*oo/V/V&VVJ !!1*!=!=>>M)*::?*:;mZK[\L\L]L]>^^M
%%'' 
	N 
	NA1XFfjjll*F$VYEEEEEEEEEuEEEF M#L#LV#L#L#LLLLL!V%%% N2Atz!}fMMMr'   c                 H      fd                      ||          D             S )aK  Get the representation for a single topic. Words here are the actual strings, in constrast to
        :meth:`~gensim.models.ldamodel.LdaModel.get_topic_terms` that represents words by their vocabulary ID.

        Parameters
        ----------
        topicid : int
            The ID of the topic to be returned
        topn : int, optional
            Number of the most significant words that are associated with the topic.

        Returns
        -------
        list of (str, float)
            Word - probability pairs for the most relevant words generated by the topic.

        c                 4    g | ]\  }}j         |         |fS rv   rC  )r   r9  r/  r4   s      r%   r   z'LdaModel.show_topic.<locals>.<listcomp>  s)    ___ib%b!5)___r'   )get_topic_terms)r4   topicidtopns   `  r%   
show_topiczLdaModel.show_topic  s2    " `___4;O;OPWY];^;^____r'   c                 z    | j                                         }||                    d          dddf         z  S )zGet the term-topic matrix learned during inference.

        Returns
        -------
        numpy.ndarray
            The probability for each word in each topic, shape (`num_topics`, `vocabulary_size`).

        r   )axisN)r   rI   r   )r4   topicss     r%   
get_topicszLdaModel.get_topics  s=     &&((


**111d7333r'   c                     |                                  |                                         z  t          j        |d          }fd|D             S )a<  Get the representation for a single topic. Words the integer IDs, in constrast to
        :meth:`~gensim.models.ldamodel.LdaModel.show_topic` that represents words by the actual strings.

        Parameters
        ----------
        topicid : int
            The ID of the topic to be returned
        topn : int, optional
            Number of the most significant words that are associated with the topic.

        Returns
        -------
        list of (int, float)
            Word ID - probability pairs for the most relevant words generated by the topic.
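
        Examples
        --------
        An illustrative sketch, assuming a trained `lda` model as in the module-level usage examples:

        .. sourcecode:: pycon

            >>> pairs = lda.get_topic_terms(topicid=0, topn=5)  # ids map back to strings via `lda.id2word`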

        """
        topic = self.get_topics()[topicid]
        topic = topic / topic.sum()  # normalize to probability distribution
        bestn = matutils.argsort(topic, topn, reverse=True)
        return [(idx, topic[idx]) for idx in bestn]

    def top_topics(self, corpus=None, texts=None, dictionary=None, window_size=None,
                   coherence='u_mass', topn=20, processes=-1):
        """Get the topics sorted by coherence score, along with the coherence of each topic.

        Parameters
        ----------
        corpus : iterable of list of (int, float), optional
            Corpus in BoW format.
        texts : list of list of str, optional
            Tokenized texts, needed for coherence models that use a sliding window based probability estimator
            (i.e. coherence=`c_something`).
        dictionary : :class:`~gensim.corpora.dictionary.Dictionary`, optional
            Gensim dictionary mapping of word IDs to words, used to create the corpus.
            If `model.id2word` is present, this is not needed. If both are provided, passed `dictionary` will be used.
        window_size : int, optional
            Is the size of the window to be used for coherence measures using boolean sliding window as their
            probability estimator. For 'u_mass' this doesn't matter.
            If None - the default window sizes are used which are: 'c_v' - 110, 'c_uci' - 10, 'c_npmi' - 10.
        coherence : {'u_mass', 'c_v', 'c_uci', 'c_npmi'}, optional
            Coherence measure to be used.
            Fastest method: 'u_mass'; 'c_uci' is also known as `c_pmi`.
            For 'u_mass', `corpus` should be provided; if `texts` is provided, it will be converted to a corpus
            using the dictionary. For 'c_v', 'c_uci' and 'c_npmi', `texts` should be provided (`corpus` isn't needed).
        topn : int, optional
            Integer corresponding to the number of top words to be extracted from each topic.
        processes : int, optional
            Number of processes to use for probability estimation phase, any value less than 1 will be interpreted as
            num_cpus - 1.

        Returns
        -------
        list of (list of (float, str), float)
            Each element in the list is a pair of a topic representation and its coherence score. Topic representations
            are distributions of words, represented as lists of (word probability, word) pairs.
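
        Examples
        --------
        An illustrative sketch, assuming a trained `lda` model and the `common_corpus` from the
        module-level usage examples:

        .. sourcecode:: pycon

            >>> scored = lda.top_topics(corpus=common_corpus, coherence='u_mass')
            >>> best_topic, best_score = scored[0]  # topics arrive sorted by coherence, best first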

        """
        cm = CoherenceModel(
            model=self, corpus=corpus, texts=texts, dictionary=dictionary,
            window_size=window_size, coherence=coherence, topn=topn,
            processes=processes,
        )
        coherence_scores = cm.get_coherence_per_topic()

        str_topics = []
        for topic in self.get_topics():  # topic = array of vocab_size floats, one per term
            bestn = matutils.argsort(topic, topn=topn, reverse=True)  # top terms for this topic
            beststr = [(topic[_id], self.id2word[_id]) for _id in bestn]  # (probability, token) pairs
            str_topics.append(beststr)

        scored_topics = zip(str_topics, coherence_scores)
        return sorted(scored_topics, key=lambda tup: tup[1], reverse=True)

    def get_document_topics(self, bow, minimum_probability=None, minimum_phi_value=None,
                            per_word_topics=False):
        """Get the topic distribution for the given document.

        Parameters
        ----------
        bow : list of (int, float)
            The document in BOW format.
        minimum_probability : float
            Topics with an assigned probability lower than this threshold will be discarded.
        minimum_phi_value : float
            If `per_word_topics` is True, this represents a lower bound on the term probabilities that are included.
            If set to None, a value of 1e-8 is used to prevent 0s.
        per_word_topics : bool
            If True, this function will also return two extra lists as explained in the "Returns" section.

        Returns
        -------
        list of (int, float)
            Topic distribution for the whole document. Each element in the list is a pair of a topic's id, and
            the probability that was assigned to it.
        list of (int, list of (int, float)), optional
            Most probable topics per word. Each element in the list is a pair of a word's id, and a list of
            topics sorted by their relevance to this word. Only returned if `per_word_topics` was set to True.
        list of (int, list of float), optional
            Phi relevance values, multiplied by the feature length, for each word-topic combination.
            Each element in the list is a pair of a word's id and a list of the phi values between this word and
            each topic. Only returned if `per_word_topics` was set to True.
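
        Examples
        --------
        A minimal sketch, assuming a trained `lda` model and the `unseen_doc` bag-of-words
        vector from the module-level usage examples:

        .. sourcecode:: pycon

            >>> doc_topics, word_topics, phi_values = lda.get_document_topics(
            ...     unseen_doc, per_word_topics=True
            ... )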

        """
        if minimum_probability is None:
            minimum_probability = self.minimum_probability
        minimum_probability = max(minimum_probability, 1e-8)  # never allow zero values in sparse output

        if minimum_phi_value is None:
            minimum_phi_value = self.minimum_probability
        minimum_phi_value = max(minimum_phi_value, 1e-8)  # never allow zero values in sparse output

        # if the input vector is a corpus, return a transformed corpus as the result
        is_corpus, corpus = utils.is_corpus(bow)
        if is_corpus:
            kwargs = dict(
                per_word_topics=per_word_topics,
                minimum_probability=minimum_probability,
                minimum_phi_value=minimum_phi_value,
            )
            return self._apply(corpus, **kwargs)

        gamma, phis = self.inference([bow], collect_sstats=per_word_topics)
        topic_dist = gamma[0] / sum(gamma[0])  # normalize distribution

        document_topics = [
            (topicid, topicvalue) for topicid, topicvalue in enumerate(topic_dist)
            if topicvalue >= minimum_probability
        ]

        if not per_word_topics:
            return document_topics

        word_topic = []  # contains a list of (word, [topic]) 2-tuples
        word_phi = []  # contains a list of (word, [(topic, phi_value)]) 2-tuples
        for word_type, weight in bow:
            phi_values = []  # (phi_value, topic) pairs, sorted below by phi value
            phi_topic = []  # (topic, phi_value) pairs, returned raw to the caller
            for topic_id in range(0, self.num_topics):
                if phis[topic_id][word_type] >= minimum_phi_value:
                    # phi values for each topic for this word, scaled by feature length
                    phi_values.append((phis[topic_id][word_type], topic_id))
                    phi_topic.append((topic_id, phis[topic_id][word_type]))

            word_phi.append((word_type, phi_topic))
            # sort the topics for this word, most relevant topic first
            sorted_phi_values = sorted(phi_values, reverse=True)
            topics_sorted = [x[1] for x in sorted_phi_values]
            word_topic.append((word_type, topics_sorted))

        return document_topics, word_topic, word_phi

    def get_term_topics(self, word_id, minimum_probability=None):
        """Get the most relevant topics to the given word.

        Parameters
        ----------
        word_id : int
            The word for which the topic distribution will be computed.
        minimum_probability : float, optional
            Topics with an assigned probability below this threshold will be discarded.

        Returns
        -------
        list of (int, float)
            The relevant topics represented as pairs of their ID and their assigned probability, sorted
            by relevance to the given word.
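
        Examples
        --------
        An illustrative sketch, assuming a trained `lda` model whose vocabulary contains the
        queried word:

        .. sourcecode:: pycon

            >>> relevant = lda.get_term_topics('computer')  # strings are mapped to ids via `id2word`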

        """
        if minimum_probability is None:
            minimum_probability = self.minimum_probability
        minimum_probability = max(minimum_probability, 1e-8)  # never allow zero values in sparse output

        # if the user passes a word instead of an id in the vocabulary, look the id up first
        if isinstance(word_id, str):
            word_id = self.id2word.doc2bow([word_id])[0][0]

        values = []
        for topic_id in range(0, self.num_topics):
            if self.expElogbeta[topic_id][word_id] >= minimum_probability:
                values.append((topic_id, self.expElogbeta[topic_id][word_id]))

        return values

    def diff(self, other, distance="kullback_leibler", num_words=100,
             n_ann_terms=10, diagonal=False, annotation=True, normed=True):
        """Calculate the difference in topic distributions between two models: `self` and `other`.

        Parameters
        ----------
        other : :class:`~gensim.models.ldamodel.LdaModel`
            The model which will be compared against the current object.
        distance : {'kullback_leibler', 'hellinger', 'jaccard', 'jensen_shannon'}
            The distance metric to calculate the difference with.
        num_words : int, optional
            The number of most relevant words used if `distance == 'jaccard'`. Also used for annotating topics.
        n_ann_terms : int, optional
            Max number of words in intersection/symmetric difference between topics. Used for annotation.
        diagonal : bool, optional
            Whether we need the difference between identical topics (the diagonal of the difference matrix).
        annotation : bool, optional
            Whether the intersection or difference of words between two topics should be returned.
        normed : bool, optional
            Whether the matrix should be normalized or not.

        Returns
        -------
        numpy.ndarray
            A difference matrix. Each element corresponds to the difference between the two topics,
            shape (`self.num_topics`, `other.num_topics`).
        numpy.ndarray, optional
            Annotation matrix where for each pair we include the word from the intersection of the two topics,
            and the word from the symmetric difference of the two topics. Only included if `annotation == True`.
            Shape (`self.num_topics`, `other.num_topics`, 2).

        Examples
        --------
        Get the differences between each pair of topics inferred by two models

        .. sourcecode:: pycon

            >>> from gensim.models.ldamulticore import LdaMulticore
            >>> from gensim.test.utils import datapath
            >>>
            >>> m1 = LdaMulticore.load(datapath("lda_3_0_1_model"))
            >>> m2 = LdaMulticore.load(datapath("ldamodel_python_3_5"))
            >>> mdiff, annotation = m1.diff(m2)
            >>> topic_diff = mdiff  # get matrix with difference for each topic pair from `m1` and `m2`
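            >>>
            >>> # an additional, illustrative variant: compare a model against itself, topic by topic;
            >>> # `diagonal=True` assumes both models have the same number of topics
            >>> mdiff_diag, _ = m1.diff(m1, distance='jaccard', diagonal=True)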

        """
        distances = {
            "kullback_leibler": kullback_leibler,
            "hellinger": hellinger,
            "jaccard": jaccard_distance,
            "jensen_shannon": jensen_shannon,
        }

        if distance not in distances:
            valid_keys = ", ".join("`{}`".format(x) for x in distances.keys())
            raise ValueError("Incorrect distance, valid only {}".format(valid_keys))

        if not isinstance(other, self.__class__):
            raise ValueError("The parameter `other` must be of type `{}`".format(self.__class__.__name__))

        distance_func = distances[distance]
        d1, d2 = self.get_topics(), other.get_topics()
        t1_size, t2_size = d1.shape[0], d2.shape[0]
        annotation_terms = None

        fst_topics = [{w for (w, _) in self.show_topic(topic, topn=num_words)} for topic in range(t1_size)]
        snd_topics = [{w for (w, _) in other.show_topic(topic, topn=num_words)} for topic in range(t2_size)]

        if distance == "jaccard":
            d1, d2 = fst_topics, snd_topics

        if diagonal:
            assert t1_size == t2_size, \
                "Both input models should have same no. of topics, " \
                "as the diagonal will only be valid in a square matrix"
            z = np.zeros(t1_size)
            if annotation:
                annotation_terms = np.zeros(t1_size, dtype=list)
        else:
            z = np.zeros((t1_size, t2_size))
            if annotation:
                annotation_terms = np.zeros((t1_size, t2_size), dtype=list)

        # iterate over each cell in the initialized z and annotation
        for topic in np.ndindex(z.shape):
            topic1 = topic[0]
            if diagonal:
                topic2 = topic1
            else:
                topic2 = topic[1]

            z[topic] = distance_func(d1[topic1], d2[topic2])
            if annotation:
                pos_tokens = fst_topics[topic1] & snd_topics[topic2]
                neg_tokens = fst_topics[topic1].symmetric_difference(snd_topics[topic2])

                pos_tokens = list(pos_tokens)[:min(len(pos_tokens), n_ann_terms)]
                neg_tokens = list(neg_tokens)[:min(len(neg_tokens), n_ann_terms)]

                annotation_terms[topic] = [pos_tokens, neg_tokens]

        if normed:
            if np.abs(np.max(z)) > 1e-8:
                z /= np.max(z)

        return z, annotation_terms

    def __getitem__(self, bow, eps=None):
        """Get the topic distribution for the given document.

        Wraps :meth:`~gensim.models.ldamodel.LdaModel.get_document_topics` to support an operator style call.
        Uses the model's current state (set using constructor arguments) to fill in the additional arguments of the
        wrapper method.

        Parameters
        ----------
        bow : list of (int, float)
            The document in BOW format.
        eps : float, optional
            Topics with an assigned probability lower than this threshold will be discarded.

        Returns
        -------
        list of (int, float)
            Topic distribution for the given document. Each topic is represented as a pair of its ID and the probability
            assigned to it.
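
        Examples
        --------
        A minimal sketch; `doc_bow` stands in for any document in BoW format (hypothetical here):

        .. sourcecode:: pycon

            >>> topics = lda[doc_bow]  # operator-style shorthand for get_document_topics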

        """
        return self.get_document_topics(bow, eps, self.minimum_phi_value, self.per_word_topics)

    def save(self, fname, ignore=('state', 'dispatcher'), separately=None, *args, **kwargs):
        """Save the model to a file.

        Large internal arrays may be stored into separate files, with `fname` as prefix.

        Notes
        -----
        If you intend to use models across Python 2/3 versions there are a few things to
        keep in mind:

          1. The pickled Python dictionaries will not work across Python versions
          2. The `save` method does not automatically save all numpy arrays separately, only
             those ones that exceed `sep_limit` set in :meth:`~gensim.utils.SaveLoad.save`. The main
             concern here is the `alpha` array if for instance using `alpha='auto'`.

        Please refer to the `wiki recipes section
        <https://github.com/RaRe-Technologies/gensim/wiki/
        Recipes-&-FAQ#q9-how-do-i-load-a-model-in-python-3-that-was-trained-and-saved-using-python-2>`_
        for an example on how to work around these issues.

        See Also
        --------
        :meth:`~gensim.models.ldamodel.LdaModel.load`
            Load model.

        Parameters
        ----------
        fname : str
            Path to the system file where the model will be persisted.
        ignore : tuple of str, optional
            The named attributes in the tuple will be left out of the pickled model. The reason why
            the internal `state` is ignored by default is that it uses its own serialisation rather than the one
            provided by this method.
        separately : {list of str, None}, optional
            If None - automatically detect large numpy/scipy.sparse arrays in the object being stored, and store
            them into separate files. This avoids pickle memory errors and allows `mmap`'ing large arrays
            back on load efficiently. If list of str - these attributes will be stored in separate files;
            the automatic check is not performed in this case.
        *args
            Positional arguments propagated to :meth:`~gensim.utils.SaveLoad.save`.
        **kwargs
            Keyword arguments propagated to :meth:`~gensim.utils.SaveLoad.save`.
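
        Examples
        --------
        An illustrative sketch (the file name is arbitrary; `separately` and `ignore` behave as
        described above):

        .. sourcecode:: pycon

            >>> lda.save('lda.model', separately=['expElogbeta'], ignore=('state', 'dispatcher'))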

        """
        if self.state is not None:
            self.state.save(utils.smart_extension(fname, '.state'), *args, **kwargs)
        # save the dictionary separately if not in 'ignore'
        if 'id2word' not in ignore:
            utils.pickle(self.id2word, utils.smart_extension(fname, '.id2word'))

        # make sure 'state', 'id2word' and 'dispatcher' are ignored from the pickled object,
        # even if the caller sets the ignore list themselves
        if ignore is not None and ignore:
            if isinstance(ignore, str):
                ignore = [ignore]
            ignore = [e for e in ignore if e]  # make sure None and '' are not in the list
            ignore = list({'state', 'dispatcher', 'id2word'} | set(ignore))
        else:
            ignore = ['state', 'dispatcher', 'id2word']

        # make sure 'expElogbeta' and 'sstats' are stored separately from the pickled object,
        # even if the caller sets the separately list themselves
        separately_explicit = ['expElogbeta', 'sstats']
        # also store 'alpha' and 'eta' separately if they were learned ('auto') or set to a custom array
        if (isinstance(self.alpha, str) and self.alpha == 'auto') or \
                (isinstance(self.alpha, np.ndarray) and len(self.alpha.shape) != 1):
            separately_explicit.append('alpha')
        if (isinstance(self.eta, str) and self.eta == 'auto') or \
                (isinstance(self.eta, np.ndarray) and len(self.eta.shape) != 1):
            separately_explicit.append('eta')
        # merge separately_explicit with the caller-supplied separately list
        if separately:
            if isinstance(separately, str):
                separately = [separately]
            separately = [e for e in separately if e]  # make sure None and '' are not in the list
            separately = list(set(separately_explicit) | set(separately))
        else:
            separately = separately_explicit
        super(LdaModel, self).save(fname, *args, ignore=ignore, separately=separately, **kwargs)

    @classmethod
    def load(cls, fname, *args, **kwargs):
        """Load a previously saved :class:`gensim.models.ldamodel.LdaModel` from file.

        See Also
        --------
        :meth:`~gensim.models.ldamodel.LdaModel.save`
            Save model.

        Parameters
        ----------
        fname : str
            Path to the file where the model is stored.
        *args
            Positional arguments propagated to :meth:`~gensim.utils.SaveLoad.load`.
        **kwargs
            Keyword arguments propagated to :meth:`~gensim.utils.SaveLoad.load`.

        Examples
        --------
        Large arrays can be memmap'ed back as read-only (shared memory) by setting `mmap='r'`:

        .. sourcecode:: pycon

            >>> from gensim.test.utils import datapath
            >>>
            >>> fname = datapath("lda_3_0_1_model")
            >>> lda = LdaModel.load(fname, mmap='r')

        """
        kwargs['mmap'] = kwargs.get('mmap', None)
        result = super(LdaModel, cls).load(fname, *args, **kwargs)

        # if `random_state` is absent, the model was saved with an older version of Gensim,
        # so fall back to the default value
        if not hasattr(result, 'random_state'):
            result.random_state = utils.get_random_state(None)
            logging.warning("random_state not set so using default value")

        # dtype could be absent in old models
        if not hasattr(result, 'dtype'):
            result.dtype = np.float64  # float64 was implicitly used before (it is the numpy default)
            logging.info("dtype was not set in saved %s file %s, assuming np.float64", result.__class__.__name__, fname)

        state_fname = utils.smart_extension(fname, '.state')
        try:
            result.state = LdaState.load(state_fname, *args, **kwargs)
        except Exception as e:
            logging.warning("failed to load state from %s: %s", state_fname, e)

        id2word_fname = utils.smart_extension(fname, '.id2word')
        # if the file exists, the model was saved with a separately pickled dictionary;
        # otherwise `result.id2word` was already set by the main pickle load above
        if os.path.isfile(id2word_fname):
            try:
                result.id2word = utils.unpickle(id2word_fname)
            except Exception as e:
                logging.warning("failed to load id2word dictionary from %s: %s", id2word_fname, e)
        return result