
    c                        d Z ddlZddlmZ ddlmZ ddlmZ ddlm	Z	 ddl
mZ ddlZddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZ ddlmZ  ej        e          Z G d de          Zd Zd Z  G d de          Z!dS )a  Author-topic model.

This module trains the author-topic model on documents and corresponding author-document dictionaries.
The training is online and is constant in memory w.r.t. the number of documents.
The model is *not* constant in memory w.r.t. the number of authors.

The model can be updated with additional documents after training has been completed. It is
also possible to continue training on the existing data.

The model is closely related to :class:`~gensim.models.ldamodel.LdaModel`.
The :class:`~gensim.models.atmodel.AuthorTopicModel` class inherits  :class:`~gensim.models.ldamodel.LdaModel`,
and its usage is thus similar.

The model was introduced by  `Rosen-Zvi and co-authors: "The Author-Topic Model for Authors and Documents"
<https://arxiv.org/abs/1207.4169>`_. The model correlates the authorship information with the topics to give a better
insight on the subject knowledge of an author.

.. _'Online Learning for LDA' by Hoffman et al.: online-lda_
.. _online-lda: https://papers.neurips.cc/paper/2010/file/71f6278d140af599e06ad9bf1ba03cb0-Paper.pdf

Example
-------
.. sourcecode:: pycon

    >>> from gensim.models import AuthorTopicModel
    >>> from gensim.corpora import mmcorpus
    >>> from gensim.test.utils import common_dictionary, datapath, temporary_file

    >>> author2doc = {
    ...     'john': [0, 1, 2, 3, 4, 5, 6],
    ...     'jane': [2, 3, 4, 5, 6, 7, 8],
    ...     'jack': [0, 2, 4, 6, 8]
    ... }
    >>>
    >>> corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm'))
    >>>
    >>> with temporary_file("serialized") as s_path:
    ...     model = AuthorTopicModel(
    ...         corpus, author2doc=author2doc, id2word=common_dictionary, num_topics=4,
    ...         serialized=True, serialization_path=s_path
    ...     )
    ...
    ...     model.update(corpus, author2doc)  # update the author-topic model with additional documents
    >>>
    >>> # construct vectors for authors
    >>> author_vecs = [model.get_author_topics(author) for author in model.id2author.values()]

    N)chain)deepcopy)copyfile)isfile)remove)gammaln)utils)LdaModel)LdaState)dirichlet_expectationmean_absolute_difference)MmCorpusc                       e Zd ZdZd ZdS )AuthorTopicStatez\Encapsulate information for computation of :class:`~gensim.models.atmodel.AuthorTopicModel`.c                     || _         t          j        |          | _        t          j        |          | _        d| _        t          j        | _        dS )a  

        Parameters
        ----------
        eta: numpy.ndarray
            Dirichlet topic parameter for sparsity.
        lambda_shape: (int, int)
            Initialize topic parameters.
        gamma_shape: int
            Initialize topic parameters.

        r   N)etanpzerossstatsgammanumdocsfloat64dtype)selfr   lambda_shapegamma_shapes       5lib/python3.11/site-packages/gensim/models/atmodel.py__init__zAuthorTopicState.__init__U   sA     h|,,Xk**
Z


    N)__name__
__module____qualname____doc__r    r   r   r   r   R   s)        ff         r   r   c                     i }t          |           D ]?\  }}g }|                                D ]\  }}||v r|                    |           |||<   @|S )aQ  Create a mapping from document IDs to author IDs.

    Parameters
    ----------
    corpus: iterable of list of (int, float)
        Corpus in BoW format.
    author2doc: dict of (str, list of int)
        Mapping of authors to documents.

    Returns
    -------
    dict of (int, list of str)
        Document to Author mapping.

    )	enumerateitemsappend)corpus
author2doc
doc2authord_
author_idsa	a_doc_idss           r   construct_doc2authorr1   i   sz      J&!! # #1
&,,.. 	% 	%LAyI~ %!!!$$$"
1r   c                    t                      }|                                 D ]\  }}|D ]}|                    |            i }|D ]@}g ||<   |                                 D ]$\  }}||v r||                             |           %A|S )a  Make a mapping from author IDs to document IDs.

    Parameters
    ----------
    doc2author: dict of (int, list of str)
        Mapping of document id to authors.

    Returns
    -------
    dict of (str, list of int)
        Mapping of authors to document ids.

    )setr'   addr(   )r+   authors_idsr,   r0   r/   r*   a_idss          r   construct_author2docr7      s     %%K"((**  9 	 	AOOA	 J ( (
1"((** 	( 	(HAuEz (1$$Q'''	( r   c                       e Zd ZdZ	 	 	 	 	 ddZd Zd Zd Zd ZddZ	ddZ
ddZ	 	 	 ddZd dZd!dZd!dZd!dZd!dZdS )"AuthorTopicModelzWThe constructor estimates the author-topic model parameters based on a training corpus.Nd        2         ?      ?	symmetric
   MbP?F{Gz?c                 r   t           j        | _        d}d| _        d| _        || _        || j        t          d          | j        Mt                              d           t          j
        |          | _        t          | j                  | _        nNt          | j                  dk    r/dt          | j                                                  z   | _        nd| _        | j        dk    rt          d          t                              d| j                   i | _        i | _        || _        || _        d| _        || _        |	| _        |
| _        || _        d| _        d| _        || _        || _        || _        i | _        i | _        || _         |r|st          d	          |r|rtC          |          r
J d
            || _"        | #                                 | $                    |d          \  | _%        | _&        | j%        j'        | j        fk    s+J dtQ          | j%        j'                  | j        fz              | $                    |d          \  | _)        | _*        | j)        j'        | j        fk    sS| j)        j'        | j        | j        fk    s7J dtQ          | j)        j'                  | j        | j        | j        fz              t          j+        |          | _,        || _-        || _.        t_          | j)        | j        | j        f| j        | j        f          | _0        | j,        1                    dd| j        | j        f          | j0        _2        t          j3        ti          | j0        j2                            | _5        |(||&| j        du}| 6                    ||||           dS dS dS )a  

        Parameters
        ----------
        corpus : iterable of list of (int, float), optional
            Corpus in BoW format
        num_topics : int, optional
            Number of topics to be extracted from the training corpus.
        id2word : :class:`~gensim.corpora.dictionary.Dictionary`, optional
            A mapping from word ids (integers) to words (strings).
        author2doc : dict of (str, list of int), optional
            A dictionary where keys are the names of authors and values are lists of document IDs that the author
            contributes to.
        doc2author : dict of (int, list of str), optional
            A dictionary where the keys are document IDs and the values are lists of author names.
        chunksize : int, optional
            Controls the size of the mini-batches.
        passes : int, optional
            Number of times the model makes a pass over the entire training data.
        iterations : int, optional
            Maximum number of times the model loops over each document.
        decay : float, optional
            A number between (0.5, 1] to weight what percentage of the previous lambda value is forgotten
            when each new document is examined. Corresponds to :math:`\kappa` from
            `'Online Learning for LDA' by Hoffman et al.`_
        offset : float, optional
            Hyper-parameter that controls how much we will slow down the first steps the first few iterations.
            Corresponds to :math:`\tau_0` from `'Online Learning for LDA' by Hoffman et al.`_
        alpha : {float, numpy.ndarray of float, list of float, str}, optional
            A-priori belief on document-topic distribution, this can be:
                * scalar for a symmetric prior over document-topic distribution,
                * 1D array of length equal to num_topics to denote an asymmetric user defined prior for each topic.

            Alternatively default prior selecting strategies can be employed by supplying a string:
                * 'symmetric': (default) Uses a fixed symmetric prior of `1.0 / num_topics`,
                * 'asymmetric': Uses a fixed normalized asymmetric prior of `1.0 / (topic_index + sqrt(num_topics))`,
                * 'auto': Learns an asymmetric prior from the corpus (not available if `distributed==True`).
        eta : {float, numpy.ndarray of float, list of float, str}, optional
            A-priori belief on topic-word distribution, this can be:
                * scalar for a symmetric prior over topic-word distribution,
                * 1D array of length equal to num_words to denote an asymmetric user defined prior for each word,
                * matrix of shape (num_topics, num_words) to assign a probability for each word-topic combination.

            Alternatively default prior selecting strategies can be employed by supplying a string:
                * 'symmetric': (default) Uses a fixed symmetric prior of `1.0 / num_topics`,
                * 'auto': Learns an asymmetric prior from the corpus.
        update_every : int, optional
            Make updates in topic probability for latest mini-batch.
        eval_every : int, optional
            Calculate and estimate log perplexity for latest mini-batch.
        gamma_threshold : float, optional
            Threshold value of gamma(topic difference between consecutive two topics)
            until which the iterations continue.
        serialized : bool, optional
            Indicates whether the input corpora to the model are simple lists
            or saved to the hard-drive.
        serialization_path : str, optional
            Must be set to a filepath, if `serialized = True` is used.
        minimum_probability : float, optional
            Controls filtering the topics returned for a document (bow).
        random_state : {int, numpy.random.RandomState}, optional
            Set the state of the random number generator inside the author-topic model.

        FNr<   zYat least one of corpus/id2word must be specified, to establish input space dimensionalityzHno word id mapping provided; initializing from corpus, assuming identityr   zIcannot compute the author-topic model over an empty collection (no terms)z Vocabulary consists of %d words.z{If serialized corpora are used, a the path to a folder where the corpus should be saved must be provided (serialized_path).zpA file already exists at the serialization_path path; choose a different serialization_path, or delete the file.alphaz6Invalid alpha shape. Got shape %s, but expected (%d, )r   zAInvalid eta shape. Got shape %s, but expected (%d, 1) or (%d, %d)      Y@rC   )chunks_as_numpy)7r   r   r   
dispatcher
numworkersid2word
ValueErrorloggerwarningr	   dict_from_corpuslen	num_termsmaxkeysinfor*   r+   distributed
num_topicsnum_authors	chunksizedecayoffsetminimum_probabilitynum_updates
total_docspassesupdate_every
eval_every	author2id	id2author
serializedr   serialization_pathinit_empty_corpusinit_dir_priorrE   optimize_alphashapestrr   optimize_etaget_random_staterandom_state
iterationsgamma_thresholdr   stater   r   expr   expElogbetaupdate)r   r)   rU   rJ   r*   r+   rW   r]   rl   rX   rY   rE   r   r^   r_   rm   rb   rc   rZ   rk   rT   	use_numpys                         r   r   zAuthorTopicModel.__init__   s   N Z

  	dl 	k   < 	NNefff 1&99DL ..DNN" 	T\%6%6%8%8!9!99DNNDN>Q 	jhiii6GGG&$"
#6 ($$ 	0 	W    	M, 	M011 M MMM M M #5 	   *.*=*=eW*M*M'
D'zDO#55 	p 	pDDJL\H]H]_c_nGoo	p 	p 	p '+&9&9#u&E&E#$#4>"33 	
tx~$/[_[iIj7j 	
 	
O  $.$/4>RS	
 	
 	

 "2<@@ %. &dh$.0QTXTdfjfuSvww
 -33D)doW[WeEfgg
6"7
8I"J"JKK  	S: 	SZ 	St3IKK
J	KRRRRR	S 	S 	S 	Sr   c                 p    | j         j        d| j        d| j        d| j        d| j        d| j        dS )zGet a string representation of object.

        Returns
        -------
        str
            String representation of current instance.

        z<num_terms=z, num_topics=z, num_authors=z, decay=z, chunksize=>)	__class__r    rP   rU   rV   rX   rW   r   s    r   __str__zAuthorTopicModel.__str__E  sQ     ^$$$dnnndoootGWGWGWY]YcYcYceiesesesu 	ur   c                     | j         r5t          j        | j        g            t          | j                  | _        dS g | _        dS )zInitialize an empty corpus.
        If the corpora are to be treated as lists, simply initialize an empty list.
        If serialization is used, initialize an empty corpus using :class:`~gensim.corpora.mmcorpus.MmCorpus`.

        N)rb   r   	serializerc   r)   rv   s    r   rd   z"AuthorTopicModel.init_empty_corpusQ  sI     ? 	 t6;;;"4#:;;DKKK DKKKr   c                    | j         rt          |t                    r| j        j        |j        k    s
J d            t          | j        |          }t          | j        | j        dz              | j        dz   | j        _        t          j        | j        |           t          | j                  | _        t          | j        dz              dS t          |t                    s
J d            | j                            |           dS )a  Add new documents from `corpus` to `self.corpus`.

        If serialization is used, then the entire corpus (`self.corpus`) is re-serialized and the new documents
        are added in the process. If serialization is not used, the corpus, as a list of documents, is simply extended.

        Parameters
        ----------
        corpus : iterable of list of (int, float)
            Corpus in BoW format

        Raises
        ------
        AssertionError
            If serialized == False and corpus isn't list.

        zUInput corpus cannot have the same file path as the model corpus (serialization_path).z.tmpz8If serialized == False, all input corpora must be lists.N)rb   
isinstancer   r)   inputr   r   rc   ry   r   listextend)r   r)   corpus_chains      r   extend_corpuszAuthorTopicModel.extend_corpus`  s   " ? 	'&(++ l{(FL8 l lkl l l f55LT,d.E.NOOO $ 7& @DKt6EEE"4#:;;DK4*V344444 fd++gg-ggggKv&&&&&r   c                 b    |                     d          }|                    |          dz   }|S )a  Efficiently computes the normalizing factor in phi.

        Parameters
        ----------
        expElogthetad: numpy.ndarray
            Value of variational distribution :math:`q(\theta|\gamma)`.
        expElogbetad: numpy.ndarray
            Value of variational distribution :math:`q(\beta|\lambda)`.

        Returns
        -------
        float
            Value of normalizing factor.

        r   axisg0.++)sumdot)r   expElogthetadexpElogbetadexpElogtheta_sumphinorms        r   compute_phinormz AuthorTopicModel.compute_phinorm  s8      ),,!,44"&&|44v=r   c           	          	 t          |           n# t          $ r t          |          }Y nw xY wt          |          dk    r(t                              dt          |                     |rt          j         j                  }nd}d}t          j        d j	        f          }	t          |          D ]\  }
}|	||
         }n|
}|r:t          |d         d         t          t
          j        f          sd |D             }nd |D             }t          j        |t                    }t          j        d |D             t          t          |          	          }t          j         fd
 j        |         D             t                    } j        j        |ddf         }|                                }t)          |          }t          j        |          } j        dd|f         }                     ||          }t/           j                  D ]}|                                }t          j        ||z  |j                  }t          |          D ]I\  }} j        t           j         j        |                            ||ddf         z  |z  z   ||ddf<   Jd|z
  |z  ||z  z   }t)          |          }t          j        |          }                     ||          }t=          |                                |                                          }| j         k     }|r|dz  } n| j        j        |ddf<   t          j!        |	|g          }	|rE|"                    d          }|dd|fxx         t          j#        |j        ||z            z  cc<   t          |          dk    r/t                              d|t          |           j                   |r
| j        z  }|	|fS )a  Give a `chunk` of sparse document vectors, update gamma for each author corresponding to the `chuck`.

        Warnings
        --------
        The whole input chunk of document is assumed to fit in RAM, chunking of a large corpus must be done earlier
        in the pipeline.

        Avoids computing the `phi` variational parameter directly using the
        optimization presented in `Lee, Seung: "Algorithms for non-negative matrix factorization", NIPS 2001
        <https://papers.nips.cc/paper/1861-algorithms-for-non-negative-matrix-factorization.pdf>`_.

        Parameters
        ----------
        chunk : iterable of list of (int, float)
            Corpus in BoW format.
        author2doc : dict of (str, list of int), optional
            A dictionary where keys are the names of authors and values are lists of document IDs that the author
            contributes to.
        doc2author : dict of (int, list of str), optional
            A dictionary where the keys are document IDs and the values are lists of author names.
        rhot : float
            Value of rho for conducting inference on documents.
        collect_sstats : boolean, optional
            If True - collect sufficient statistics needed to update the model's topic-word distributions, and return
            `(gamma_chunk, sstats)`. Otherwise, return `(gamma_chunk, None)`. `gamma_chunk` is of shape
            `len(chunk_authors) x self.num_topics`,where `chunk_authors` is the number of authors in the documents in
            the current chunk.
        chunk_doc_idx : numpy.ndarray, optional
            Assigns the value for document index.

        Returns
        -------
        (numpy.ndarray, numpy.ndarray)
            gamma_chunk and sstats (if `collect_sstats == True`, otherwise - None)

        r<   z/performing inference on a chunk of %i documentsNr   c                 2    g | ]\  }}t          |          S r$   )int.0idxr-   s      r   
<listcomp>z.AuthorTopicModel.inference.<locals>.<listcomp>  s"    222FCs3xx222r   c                     g | ]\  }}|S r$   r$   r   s      r   r   z.AuthorTopicModel.inference.<locals>.<listcomp>  s    ---vsAs---r   r   c              3       K   | ]	\  }}|V  
d S Nr$   r   r-   cnts      r   	<genexpr>z-AuthorTopicModel.inference.<locals>.<genexpr>  &      55vq#s555555r   r   countc              3   2   K   | ]}j         |         V  d S r   r`   r   r/   r   s     r   r   z-AuthorTopicModel.inference.<locals>.<genexpr>  *      $X$X1T^A%6$X$X$X$X$X$Xr   r   z.%i/%i documents converged within %i iterations)$rO   	TypeErrorr}   rL   debugr   
zeros_likerp   r   rU   r&   r{   r   integerarrayfromiterr+   rn   r   copyr   ro   r   rangerl   r   TrE   r*   ra   r   ravelrm   vstackr   outer)r   chunkr*   r+   rhotcollect_sstatschunk_doc_idxr   	convergedgamma_chunkr,   docdoc_noidscts	authors_dgammadtilde_gamma
Elogthetadr   r   r   r-   	lastgammar   air/   meanchange_gammagamma_conditionexpElogtheta_sum_as   `                             r   	inferencezAuthorTopicModel.inference  sX   J	 JJJJ 	  	  	 KKEEE	  u::> 	XLLJCPUJJWWW  	]4#344FFF	 h4?344  && I	P I	PFAs &q)  .:c!fQi#rz1CDD .22c222-----(3c***C+55555SCQQQC $X$X$X$XPV@W$X$X$X`cdddIZ%il3F ++--K /{;;JF:..M+AAAsF3L **=,GGG 4?++  ',,..	 fS7]LN;;&y11  EB
dodnQ.?@AAMRTVWVWVWRWDXX[^^_  AAA&&  !4x61D;4FF 3;??
 "z 2 2 ..}lKK $<K<M<M<O<OQZQ`Q`QbQb#c#c "2T5I"I" NIE .9DJY\* )[+$>??K P &3%6%6A%6%>%>"qqq#v"(+=+?w"O"OOu::> 	LL@3u::t  
  	'
 d&&FF""s    //c                     || j         }|                     ||||d|          \  }}|xj        |z  c_        |xj        t	          |          z  c_        |S )a  Performs inference (E-step) on a chunk of documents, and accumulate the collected sufficient statistics.

        Parameters
        ----------
        chunk : iterable of list of (int, float)
            Corpus in BoW format.
        author2doc : dict of (str, list of int), optional
            A dictionary where keys are the names of authors and values are lists of document IDs that the author
            contributes to.
        doc2author : dict of (int, list of str), optional
            A dictionary where the keys are document IDs and the values are lists of author names.
        rhot : float
            Value of rho for conducting inference on documents.
        state : int, optional
            Initializes the state for a new E iteration.
        chunk_doc_idx : numpy.ndarray, optional
            Assigns the value for document index.

        Returns
        -------
        float
            Value of gamma for training of model.

        NTr   r   )rn   r   r   r   rO   )	r   r   r*   r+   r   rn   r   r   r   s	            r   do_estepzAuthorTopicModel.do_estep+  sj    4  	JE:z4} ' 
 
v 	U#r   c                 <   |t          |          }t          d |D                       }d|z  t          |          z  }|                     |||          ||z  z  }t                              d|t          j        |           t          |          |           |S )a  Calculate per-word likelihood bound, using the `chunk` of documents as evaluation corpus.

        Parameters
        ----------
        chunk : iterable of list of (int, float)
            Corpus in BoW format.
        chunk_doc_idx : numpy.ndarray, optional
            Assigns the value for document index.
        total_docs : int, optional
            Initializes the value for total number of documents.

        Returns
        -------
        float
            Value of per-word likelihood bound.

        Nc              3   *   K   | ]}|D ]	\  }}|V  
d S r   r$   )r   documentr-   r   s       r   r   z2AuthorTopicModel.log_perplexity.<locals>.<genexpr>d  s3      LL88LLC3LLLLLLLr   r?   )subsample_ratioz]%.3f per-word bound, %.1f perplexity estimate based on a corpus of %i documents with %i words)rO   r   boundrL   rS   r   exp2)r   r   r   r\   corpus_wordsr   perwordbounds          r   log_perplexityzAuthorTopicModel.log_perplexityO  s    &  	$UJLLuLLLLL
*SZZ7zz%zXX|+-k"'<-00#e**l	
 	
 	
 r   c                 (   %&  j          j        | j        }| j        }|	 j        }	|
 j        }
| j        }t          |          }t          |          }|I j        dk    s
J d            d t           j                  D             }t           j                  }n||t          d          |t          ||          }n|t          |          }t          |          }	 t          |          %nC# t          $ r6 t                               d           t%          d |D                       %Y nw xY w%dk    rt                               d           dS  xj        %z  c_                             |           g }t)          |                                          D ]1} j                            |          s|                    |           2t          |          }t1          |          D ])\  }}| j        z    j        |<   | j        | j        z   <   * xj        |z  c_         j                            d	d
| j        f          }t?          j          j!        j        |g           j!        _        |"                                D ]\  }}% fd|D             }|"                                D ]J\  }} j                            |          r! j        |         #                    |           @| j        |<   K|"                                D ]\  }}| j$        |<   tK                      } j        &                                D ]}|'                    |           t)          |          }t          |          }tQ          | j)                   j!        xj*        |z  c_*        |rd}tQ          || j+        z  z            }nd}|}tQ          ||	pd j+        z  z            }tY          d||z            }t           -                    d| j        ||||||
|
  
         ||z  dk     rt                               d           & fd}t          |          D ]& j.        r@t           -                    d j+                    j.        /                     j!                   n%ta           j1         j!        j2        j3        d          }d}d} t1          ti          j5        ||                    D ]\  }!}" fd|"D             }#| t          |#          z  } |	r2| |k    s|!dz   |	 j+        z  z  dk    r 6                    |#|"|            j.        rKt           -                    d&|!z  t          |#          z   |            j.        7                    |#           nt           -                    d&|!z  t          |#          z   |            8                    |# j         j$         |            ||"          }$ j9        r :                    |$ |                       d}~#|r|!dz   | j+        z  z  dk    rƉ j.        r3t           -                    d            j.        ;                                } <                     |            |&dk               ~ j.        r:t           -                    d            j.        /                     j!                   n%ta           j1         j!        j2        j3        d          }d}| |k    rt{          d          |r^ j.        r3t           -                    d            j.        ;                                } <                     |            |&dk               ~dS )a  Train the model with new documents, by EM-iterating over `corpus` until the topics converge (or until the
        maximum number of allowed iterations is reached).

        Notes
        -----
        This update also supports updating an already trained model (`self`) with new documents from `corpus`;
        the two models are then merged in proportion to the number of old vs. new documents.
        This feature is still experimental for non-stationary input streams.

        For stationary input (no topic drift in new documents), on the other hand, this equals the
        online update of `'Online Learning for LDA' by Hoffman et al.`_
        and is guaranteed to converge for any `decay` in (0.5, 1]. Additionally, for smaller corpus sizes, an
        increasing `offset` may be beneficial (see Table 1 in the same paper).

        If update is called with authors that already exist in the model, it will resume training on not only new
        documents for that author, but also the previously seen documents. This is necessary for those authors' topic
        distributions to converge.

        Every time `update(corpus, author2doc)` is called, the new documents are to appended to all the previously seen
        documents, and author2doc is combined with the previously seen authors.

        To resume training on all the data seen by the model, simply call
        :meth:`~gensim.models.atmodel.AuthorTopicModel.update`.

        It is not possible to add new authors to existing documents, as all documents in `corpus` are assumed to be
        new documents.

        Parameters
        ----------
        corpus : iterable of list of (int, float)
            The corpus in BoW format.
        author2doc : dict of (str, list of int), optional
            A dictionary where keys are the names of authors and values are lists of document IDs that the author
            contributes to.
        doc2author : dict of (int, list of str), optional
            A dictionary where the keys are document IDs and the values are lists of author names.
        chunksize : int, optional
            Controls the size of the mini-batches.
        decay : float, optional
            A number between (0.5, 1] to weight what percentage of the previous lambda value is forgotten
            when each new document is examined. Corresponds to :math:`\kappa` from
            `'Online Learning for LDA' by Hoffman et al.`_
        offset : float, optional
            Hyper-parameter that controls how much we will slow down the first steps the first few iterations.
            Corresponds to :math:`\tau_0` from `'Online Learning for LDA' by Hoffman et al.`_
        passes : int, optional
            Number of times the model makes a pass over the entire training data.
        update_every : int, optional
            Make updates in topic probability for latest mini-batch.
        eval_every : int, optional
            Calculate and estimate log perplexity for latest mini-batch.
        iterations : int, optional
            Maximum number of times the model loops over each document
        gamma_threshold : float, optional
            Threshold value of gamma(topic difference between consecutive two topics)
            until which the iterations continue.
        chunks_as_numpy : bool, optional
            Whether each chunk passed to :meth:`~gensim.models.atmodel.AuthorTopicModel.inference` should be a numpy
            array of not. Numpy can in some settings turn the term IDs into floats, these will be converted back into
            integers in inference, which incurs a performance hit. For distributed computing (not supported now)
            it may be desirable to keep the chunks as numpy arrays.

        Nr   z2update() was called with no documents to train on.c                     g | ]}|S r$   r$   )r   r,   s     r   r   z+AuthorTopicModel.update.<locals>.<listcomp>  s    BBBaBBBr   z`at least one of author2doc/doc2author must be specified, to establish input space dimensionality4input corpus stream has no len(); counting documentsc              3      K   | ]}d V  dS r<   Nr$   r   r-   s     r   r   z*AuthorTopicModel.update.<locals>.<genexpr>  s"      &9&9Qq&9&9&9&9&9&9r   z5AuthorTopicModel.update() called with an empty corpusrF   rC   c                 *    g | ]}|j         z   z
  S r$   r\   )r   r,   len_input_corpusr   s     r   r   z+AuthorTopicModel.update.<locals>.<listcomp>  s&    SSSa1t.1AASSSr   onlinebatchr<   zrunning %s author-topic training, %s topics, %s authors, %i passes over the supplied corpus of %i documents, updating model once every %i documents, evaluating perplexity every %i documents, iterating %ix with a convergence threshold of %frA   zxtoo few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracyc                  B    t          z   j         z  z              S r   )powr[   )rW   rX   rY   pass_r   s   r   rhoz$AuthorTopicModel.update.<locals>.rho@  s&    v~)9I)EFOOOr   zinitializing %s workers)r   r   F)as_numpyc                 *    g | ]}j         |         S r$   )r)   )r   r,   r   s     r   r   z+AuthorTopicModel.update.<locals>.<listcomp>O  s    ???AQ???r   r   z5PROGRESS: pass %i, dispatching documents up to #%i/%iz%PROGRESS: pass %i, at document #%i/%iTzFreached the end of input; now waiting for all remaining jobs to finishzinitializing workerszIinput corpus size changed during training (don't use generators as input))>rX   rY   r]   r^   r_   rl   rm   r   r\   r   rO   r*   rK   r1   r7   r   rL   rM   r   r   sortedrR   getr(   r&   rV   r`   ra   rk   r   rU   r   r   rn   r'   r~   r+   r3   valuesrq   minrW   r   rI   rQ   rS   rH   resetr   r   r   rg   r	   grouperr   putjobr   rf   update_alphagetstatedo_mstepRuntimeError)'r   r)   r*   r+   rW   rX   rY   r]   r^   r_   rl   rm   rG   train_corpus_idxnum_input_authorsnew_authorsr/   num_new_authorsa_ida_name	gamma_newdoc_idsr,   a_list	lencorpus
updatetypeupdateafter	evalafterupdates_per_passr   otherdirtyreallenchunk_nor   r   gammatr   r   s'   `   ```                              @@r   rq   zAuthorTopicModel.updaten  s   F  	JE 	![F 	![F 	-,L 	)J 	)J 	3"2O j))
j))
  Q	8 ?Q&\\(\\\\BB5+A+ABBB #DO 4 4 j  v  
  >1&*EE

 >1*==
 !$J:#&v;;   : : :UVVV#&&9&9&&9&9&9#9#9   :  1$ VWWWOO//OO v&&& KJOO--.. * ***1-- *&&q)))!+..O !*+ 6 6 A Af)-0@)@v&:@td&6677 / )//i/SWSbAcddI!y$**:I)FGGDJ )..00 T T
7SSSSS7SSS )..00 1 1
7?&&q)) 1OA&--g6666 *1DOA&& (--// , ,	6%+""  #uu?1133 1 1 ''0000  &&677 ())	 	7It~66I
i' 	$!Ji)G))STTKK J#K	JO!t#F#RSS	q)k"9::? ):FI{z?	
 	
 	
 f$r) 	NN]  	P 	P 	P 	P 	P 	P 	P 	P 	P 6]] E	 E	E T5tGGG%%dj1111 )4:3D3JFSSEG+4M"2IXXX,Z ,Z /" /"'-???????3u::% TGy$8 Tx!|PZ]a]lPl>mqr>r T ''}'SSS? 9KKOx)3c%jj@)  
 O**51111KK?x)3c%jj@)   "]]5$/4?TWTWTYTY[`boppF* 9))&##%%888   "X\lT_6T$UYZ$Z " ;$lmmm $ 8 8 : :MM##%%	::: \$:;;;--dj9999 04:;L;RTZ [ [!E)# p"#nooo ? 7KK hiii O4466EcceeUEAI666KE	 E	s   6D =EEc                      j                                         }t          |          }t          j        |          } j         j        }	|"|  j        } j        }|st          d          ne|T|R|	                                D ]+}
 j        
                    |
          st          d          ,|rt          d          nt          d          t          |	          }t          j        |          }d}d}t          |          D ]]\  }}|r	||         }n|}t          j         fd j        |         D             t                    }t          j        d	 |D             t          t          |          
          }t          j        d |D             t          t          |          
          }| j        z  dk    rt                               d|                                ||ddf         |dd|f                   }|t          j        dt          |          z            t)          |          z  |                    t          j        |                    z   z  }_||z  }|	                                D ]}
 j        |
         }
|t          j         j        |	|
ddf         z
  ||
ddf         z            z  }|t          j        t1          |	|
ddf                   t1           j                  z
            z  }|t1          t          j         j                            t1          t          j        |	|
ddf                             z
  z  }| j        t          |          z  z  }d}|t          j         j        |z
  |z            z  }|t          j        t1          |          t1           j                  z
            z  }t          j         j                  }|t          j        t1          |          t1          t          j        |d                    z
            z  }||z   |z   }|S )a  Estimate the variational bound of documents from `corpus`.

        :math:`\mathbb{E_{q}}[\log p(corpus)] - \mathbb{E_{q}}[\log q(corpus)]`

        Notes
        -----
        There are basically two use cases of this method:

        #. `chunk` is a subset of the training corpus, and `chunk_doc_idx` is provided,
           indicating the indexes of the documents in the training corpus.
        #. `chunk` is a test set (held-out data), and `author2doc` and `doc2author` corresponding to this test set
           are provided. There must not be any new authors passed to this method, `chunk_doc_idx` is not needed
           in this case.

        Parameters
        ----------
        chunk : iterable of list of (int, float)
            Corpus in BoW format.
        chunk_doc_idx : numpy.ndarray, optional
            Assigns the value for document index.
        subsample_ratio : float, optional
            Used for calculation of word score for estimation of variational bound.
        author2doc : dict of (str, list of int), optional
            A dictionary where keys are the names of authors and values are lists of documents that the author
            contributes to.
        doc2author : dict of (int, list of str), optional
            A dictionary where the keys are document IDs and the values are lists of author names.

        Returns
        -------
        float
            Value of variational bound score.

        NzdEither author dictionaries or chunk_doc_idx must be provided. Consult documentation of bound method.z=bound cannot be called with authors not seen during training.znEither author dictionaries or chunk_doc_idx must be provided, not both. Consult documentation of bound method.zlEither both author2doc and doc2author should be provided, or neither. Consult documentation of bound method.g        c              3   2   K   | ]}j         |         V  d S r   r   r   s     r   r   z)AuthorTopicModel.bound.<locals>.<genexpr>  r   r   r   c              3       K   | ]	\  }}|V  
d S r   r$   )r   idr-   s      r   r   z)AuthorTopicModel.bound.<locals>.<genexpr>  s&      33eb!r333333r   r   c              3       K   | ]	\  }}|V  
d S r   r$   r   s      r   r   z)AuthorTopicModel.bound.<locals>.<genexpr>  r   r   r   zbound: at document #%i in chunkr?   r<   )rn   
get_lambdar   r   ro   r   r*   r+   rK   rR   r   r&   r   r   rO   rW   rL   r   r   logr   r   r`   rE   r   rV   r   )r   r   r   r   r*   r+   _lambdaElogbetarp   r   r/   	ElogthetaexpElogtheta
word_scoretheta_scorer,   r   r   r   r   r   r   
beta_scoresum_etatotal_scores   `                        r   r   zAuthorTopicModel.bound  sZ   T *''))(11fX&&
  	* 	JJ   !=    	
 	  __&& f f**1-- f$%deeef   =   9  
 *%00	vi((
&& 	] 	]FAs &q)$X$X$X$XPV@W$X$X$X`cdddI+33s3333c#hhOOOC+55555SCQQQC4>!Q& C>BBB **<	111+E{STSTSTVYSYGZ[[G"&s9~~!566SACGGBFSZOOD\D\\\JJ 	o%
 "" 	V 	VAq!A264:ad#;yAAA"NOOOK26'%111+"6"69L9L"LMMMK726$*#5#566aQRQRQRdATAT9U9UUUKK 	t'#j//99 
bfdh0H<===
bfWW--0A0AABBB
&""bfWW--w8J8J0K0KKLLL
 ;.;r   c                      t          d          )a  Override :meth:`~gensim.models.ldamodel.LdaModel.get_document_topics` and simply raises an exception.

        Warnings
        --------
        This method invalid for model, use :meth:`~gensim.models.atmodel.AuthorTopicModel.get_author_topics` or
        :meth:`~gensim.models.atmodel.AuthorTopicModel.get_new_author_topics` instead.

        Raises
        ------
        NotImplementedError
            Always.

        ziMethod "get_document_topics" is not valid for the author-topic model. Use the "get_author_topics" method.)NotImplementedError)r   word_idrZ   s      r   get_document_topicsz$AuthorTopicModel.get_document_topics	  s     "2
 
 	
r   c                 H     fd} fd}	 t          |          }nC# t          $ r6 t                              d           t	          d |D                       }Y nw xY w|dk    rt          d          dt          t           j         j        |z                       d} j	        } j
        v rt          d	          | j
        <    j        |<    j        <   D ]}g j        |<    j                            d
d| j        f          }	t#          j         j        j        |	g           j        _        	                      | j         j         |            d          \  }
}                     |          } |             n#  |             w xY w|S )a5  Infers topics for new author.

        Infers a topic distribution for a new author over the passed corpus of docs,
        assuming that all documents are from this single new author.

        Parameters
        ----------
        corpus : iterable of list of (int, float)
            Corpus in BoW format.
        minimum_probability : float, optional
            Ignore topics with probability below this value, if None - 1e-8 is used.

        Returns
        -------
        list of (int, float)
            Topic distribution for the given `corpus`.

        c                  F    t           j        dz   dz    j                   S )Nr<   )r   rY   rX   rv   s   r   r   z3AuthorTopicModel.get_new_author_topics.<locals>.rho/  s"    t{Q*TZK888r   c                      j         j        dd         j         _        j        = j                 } j        | = j        = D ]
}j        |= d S )Nr   )rn   r   r*   r`   ra   r+   )r   
new_doc_idcorpus_doc_idxnew_author_namer   s     r   rollback_new_author_chageszJAuthorTopicModel.get_new_author_topics.<locals>.rollback_new_author_chages2  sk    #z/"5DJ0>/2Dt$/, 0 0
OJ//0 0r   r   c              3      K   | ]}d V  dS r   r$   r   s     r   r   z9AuthorTopicModel.get_new_author_topics.<locals>.<genexpr>A  s"      "5"51"5"5"5"5"5"5r   r   zDAuthorTopicModel.get_new_author_topics() called with an empty corpusplaceholder_namer<   z4self.author2id already has 'placeholder_name' authorrF   rC   Fr   )rO   r   rL   rM   r   rK   r}   r   r\   rV   r`   ra   r*   r+   rk   r   rU   r   r   rn   r   get_author_topics)r   r)   rZ   r   r  r   r   	author_idr  r   r   r-   new_author_topicsr  r  s   `            @@r   get_new_author_topicsz&AuthorTopicModel.get_new_author_topics  s   &	9 	9 	9 	9 	9		0 		0 		0 		0 		0 		0 		0	6"6{{ 	6 	6 	6NNQRRR""5"5f"5"5"555	6 q  	ecddd,eDOT_GW5WXXYY $	dn, 	USTTT*3'$3y! ,:(( 	< 	<J+:*;DOJ''%++D)ot=_``	9dj&6	%BCC
	)##%%$N '  IFA !% 6 6H[ \ \&&((((&&((((  s   ! =A! A!AF Fc                     | j         |         }| j        t          d          | j        j        |ddf         t          | j        j        |ddf                   z  }fdt          |          D             }|S )a  Get topic distribution the given author.

        Parameters
        ----------
        author_name : str
            Name of the author for which the topic distribution needs to be estimated.
        minimum_probability : float, optional
            Sets the minimum probability value for showing the topics of a given author, topics with probability <
            `minimum_probability` will be ignored.

        Returns
        -------
        list of (int, float)
            Topic distribution of an author.

        Example
        -------
        .. sourcecode:: pycon

            >>> from gensim.models import AuthorTopicModel
            >>> from gensim.corpora import mmcorpus
            >>> from gensim.test.utils import common_dictionary, datapath, temporary_file

            >>> author2doc = {
            ...     'john': [0, 1, 2, 3, 4, 5, 6],
            ...     'jane': [2, 3, 4, 5, 6, 7, 8],
            ...     'jack': [0, 2, 4, 6, 8]
            ... }
            >>>
            >>> corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm'))
            >>>
            >>> with temporary_file("serialized") as s_path:
            ...     model = AuthorTopicModel(
            ...         corpus, author2doc=author2doc, id2word=common_dictionary, num_topics=4,
            ...         serialized=True, serialization_path=s_path
            ...     )
            ...
            ...     model.update(corpus, author2doc)  # update the author-topic model with additional documents
            >>>
            >>> # construct vectors for authors
            >>> author_vecs = [model.get_author_topics(author) for author in model.id2author.values()]

        Ng:0yE>c                 *    g | ]\  }}|k    ||fS r$   r$   )r   topicid
topicvaluerZ   s      r   r   z6AuthorTopicModel.get_author_topics.<locals>.<listcomp>  s>     
 
 
&9gz00
j!
 
 
r   )r`   rZ   rQ   rn   r   r   r&   )r   author_namerZ   r  
topic_distauthor_topicss     `   r   r  z"AuthorTopicModel.get_author_topicsd  s    X N;/	 	;"&":!"5t<<Z%il3c$*:J9VWVWVW<:X6Y6YY

 
 
 
=Fz=R=R
 
 

 r   c                     t          |t                    r2g }|D ],}|                    |                     ||                     -n|                     ||          }|S )aP  Get topic distribution for input `author_names`.

        Parameters
        ----------
        author_names : {str, list of str}
            Name(s) of the author for which the topic distribution needs to be estimated.
        eps : float, optional
            The minimum probability value for showing the topics of a given author, topics with probability < `eps`
            will be ignored.

        Returns
        -------
        list of (int, float) **or** list of list of (int, float)
            Topic distribution for the author(s), type depends on type of `author_names`.

        )rZ   )r{   r}   r(   r  )r   author_namesepsr'   r/   s        r   __getitem__zAuthorTopicModel.__getitem__  sz    " lD)) 	RE! Q QT33A33OOPPPPQ **<S*QQEr   )Nr:   NNNr;   r<   r=   r>   r?   r@   r@   r<   rA   rB   FNrC   N)FN)NN)NNNNNNNNNNNF)Nr?   NNr   )r    r!   r"   r#   r   rw   rd   r   r   r   r   r   rq   r   r  r  r  r'  r$   r   r   r9   r9      s]       aa^bLOPRMQ8<	_S _S _S _SB
u 
u 
u  #' #' #'J  *O# O# O# O#b" " " "H   > hlKO5:Z Z Z Zx} } } }~
 
 
 
&F! F! F! F!P9 9 9 9v     r   r9   )"r#   logging	itertoolsr   r   r   shutilr   os.pathr   osr   numpyr   scipy.specialr   gensimr	   gensim.modelsr
   gensim.models.ldamodelr   gensim.matutilsr   r   gensim.corporar   	getLoggerr    rL   r   r1   r7   r9   r$   r   r   <module>r5     s  / /l                                    ! ! ! ! ! !       " " " " " " + + + + + + K K K K K K K K # # # # # #		8	$	$         x      .  4  <V V V V Vx V V V V Vr   