
    c                        d Z ddlZddlZddlmZmZmZ ddlZddlm	Z	m
Z
mZ ddlZddlmZ ddlmZ ddlmZ ddlmZmZmZ dd	lmZ  ej        e          Zd
Z ej        ej                  j         Z!e G d d                      Z"e G d d                      Z#d Z$d Z%d Z&d Z'd Z(d Z)d"dZ*d"dZ+d Z,d Z-d"dZ.d Z/d Z0d Z1d Z2 G d de          Z3 G d  d!          Z4dS )#a  Ensemble Latent Dirichlet Allocation (eLDA), an algorithm for extracting reliable topics.

The aim of topic modelling is to find a set of topics that represent the global structure of a corpus of documents. One
issue that occurs with topics extracted from an NMF or LDA model is reproducibility. That is, if the topic model is
trained repeatedly allowing only the random seed to change, would the same (or similar) topic representation be reliably
learned. Unreliable topics are undesireable because they are not a good representation of the corpus.

Ensemble LDA addresses the issue by training an ensemble of topic models and throwing out topics that do not reoccur
across the ensemble. In this regard, the topics extracted are more reliable and there is the added benefit over many
topic models that the user does not need to know the exact number of topics ahead of time.

For more information, see the :ref:`citation section <Citation>` below, watch our `Machine Learning Prague 2019 talk
<https://slideslive.com/38913528/solving-the-text-labeling-challenge-with-ensemblelda-and-active-learning?locale=cs>`_,
or view our `Machine Learning Summer School poster
<https://github.com/aloosley/ensembleLDA/blob/master/mlss/mlss_poster_v2.pdf>`_.

Usage examples
--------------

Train an ensemble of LdaModels using a Gensim corpus:

.. sourcecode:: pycon

    >>> from gensim.test.utils import common_texts
    >>> from gensim.corpora.dictionary import Dictionary
    >>> from gensim.models import EnsembleLda
    >>>
    >>> # Create a corpus from a list of texts
    >>> common_dictionary = Dictionary(common_texts)
    >>> common_corpus = [common_dictionary.doc2bow(text) for text in common_texts]
    >>>
    >>> # Train the model on the corpus. corpus has to be provided as a
    >>> # keyword argument, as they are passed through to the children.
    >>> elda = EnsembleLda(corpus=common_corpus, id2word=common_dictionary, num_topics=10, num_models=4)

Save a model to disk, or reload a pre-trained model:

.. sourcecode:: pycon

    >>> from gensim.test.utils import datapath
    >>>
    >>> # Save model to disk.
    >>> temp_file = datapath("model")
    >>> elda.save(temp_file)
    >>>
    >>> # Load a potentially pretrained model from disk.
    >>> elda = EnsembleLda.load(temp_file)

Query, the model using new, unseen documents:

.. sourcecode:: pycon

    >>> # Create a new corpus, made of previously unseen documents.
    >>> other_texts = [
    ...     ['computer', 'time', 'graph'],
    ...     ['survey', 'response', 'eps'],
    ...     ['human', 'system', 'computer']
    ... ]
    >>> other_corpus = [common_dictionary.doc2bow(text) for text in other_texts]
    >>>
    >>> unseen_doc = other_corpus[0]
    >>> vector = elda[unseen_doc]  # get topic probability distribution for a document

Increase the ensemble size by adding a new model. Make sure it uses the same dictionary:

.. sourcecode:: pycon

    >>> from gensim.models import LdaModel
    >>> elda.add_model(LdaModel(common_corpus, id2word=common_dictionary, num_topics=10))
    >>> elda.recluster()
    >>> vector = elda[unseen_doc]

To optimize the ensemble for your specific case, the children can be clustered again using
different hyperparameters:

.. sourcecode:: pycon

    >>> elda.recluster(eps=0.2)

.. _Citation:

Citation
--------
BRIGL, Tobias, 2019, Extracting Reliable Topics using Ensemble Latent Dirichlet Allocation [Bachelor Thesis].
Technische Hochschule Ingolstadt. Munich: Data Reply GmbH. Supervised by Alex Loosley. Available from:
https://www.sezanzeb.de/machine_learning/ensemble_LDA/

    N)ProcessPipeProcessError)SetOptionalList)cosine)	dataclass)utils)ldamodelldamulticore	basemodel)SaveLoadg?c                   |    e Zd ZU eed<   ee         ed<   ee         ed<   ee         ed<   eed<   ee         ed<   dS )Topicis_coreneighboring_labelsneighboring_topic_indiceslabelnum_neighboring_labelsvalid_neighboring_labelsN)__name__
__module____qualname__bool__annotations__r   intr        9lib/python3.11/site-packages/gensim/models/ensemblelda.pyr   r   {   sg         MMMC   "3x'''C=!#h&&&&&r   r   c                   P    e Zd ZU eed<   eee                  ed<   eed<   eed<   dS )Clustermax_num_neighboring_labelsr   r   	num_coresN)r   r   r   r   r   r   r   r   r   r    r"   r"      sB          ####SX&&&JJJNNNNNr   r"   c                 2    | j         o| j        | j        hk    S )zCheck if the topic is a valid core, i.e. no neighboring valid cluster is overlapping with it.

    Parameters
    ----------
    topic : :class:`Topic`
        topic to validate

    )r   r   r   )topics    r    _is_valid_corer'      s     =Ne<MNr   c                 V    |D ]%}|j         D ]}| |v r|                    |            &dS )zTRemove a label from every set in "neighboring_labels" for each core in ``clusters``.N)r   remove)r   clustersclusterneighboring_labels_sets       r    _remove_from_all_setsr-      sV     5 5&-&@ 	5 	5".. 5&--e444	55 5r   c                 L     t           fd|j        D                       |k    S )zYCheck if the cluster has at least ``min_cores`` of cores that belong to no other cluster.c                     g | ]	}|hk    
S r   r   ).0r   r   s     r    
<listcomp>z,_contains_isolated_cores.<locals>.<listcomp>   s"    ccc2D"ug-cccr   )sumr   )r   r+   	min_coress   `  r    _contains_isolated_coresr4      s/    ccccHbcccddhqqqr   c                    g }|                                  D ]\  }}d}g }|D ]1}t          |j        |          }|                    |j                   2d |D             }|                    t          |||t          d |D                                            t                              dt          |                     |S )a  Aggregate the labeled topics to a list of clusters.

    Parameters
    ----------
    grouped_by_labels : dict of (int, list of :class:`Topic`)
        The return value of _group_by_labels. A mapping of the label to a list of each topic which belongs to the
        label.

    Returns
    -------
    list of :class:`Cluster`
        It is sorted by max_num_neighboring_labels in descending order. There is one single element for each cluster.

    r   c                 8    g | ]}t          |          d k    |S )r   )len)r0   xs     r    r1   z%_aggregate_topics.<locals>.<listcomp>   s'    JJJAs1vvzJaJJJr   c                      g | ]}|j         	|S r   )r   r0   r&   s     r    r1   z%_aggregate_topics.<locals>.<listcomp>   s    FFFUF5FFFr   )r#   r   r   r$   zfound %s clusters)	itemsmaxr   appendr   r"   r7   loggerinfo)grouped_by_labelsr*   r   topicsr#   r   r&   s          r    _aggregate_topicsrB      s     H*0022  v%&" 	@ 	@E),U-IKe)f)f&%%e&>????JJ);JJJ'A1FFfFFFGG	
 
 
 	 	 	 	 KK#S]]333Or   c                     i }| D ]M}|j         rDt          |j                  |_        |j        }||vrg ||<   ||                             |           N|S )aQ  Group all the learned cores by their label, which was assigned in the cluster_model.

    Parameters
    ----------
    cbdbscan_topics : list of :class:`Topic`
        A list of topic data resulting from fitting a :class:`~CBDBSCAN` object.
        After calling .fit on a CBDBSCAN model, the results can be retrieved from it by accessing the .results
        member, which can be used as the argument to this function. It is a list of infos gathered during
        the clustering step and each element in the list corresponds to a single topic.

    Returns
    -------
    dict of (int, list of :class:`Topic`)
        A mapping of the label to a list of topics that belong to that particular label. Also adds
        a new member to each topic called num_neighboring_labels, which is the number of
        neighboring_labels of that topic.

    )r   r7   r   r   r   r=   )cbdbscan_topicsr@   r&   r   s       r    _group_by_labelsrE      sv    &   3 3= 	3+.u/G+H+HE(KE-- .+-!%(e$++E222r   c                     | D ]-\  }}|                                  |                                  .|D ]+}|                                r|                                 ~,dS )a4  Close pipes and terminate processes.

    Parameters
    ----------
        pipes : {list of :class:`multiprocessing.Pipe`}
            list of pipes that the processes use to communicate with the parent
        processes : {list of :class:`multiprocessing.Process`}
            list of worker processes
    N)closeis_alive	terminate)pipes	processesiparent_conn
child_connprocesss         r    	_teardownrP      s     $)  Z   	 G r   c                     |d}t          j        |           ddd         }|                                |k     }||         d         }| |k    S )z3Original masking method. Returns a new binary mask.Ngffffff?)npsortcumsum)a	thresholdsorted_alargest_masssmallest_valids        r    mass_maskingr[     sU     	wqzz$$B$H??$$y0Ll+B/Nr   c                     |d}| t          j        |           ddd         t          t          |           |z                     k    S )z1Faster masking method. Returns a new binary mask.Ng)\(?rR   )rS   rT   r   r7   )rV   rW   s     r    rank_maskingr]     sE     	rwqzz$$B$CFFY$6 7 7888r   c                 6   d }t          | |d          }|D ]0}d|_        |j        |k     rd|_        t          |j        |           1d |D             D ]9}|j        }t          |||          rd|_        "d|_        t          ||           :d |D             S )z`Check which clusters from the cbdbscan step are significant enough. is_valid is set accordingly.c                 *    | j         | j        | j        fS N)r#   r$   r   )r+   s    r    _cluster_sort_keyz-_validate_clusters.<locals>._cluster_sort_key  s    173DgmSSr   F)keyreverseNc                      g | ]}|j         	|S r`   is_validr0   r+   s     r    r1   z&_validate_clusters.<locals>.<listcomp>'  s     WWWg>NWGWWWr   Tc                      g | ]}|j         	|S r   re   rg   s     r    r1   z&_validate_clusters.<locals>.<listcomp>/  s     GGGg6FGGGGGr   )sortedrf   r$   r-   r   r4   )r*   r3   ra   sorted_clustersr+   r   s         r    _validate_clustersrk     s    
T T T X+<eLLLO" B By( 	B$G!'-AAA XW?WWW : :#E7I>> 	:#G$G!%9999GG?GGGGr   c                      fdt          |          D             }t          ||          }g }g }|}t          |          D ]}t                      \  }	}
d}||dz
  k    r|}nt          |||z
  z            }|| d         d|         } |||
f}	 t	          t
          |          }|                    |           |                    |	|
f           |                                 ||z  }# t          $ r/ t          
                    d|            t          ||            w xY w|D ]\  }	}|	                                }|	                                  j        s/ xj        |z  c_        t!          j        d |D                       }n|}t!          j         j        |g           _        |D ]}|                                 dS )a  Generate the topic models to form the ensemble in a multiprocessed way.

    Depending on the used topic model this can result in a speedup.

    Parameters
    ----------
    ensemble: EnsembleLda
        the ensemble
    num_models : int
        how many models to train in the ensemble
    ensemble_workers : int
        into how many processes to split the models will be set to max(workers, num_models), to avoid workers that
        are supposed to train 0 models.

        to get maximum performance, set to the number of your cores, if non-parallelized models are being used in
        the ensemble (LdaModel).

        For LdaMulticore, the performance gain is small and gets larger for a significantly smaller corpus.
        In that case, ensemble_workers=2 can be used.

    c                 N    g | ]!}j                             t                    "S r   random_staterandint_MAX_RANDOM_STATEr0   _ensembles     r    r1   z4_generate_topic_models_multiproc.<locals>.<listcomp>K  s+    aaa!X*223DEEaaar   r      Ntargetargscould not start process c                 6    g | ]}|                                 S r   
get_topics)r0   ms     r    r1   z4_generate_topic_models_multiproc.<locals>.<listcomp>y  s     "B"B"Ba1<<>>"B"B"Br   )rangeminr   r   r   _generate_topic_models_workerr=   startr   r>   errorrP   recvrG   memory_friendly_ttdatmsrS   concatenatettdarI   )rt   
num_modelsensemble_workersrandom_statesworkersrK   rJ   num_models_unhandledrL   rM   rN   num_subprocess_modelsrandom_states_for_workerrx   rO   rs   answerr   s   `                 r     _generate_topic_models_multiprocr   2  sE   2 baaauU_O`O`aaaM "J//G
 IE%7^^  "&&&Z !! 	N %9!!$'(<!(L$M$M! $12F1F1G1G#HI_J_I_#` /1I:V		%BNNNGW%%%LL+z2333MMOOO $99   	 	 	LL7A77888eY'''	   	> 	>Q!!##, 	LLF"LL>"B"B6"B"B"BCCDDDt'<==   s   AC..9D'c                     | fdt          |          D             }t          |          |k    sJ  j                                        }d}t          |          D ]q}||         |d<                                     di |}t          j         j        |                                g           _         j	        s xj
        |gz  c_
        r|j        j                                         _        |j         _        dS )af  Train the topic models that form the ensemble.

    Parameters
    ----------
    ensemble: EnsembleLda
        the ensemble
    num_models : int
        number of models to be generated
    random_states : list
        list of numbers or np.random.RandomState objects. Will be autogenerated based on the ensembles
        RandomState if None (default).
    Nc                 N    g | ]!}j                             t                    "S r   rn   rr   s     r    r1   z*_generate_topic_models.<locals>.<listcomp>  s+    eeea.667HIIeeer   ro   r   )r~   r7   gensim_kw_argscopyget_topic_model_classrS   r   r   r|   r   r   statesstatsr2   
sstats_sumeta)rt   r   r   kwargstmrL   s   `     r    _generate_topic_modelsr     s     feeeeSXYcSdSdeee}++++$))++F	B : ! !!.q!1~-X++--7777 r}}'GHH , 	!LLRD LL (/--//H6HLLLr   c                    t                               d| d           t          | ||           | j        r|                    | j                   n|                    | j                   |                                 dS )zzWrapper for _generate_topic_models to write the results into a pipe.

    This is intended to be used inside a subprocess.zspawned worker to generate z topic models)rt   r   r   N)r>   r?   r   r   sendr   r   rG   )rt   r   r   pipes       r    r   r     s     KKGjGGGHHHHS`aaaa $   			(-    		(,JJLLLLLr   c                    t          j        t          |           t          |          f          }| j        d         dk    r|j        d         dk    rd}t	          |           D ]\  }} |||          }	||	         }
||	                                z  }t	          |          D ]]\  }}||z   |k    rd||         |<   ||	         }|                                t          k    rd}nt          |
|          }|||         |<   ^t          d|z  | j        d         z  | j        d         z  d          }t          
                    d| d| d           |S )a  Calculate an (asymmetric) distance from each topic in ``ttda1`` to each topic in ``ttda2``.

    Parameters
    ----------
    ttda1 and ttda2: 2D arrays of floats
        Two ttda matrices that are going to be used for distance calculation. Each row in ttda corresponds to one
        topic. Each cell in the resulting matrix corresponds to the distance between a topic pair.
    start_index : int
        this function might be used in multiprocessing, so start_index has to be set as ttda1 is a chunk of the
        complete ttda in that case. start_index would be 0 if ``ttda1 == self.ttda``. When self.ttda is split into
        two pieces, each 100 ttdas long, then start_index should be be 100. default is 0
    masking_method: function

    masking_threshold: float

    Returns
    -------
    2D numpy.ndarray of floats
        Asymmetric distance matrix of size ``len(ttda1)`` by ``len(ttda2)``.

    r   ru   d   zthe given threshold of z covered on average z% of tokens)rS   ndarrayr7   shape	enumerater2   &_COSINE_DISTANCE_CALCULATION_THRESHOLDr	   roundr>   r?   )ttda1ttda2start_indexmasking_methodmasking_threshold	distancesavg_mask_sizettd1_idxttd1maskttd1_maskedttd2_idxttd2ttd2_maskeddistancepercents                   r    +_calculate_asymmetric_distance_matrix_chunkr     s   : 
CJJE

344I{1~ "kek!nq0 "k  (.. 	9 	9NHd!>$(9::Dt*KTXXZZ'M #,E"2"2 9 9$k)X5 45Ih'1 #4j ??$$(NN @ HH%k;??H08	(#H--!9$ m+ek!n<u{1~MqQQi.?iiU\iiijjjr   c                     t                               d|  d| d           ||||z            }t          |||||          }|                    | |f           |                                 dS )zKWorker that computes the distance to all other nodes from a chunk of nodes.zspawned worker z to generate z' rows of the asymmetric distance matrixr   r   r   r   r   N)r>   r?   r   r   rG   )		worker_identire_ttda
ttdas_sentn_ttdasr   r   r   r   distance_chunks	            r    "_asymmetric_distance_matrix_workerr   	  s     KKj)jj'jjjkkk
:#778E@%+  N 	IIy.)***JJLLLLLr   c                 
   g }g }d}t          |           D ]}	 t                      \  }}	d}
|| dz
  k    rt          |          |z
  }
n%t          t          |          |z
  | |z
  z            }
||||
|||	f}t	          t
          |          }||
z  }|                    |           |                    ||	f           |                                 # t          $ r/ t          
                    d|            t          ||            w xY wg }|D ]E\  }}|                                \  }}|                                 |                    |           F|D ]}|                                 t          j        |          S )Nr   ru   rv   ry   )r~   r   r7   r   r   r   r=   r   r   r>   r   rP   r   rG   rI   rS   r   )r   r   r   r   rK   rJ   r   rL   rM   rN   r   rx   rO   r   rs   r   r   s                    r    /_calculate_assymetric_distance_matrix_multiprocr   !  s    IEJ7^^  	&*ff#K GGaK O k**Z7s;//*<1MNN{JIZ\fgD%GdSSSG'!JW%%%LL+z2333MMOOOO 	 	 	LL7A77888eY'''	
 I   ) )Q$/$4$4$6$6!	>((((  >)$$$s   B8C9Dc                        e Zd ZdZdddddddedddfdZd	 Z fd
Zej        j        e_        d Z	d Z
ddZd ZddZddZddZd Zd Zd Zd Zd Zd Zed             Z xZS )EnsembleLdaaA  Ensemble Latent Dirichlet Allocation (eLDA), a method of training a topic model ensemble.

    Extracts stable topics that are consistently learned across multiple LDA models. eLDA has the added benefit that
    the user does not need to know the exact number of topics the topic model should extract ahead of time.

    r      N皙?ru   Tc                    d|vrd|d<   d|vrd|d<   |d         ?|d         7t                               d           t          j        |d                   |d<   |d         |d         t	          d          t          |          t
          k    r"t          |t          j                  r|| _	        n9t          j        t          j        d}||vrt	          d          ||         | _	        || _        || _        || _        |
| _        |	| _        || _        d| _        t          j        |          | _        d| _        d| _        g | _        t1          j        dt5          |d                   f          | _        d	| _        |dk    rdS |                    d          dS d
|v r|d
         dk    rdS d|v r|d         dk    rdS t                               d| d| d           |dk    rt?          | ||           ntA          | |           | !                                 | "                    ||           | #                    |           | $                                 dS )a-  Create and train a new EnsembleLda model.

        Will start training immediatelly, except if iterations, passes or num_models is 0 or if the corpus is missing.

        Parameters
        ----------
        topic_model_class : str, topic model, optional
            Examples:
                * 'ldamulticore' (default, recommended)
                * 'lda'
                * ldamodel.LdaModel
                * ldamulticore.LdaMulticore
        ensemble_workers : int, optional
            Spawns that many processes and distributes the models from the ensemble to those as evenly as possible.
            num_models should be a multiple of ensemble_workers.

            Setting it to 0 or 1 will both use the non-multiprocessing version. Default: 1
        num_models : int, optional
            How many LDA models to train in this ensemble.
            Default: 3
        min_cores : int, optional
            Minimum cores a cluster of topics has to contain so that it is recognized as stable topic.
        epsilon : float, optional
            Defaults to 0.1. Epsilon for the CBDBSCAN clustering that generates the stable topics.
        ensemble_workers : int, optional
            Spawns that many processes and distributes the models from the ensemble to those as evenly as possible.
            num_models should be a multiple of ensemble_workers.

            Setting it to 0 or 1 will both use the nonmultiprocessing version. Default: 1
        memory_friendly_ttda : boolean, optional
            If True, the models in the ensemble are deleted after training and only a concatenation of each model's
            topic term distribution (called ttda) is kept to save memory.

            Defaults to True. When False, trained models are stored in a list in self.tms, and no models that are not
            of a gensim model type can be added to this ensemble using the add_model function.

            If False, any topic term matrix can be suplied to add_model.
        min_samples : int, optional
            Required int of nearby topics for a topic to be considered as 'core' in the CBDBSCAN clustering.
        masking_method : function, optional
            Choose one of :meth:`~gensim.models.ensemblelda.mass_masking` (default) or
            :meth:`~gensim.models.ensemblelda.rank_masking` (percentile, faster).

            For clustering, distances between topic-term distributions are asymmetric.  In particular, the distance
            (technically a divergence) from distribution A to B is more of a measure of if A is contained in B.  At a
            high level, this involves using distribution A to mask distribution B and then calculating the cosine
            distance between the two.  The masking can be done in two ways:

            1. mass: forms mask by taking the top ranked terms until their cumulative mass reaches the
            'masking_threshold'

            2. rank: forms mask by taking the top ranked terms (by mass) until the 'masking_threshold' is reached.
            For example, a ranking threshold of 0.11 means the top 0.11 terms by weight are used to form a mask.
        masking_threshold : float, optional
            Default: None, which uses ``0.95`` for "mass", and ``0.11`` for masking_method "rank".  In general, too
            small a mask threshold leads to inaccurate calculations (no signal) and too big a mask leads to noisy
            distance calculations.  Defaults are often a good sweet spot for this hyperparameter.
        distance_workers : int, optional
            When ``distance_workers`` is ``None``, it defaults to ``os.cpu_count()`` for maximum performance. Default is
            1, which is not multiprocessed. Set to ``> 1`` to enable multiprocessing.
        **gensim_kw_args
            Parameters for each gensim model (e.g. :py:class:`gensim.models.LdaModel`) in the ensemble.

        id2wordNcorpuszHno word id mapping provided; initializing from corpus, assuming identityzat least one of corpus/id2word must be specified, to establish input space dimensionality. Corpus should be provided using the `corpus` keyword argument.)ldar   z\topic_model_class should be one of 'lda', 'ldamulticode' or a model inheriting from LdaModelr   T
iterationspasseszgenerating z topic models using z workersru   )%r>   warningr   dict_from_corpus
ValueErrortype
issubclassr   LdaModeltopic_model_classr   LdaMulticorer   r   r   distance_workersr   r   classic_model_representationget_random_statero   r   r   r   rS   emptyr7   r   #asymmetric_distance_matrix_outdatedgetr?   r   r   $_generate_asymmetric_distance_matrix_generate_topic_clusters_generate_stable_topicsgenerate_gensim_representation)selfr   r   r3   epsilonr   r   min_samplesr   r   r   ro   r   kindss                 r    __init__zEnsembleLda.__init__[  s   P N* 	-(,N9%>) 	,'+N8$)$ 	Y^H5M 	YNNefff(-(>~h?W(X(XN9%)$ 	1I 	-   !""d* 	>z:KXM^/_/_ 	>%6D""  ( , 9 E !-  /   &++<%=D"$,$8! 0!2, -1) "2<@@Ha^I%>!?!?@AA	370 ? 	Fh'' 	F>) 	n\.Ja.O 	F~% 	.*Ba*G 	F\*\\BR\\\]]]a 	5,T:?OPPPP"444411333%%g{;;;$$Y/// 	++-----r   c           	      ~   | j         d}	 t          j        | j                  }t	          || j                  | _         | `| `nu# t          $ r0 t                              d| j         d| j         d|            Y n<t          $ r0 t                              d| j         d| j         d|            Y nw xY w| j         S )z`Get the class that is used for :meth:`gensim.models.EnsembleLda.generate_gensim_representation`.NzfTry setting topic_model_class manually to what the individual models were based on, e.g. LdaMulticore.zCould not import the "z"" module in order to provide the "z*" class as "topic_model_class" attribute. z" class from the "z<" module in order to set the "topic_model_class" attribute. )
r   	importlibimport_moduletopic_model_module_stringgetattrtopic_model_class_stringModuleNotFoundErrorr>   r   AttributeError)r   instructionmodules      r    r   z!EnsembleLda.get_topic_model_class  s6   ! 	% "01OPP)09V)W)W&211&   oT-J o o5o oalo o     "   %T-J % %6% %"% %     %%s   7A 7B5<6B54B5c                 &   |                                  "| j        j        | _        | j        j        | _        t          |                    dd                                        d          |d<    t          t          |           j        |i | d S )Nignorer   )r   )r   r   r   r   r   r   	frozensetr   unionsuperr   save)r   rx   r   	__class__s      r    r   zEnsembleLda.save  s    %%'' 	L-1-C-ND*,0,B,KD)$VZZ"%=%=>>DDE\]]x%k4  %t6v66666r   c                 "    g | _         d| _        dS )zRemove the stored gensim models and only keep their ttdas.

        This frees up memory, but you won't have access to the individual  models anymore if you intended to use them
        outside of the ensemble.
        TN)r   r   r   s    r    convert_to_memory_friendlyz&EnsembleLda.convert_to_memory_friendly  s     $(!!!r   c                    t                               d           | j        }|dk    r=d| j        v r4| j        d         '| j        d         D ]}|D ]}||d         z  }|| _        |                                 }t          |          }|dk    r#t                               d           d| _        dS | j                                        }| j	        |d<   ||d<   d|d	<    | 
                                di |}|j	        }|dk    r%|j        j                                        }|| _        d}	t          |t          t           f          r|t          |d                   z  g|z  }	n{t          |j                  dk    r|                                gg|z  }	t          |j                  dk    r2t%          j        |                    d
          dddf                   }	t%          j        ||z  gg|z            |	z   }
||
z  }||z  }|                    t$          j                  |j        _        |                                 || _        |S )a  Create a gensim model from the stable topics.

        The returned representation is an Gensim LdaModel (:py:class:`gensim.models.LdaModel`) that has been
        instantiated with an A-priori belief on word probability, eta, that represents the topic-term distributions of
        any stable topics the were found by clustering over the ensemble of topic distributions.

        When no stable topics have been detected, None is returned.

        Returns
        -------
        :py:class:`gensim.models.LdaModel`
            A Gensim LDA Model classic_model_representation for which:
            ``classic_model_representation.get_topics() == self.get_topics()``

        zQgenerating classic gensim model representation based on results from the ensembler   r   Nru   z\the model did not detect any stable topic. You can try to adjust epsilon: recluster(eps=...)r   
num_topicsr   axisr   )r>   r?   r   r   r|   r7   r   r   r   r   r   r   r   r2   
isinstancer   floatr   rS   arrayastypefloat32
sync_state)r   r   documenttokenstable_topicsnum_stable_topicsparamsr   r   eta_sumnormalization_factorr   s               r    r   z*EnsembleLda.generate_gensim_representation  s     	ghhh_
 ? 	)x4+>> 	)tGZ[cGd 	) /9 + +% + +E%(*JJ+(DO))..! 	LL%   15D-F $))++u0|x (Dt'A'A'C'C'M'Mf'M'M$ +.? 	)5;BFFHHJ(DO
 cC<(( 	=Sq!122236GGGG39~~" <GGII;-*;;39~~! =(3777??111d7#;<<  "x*7H*H)I(JM^(^__bii!55#4:MM"*4M4M$*1$//111,H)++r   c                    t          |t          j        t          f          st          j        |g          }n)t          j        |          }t          |          dk    sJ | j        r9d}g }t          |j                                        t          j	        t          f          r|}d}nt          |d         t          |                     r:t          j        d |D             d          }t          d |D                       }nut          |d         t          j                  r0t          j        d |D             d          }t          |          }n%t          dt          |d                              || xj        |z  c_        n| xj        |z  c_        ng }t          |j                                        t          j	        t          f          rt          d	          t          |d         t          |                     r;|D ]}| xj        |j        z  c_        t          j        d
 |D             d          }nt          |d         t          j                  rC| xj        |                                z  c_        t          j        d |D             d          }n%t          dt          |d                              |:|| j        z   t          | j                  k    rt&                              d           t          | j                  | _        t&                              d| j         dt          | j                   d           | j        j        d         |j        d         k    r1t          d| j        j        d          d|j        d          d          t          j        | j        |d          | _        d| _        dS )a  Add the topic term distribution array (ttda) of another model to the ensemble.

        This way, multiple topic models can be connected to an ensemble manually. Make sure that all the models use
        the exact same dictionary/idword mapping.

        In order to generate new stable topics afterwards, use:
            2. ``self.``:meth:`~gensim.models.ensemblelda.EnsembleLda.recluster`

        The ttda of another ensemble can also be used, in that case set ``num_new_models`` to the ``num_models``
        parameter of the ensemble, that means the number of classic models in the ensemble that generated the ttda.
        This is important, because that information is used to estimate "min_samples" for _generate_topic_clusters.

        If you trained this ensemble in the past with a certain Dictionary that you want to reuse for other
        models, you can get it from: ``self.id2word``.

        Parameters
        ----------
        target : {see description}
            1. A single EnsembleLda object
            2. List of EnsembleLda objects
            3. A single Gensim topic model (e.g. (:py:class:`gensim.models.LdaModel`)
            4. List of Gensim topic models

            if memory_friendly_ttda is True, target can also be:
            5. topic-term-distribution-array

            example: [[0.1, 0.1, 0.8], [...], ...]

            [topic1, topic2, ...]
            with topic being an array of probabilities:
            [token1, token2, ...]

            token probabilities in a single topic sum to one, therefore, all the words sum to len(ttda)

        num_new_models : integer, optional
            the model keeps track of how many models were used in this ensemble. Set higher if ttda contained topics
            from more than one model. Default: None, which takes care of it automatically.

            If target is a 2D-array of float values, it assumes 1.

            If the ensemble has ``memory_friendly_ttda`` set to False, then it will always use the number of models in
            the target parameter.

        r   ru   c                     g | ]	}|j         
S r   r   r0   rt   s     r    r1   z)EnsembleLda.add_model.<locals>.<listcomp>      &L&L&Lx}&L&L&Lr   r   c                     g | ]	}|j         
S r   )r   r  s     r    r1   z)EnsembleLda.add_model.<locals>.<listcomp>  s    *V*V*V88+>*V*V*Vr   c                 6    g | ]}|                                 S r   r{   r0   models     r    r1   z)EnsembleLda.add_model.<locals>.<listcomp>  $    &N&N&Neu'7'7'9'9&N&N&Nr   z6target is of unknown type or a list of unknown types: Nzttda arrays cannot be added to ensembles, for which memory_friendly_ttda=False, you can call convert_to_memory_friendly, but it will discard the stored gensim models and only keep the relevant topic term distributions from them.c                     g | ]	}|j         
S r   r
  r  s     r    r1   z)EnsembleLda.add_model.<locals>.<listcomp>  r  r   c                 6    g | ]}|                                 S r   r{   r  s     r    r1   z)EnsembleLda.add_model.<locals>.<listcomp>  r  r   ztnum_new_models will be ignored. num_models should match the number of stored models for a memory unfriendly ensemblezensemble contains z models and z topics nowz4target ttda dimensions do not match. Topics must be rR   z	 but was z elements largeT)r   rS   r   listr   r7   r   dtyper   numberr   r   r2   r   BaseTopicModelr   r   r   tolistr>   r?   r   r   r=   r   )r   rw   num_new_modelsdetected_num_modelsr   rt   s         r    	add_modelzEnsembleLda.add_modelr  s   d &2:t"455 	#Xvh''FFXf%%Fv;;?"""$ A	, #$D &,++--	5/ABB m&'## F1ItDzz22 m~&L&LV&L&L&LSTUUU&)*V*Vv*V*V*V&W&W## F1Iy'?@@ m~&N&Nv&N&N&NUVWWW&)&kk## !!kZ^_efg_hZiZi!k!klll  2#66>1 D &,++--	5/ABB m \   F1ItDzz22 m & - -HHH,HHH~&L&LV&L&L&LSTUUU F1Iy'?@@ mFMMOO+~&N&Nv&N&N&NUVWWW !!kZ^_efg_hZiZi!k!klll  nt.NRUVZV^R_R_._ E   "$(mmDOaaac$)nnaaabbb9?1A. 	"tyWYGZ " "eieopres " " "  
 IdiA666	 48000r   c                    | j         }d| _        t                              dt	          | j                   dt	          | j                   d           |5|dk    r/t          | j        | j        d| j        | j                  | _	        dS |t          j                    }t          || j        | j        | j        	          | _	        dS )
a  Calculate the pairwise distance matrix for all the ttdas from the ensemble.

        Returns the asymmetric pairwise distance matrix that is used in the DBSCAN clustering.

        Afterwards, the model needs to be reclustered for this generated matrix to take effect.

        Fzgenerating a z x z asymmetric distance matrix...Nru   r   r   )r   r   r   r   )r   r   r>   r?   r7   r   r   r   r   asymmetric_distance_matrixos	cpu_countr   )r   r   s     r    r   z0EnsembleLda._generate_asymmetric_distance_matrix  s     ' 490eC	NNees49~~eeefff 	7a< 	.Yii#2"&"8/ / /D+++  ),...] I#2"&"8	/ / /D+++r   c                    |3t          | j        dz            }t                              d|           nt                              d           t	          ||          | _        | j                            | j                   dS )a  Run the CBDBSCAN algorithm on all the detected topics and label them with label-indices.

        The final approval and generation of stable topics is done in ``_generate_stable_topics()``.

        Parameters
        ----------
        eps : float
            dbscan distance scale
        min_samples : int, optional
            defaults to ``int(self.num_models / 2)``, dbscan min neighbours threshold required to consider
            a topic to be a core. Should scale with the number of models, ``self.num_models``

        N   z6fitting the clustering model, using %s for min_sampleszfitting the clustering modelepsr   )r   r   r>   r?   CBDBSCANcluster_modelfitr  r   r#  r   s      r    r   z$EnsembleLda._generate_topic_clusters  s      	8do122KKKPR]^^^^KK6777%#;GGGt>?????r   c           
         |dk    rd}|Rt          dt          dt          | j        dz  dz                                 }t                              d|           nt                              d           | j        j        }t          |          }t          |          }t          ||          }d |D             |D ]}fd	|j        D             |_         t          j        t                    |          }| j        |         }t          j        d
 |D                       |         t          j                  }	t'          |	          }
t          j        |
t'          | j                  f          }t-          |	          D ]L\  }t          j        fdt-          |          D                       }|                    d          ||<   M|| _        || _        t                              dt'          |                     dS )a{  Generate stable topics out of the clusters.

        The function finds clusters of topics using a variant of DBScan.  If a cluster has enough core topics
        (c.f. parameter ``min_cores``), then this cluster represents a stable topic.  The stable topic is specifically
        calculated as the average over all topic-term distributions of the core topics in the cluster.

        This function is the last step that has to be done in the ensemble.  After this step is complete,
        Stable topics can be retrieved afterwards using the :meth:`~gensim.models.ensemblelda.EnsembleLda.get_topics`
        method.

        Parameters
        ----------
        min_cores : int
            Minimum number of core topics needed to form a cluster that represents a stable topic.
                Using ``None`` defaults to ``min_cores = min(3, max(1, int(self.num_models /4 +1)))``

        r   ru   Nr      z0generating stable topics, using %s for min_coreszgenerating stable topicsc                     h | ]	}|j         
S r   r   rg   s     r    	<setcomp>z6EnsembleLda._generate_stable_topics.<locals>.<setcomp>W  s    LLL'LLLr   c                     h | ]}|v |	S r   r   )r0   r   valid_cluster_labelss     r    r,  z6EnsembleLda._generate_stable_topics.<locals>.<setcomp>Z  s5     . . .00.. . .r   c                     g | ]	}|j         
S r   r+  r:   s     r    r1   z7EnsembleLda._generate_stable_topics.<locals>.<listcomp>b  s     J J J J J Jr   c                 2    g | ]\  }}|         k    |S r   r   )r0   tr&   r   topic_labelss      r    r1   z7EnsembleLda._generate_stable_topics.<locals>.<listcomp>k  s/    )t)t)tHAu[ghi[jns[s)t%)t)t)tr   r   zfound %s stable topics)r   r<   r   r   r>   r?   r%  resultsrE   rB   rk   r   r   rS   	vectorizer'   r   r   uniquer7   r   r   r   meanvalid_clustersr  )r   r3   rD   r@   r*   r7  r&   valid_core_maskvalid_topicsunique_labelsr  r  label_indextopics_of_clusterr   r2  r.  s                 @@@r    r   z#EnsembleLda._generate_stable_topics4  s)   ( > 	I 	4As1c$/A*=*A&B&BCCDDIKKJIVVVVKK2333,4,_==$%677+Hi@@LL^LLL$ 	 	E. . . .#(#;. . .E** 7",~66GGy1x J J/ J J JKKO\	,//.."3S5F5F!GHH #,M":": 	H 	HK ")t)t)t)t)t	,@W@W)t)t)t u u):)?)?Q)?)G)GM+&&,*,c-.@.@AAAAAr   c                     | j         r.t                              d           |                                  |                     ||           |                     |           |                                  dS )a]  Reapply CBDBSCAN clustering and stable topic generation.

        Stable topics can be retrieved using :meth:`~gensim.models.ensemblelda.EnsembleLda.get_topics`.

        Parameters
        ----------
        eps : float
            epsilon for the CBDBSCAN algorithm, having the same meaning as in classic DBSCAN clustering.
            default: ``0.1``
        min_samples : int
            The minimum number of samples in the neighborhood of a topic to be considered a core in CBDBSCAN.
            default: ``int(self.num_models / 2)``
        min_cores : int
            how many cores a cluster has to have, to be treated as stable topic. That means, how many topics
            that look similar have to be present, so that the average topic in those is used as stable topic.
            default: ``min(3, max(1, int(self.num_models /4 +1)))``

        z7asymmetric distance matrix is outdated due to add_modelN)r   r>   r?   r   r   r   r   )r   r#  r   r3   s       r    	reclusterzEnsembleLda.reclusters  s|    ( 3 	8KKQRRR55777 	%%c;777 	$$Y/// 	++-----r   c                     | j         S )zReturn only the stable topics from the ensemble.

        Returns
        -------
        2D Numpy.numpy.ndarray of floats
            List of stable topic term distributions

        )r  r   s    r    r|   zEnsembleLda.get_topics  s     !!r   c                     | j         6t          | j                  dk    rt          d          t          d          dS )z[Check if stable topics and the internal gensim representation exist. Raise an error if not.Nr   zno stable topic was detectedz*use generate_gensim_representation() first)r   r7   r  r   r   s    r    _ensure_gensim_representationz)EnsembleLda._ensure_gensim_representation  sQ    , 	O4%&&!+ O !?@@@ !MNNN		O 	Or   c                 D    |                                   | j        |         S )z/See :meth:`gensim.models.LdaModel.__getitem__`.)rA  r   )r   rL   s     r    __getitem__zEnsembleLda.__getitem__  s"    **,,,033r   c                 N    |                                    | j        j        |i |S )z-See :meth:`gensim.models.LdaModel.inference`.)rA  r   	inferencer   posargsr   s      r    rE  zEnsembleLda.inference  s/    **,,,:t0:GNvNNNr   c                 N    |                                    | j        j        |i |S )z2See :meth:`gensim.models.LdaModel.log_perplexity`.)rA  r   log_perplexityrF  s      r    rI  zEnsembleLda.log_perplexity  s/    **,,,?t0?SFSSSr   c                 N    |                                    | j        j        |i |S )z0See :meth:`gensim.models.LdaModel.print_topics`.)rA  r   print_topicsrF  s      r    rK  zEnsembleLda.print_topics  s/    **,,,=t0=wQ&QQQr   c                     | j         d         S )zUReturn the :py:class:`gensim.corpora.dictionary.Dictionary` object used in the model.r   )r   r   s    r    r   zEnsembleLda.id2word  s     "9--r   r`   )r   N)r   NN)r   r   r   __doc__r[   r   r   r   r   r   r   r  r   r   r   r>  r|   rA  rC  rE  rI  rK  propertyr   __classcell__)r   s   @r    r   r   S  s         %3q!$\TTQ. Q. Q. Q.f& & &27 7 7 7 7 =(DL) ) )W, W, W,rF8 F8 F8 F8P! ! !F@ @ @ @.=B =B =B =B~. . . .H	" 	" 	"O O O4 4 4
O O O
T T T
R R R
 . . X. . . . .r   r   c                       e Zd ZdZd Zd ZdS )r$  a	  A Variation of the DBSCAN algorithm called Checkback DBSCAN (CBDBSCAN).

    The algorithm works based on DBSCAN-like parameters 'eps' and 'min_samples' that respectively define how far a
    "nearby" point is, and the minimum number of nearby points needed to label a candidate datapoint a core of a
    cluster. (See https://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html).

    The algorithm works as follows:

    1. (A)symmetric distance matrix provided at fit-time (called 'amatrix').
       For the sake of example below, assume the there are only five topics (amatrix contains distances with dim 5x5),
       T_1, T_2, T_3, T_4, T_5:
    2. Start by scanning a candidate topic with respect to a parent topic
       (e.g. T_1 with respect to parent None)
    3. Check which topics are nearby the candidate topic using 'self.eps' as a threshold and call them neighbours
       (e.g. assume T_3, T_4, and T_5 are nearby and become neighbours)
    4. If there are more neighbours than 'self.min_samples', the candidate topic becomes a core candidate for a cluster
       (e.g. if 'min_samples'=1, then T_1 becomes the first core of a cluster)
    5. If candidate is a core, CheckBack (CB) to find the fraction of neighbours that are either the parent or the
       parent's neighbours.  If this fraction is more than 75%, give the candidate the same label as its parent.
       (e.g. in the trivial case there is no parent (or neighbours of that parent), a new incremental label is given)
    6. If candidate is a core, recursively scan the next nearby topic (e.g. scan T_3) labeling the previous topic as
       the parent and the previous neighbours as the parent_neighbours - repeat steps 2-6:

       2. (e.g. Scan candidate T_3 with respect to parent T_1 that has parent_neighbours T_3, T_4, and T_5)
       3. (e.g. T5 is the only neighbour)
       4. (e.g. number of neighbours is 1, therefore candidate T_3 becomes a core)
       5. (e.g. CheckBack finds that two of the four parent and parent neighbours are neighbours of candidate T_3.
          Therefore the candidate T_3 does NOT get the same label as its parent T_1)
       6. (e.g. Scan candidate T_5 with respect to parent T_3 that has parent_neighbours T_5)

    The CB step has the effect that it enforces cluster compactness and allows the model to avoid creating clusters for
    unstable topics made of a composition of multiple stable topics.

    c                 "    || _         || _        dS )a  Create a new CBDBSCAN object. Call fit in order to train it on an asymmetric distance matrix.

        Parameters
        ----------
        eps : float
            epsilon for the CBDBSCAN algorithm, having the same meaning as in classic DBSCAN clustering.
        min_samples : int
            The minimum number of samples in the neighborhood of a topic to be considered a core in CBDBSCAN.

        Nr"  r'  s      r    r   zCBDBSCAN.__init__  s     &r   c                      d _         d t          t          |                    D             |                                t	          j        d           d t                              d                    D             }t          |d           }d |D             d fd
	t                    dk    r3	                    d          } |           t                    dk    3 _
        d	S )z5Apply the algorithm to an asymmetric distance matrix.r   c                     g | ];}t          d t                      t                      ddt                                <S )FNr   )r   r   r   r   r   r   )r   set)r0   rL   s     r    r1   z CBDBSCAN.fit.<locals>.<listcomp>  sW     	$
 	$
 	$
  #&55*-%%'(),  	$
 	$
 	$
r   ru   c                     g | ]	\  }}||f
S r   r   r0   indexr   s      r    r1   z CBDBSCAN.fit.<locals>.<listcomp>
  s!    !o!o!ox8U"3!o!o!or   r   c                     | d         S Nr   r   )r   s    r    <lambda>zCBDBSCAN.fit.<locals>.<lambda>  s    \def\g r   rb   c                     g | ]\  }}|S r   r   )r0   r   rW  s      r    r1   z CBDBSCAN.fit.<locals>.<listcomp>  s    !]!]!]OHe%!]!]!]r   Nc                    t          d t          |                    D             d           }fd|D             }t          |          }|j        k    rd|          _        |j        }xj        dz  c_        nF|          |         j        k     }|                                dk     rj        }xj        dz  c_        ||          _        |D ]u}|         j        &		                    |            
|||| gz              |         j
                            |            |         j                            |           vdS |d	|          _        dS ||          _        dS )
a  Extend the cluster in one direction.

            Results are accumulated to ``self.results``.

            Parameters
            ----------
            topic_index : int
                The topic that might be added to the existing cluster, or which might create a new cluster if necessary.
            current_label : int
                The label of the cluster that might be suitable for ``topic_index``

            c                     g | ]	\  }}||f
S r   r   rV  s      r    r1   z4CBDBSCAN.fit.<locals>.scan_topic.<locals>.<listcomp>  s1       'x u%  r   c                     | d         S rY  r   )r8   s    r    rZ  z2CBDBSCAN.fit.<locals>.scan_topic.<locals>.<lambda>   s
    ad r   r[  c                 0    g | ]\  }}|j         k     |S r   )r#  )r0   r   rW  r   s      r    r1   z4CBDBSCAN.fit.<locals>.scan_topic.<locals>.<listcomp>"  s,    (n(n(n?8UZbeiemZm(n(n(n(nr   TNru   g      ?rR   )ri   r   r7   r   r   
next_labelr#  r6  r   r)   r   addr   )topic_indexcurrent_labelparent_neighborsneighbors_sortedr   num_neighboring_topicsclose_parent_neighbors_maskneighboring_topic_indexamatrix_copyordered_min_similarity
scan_topicr   topic_clustering_resultss           r    rl  z CBDBSCAN.fit.<locals>.scan_topic  s     & +4\+5N+O+O   #N      )o(n(n(nFV(n(n(n%%()B%C%C" &)99 'P@D(5= ! -$(OMOOq(OOO 3?{2KL\2]`d`h2h/27799D@ -(,1,>K(5;/H l l+/0GHN v.556MNNN"
#:MKdhsgtKtuuu,-DE_ccdoppp,-DEX\\]jkkkkl l ! PBD,[9???BO,[9???r   )NN)ra  r~   r7   r   rS   fill_diagonalr   r   ri   popr3  )	r   amatrixmin_distance_per_topicmin_distance_per_topic_sortednext_topic_indexrj  rk  rl  rm  s	   `    @@@@r    r&  zCBDBSCAN.fit  sf   	$
 	$
 S\\**	$
 	$
 	$
  ||~~ 	q)))!o!o9UaUeUeklUeUmUmKnKn!o!o!o(./EKgKg(h(h(h%!]!]?\!]!]!]A	P A	P A	P A	P A	P A	P A	P A	P A	P A	PH ())Q. 	)599!<<J'((( ())Q. 	) 0r   N)r   r   r   rM  r   r&  r   r   r    r$  r$    sB        ! !F' ' '`0 `0 `0 `0 `0r   r$  r`   )5rM  loggingr  multiprocessingr   r   r   r   typingr   r   r   numpyrS   scipy.spatial.distancer	   dataclassesr
   gensimr   gensim.modelsr   r   r   gensim.utilsr   	getLoggerr   r>   r   iinfoint32r<   rq   r   r"   r'   r-   r4   rB   rE   rP   r[   r]   rk   r   r   r   r   r   r   r   r$  r   r   r    <module>r     s  W Wp  				 7 7 7 7 7 7 7 7 7 7     & & & & & & & & & &     ) ) ) ) ) ) ! ! ! ! ! !       ; ; ; ; ; ; ; ; ; ; ! ! ! ! ! ! 
	8	$	$ *. & BHRX&&*  ' ' ' ' ' ' ' '        	O 	O 	O5 5 5r r r
$ $ $N  B  (   9 9 9 9H H H:M M M`& & & &R  0C C CL  0/% /% /%dn	. n	. n	. n	. n	.( n	. n	. n	.bR0 R0 R0 R0 R0 R0 R0 R0 R0 R0r   