
    c                     t    d Z ddlZddlZddlmZmZmZ  ej        e          Z	 G d dej
                  ZdS )a  This module allows simple Bag of Words (BoW) represented corpus to be transformed into log entropy space.
It implements Log Entropy Model that produces entropy-weighted logarithmic term frequency representation.

Empirical study by Lee et al. 2015 [1]_ suggests log entropy-weighted model yields better results among other forms of
representation.

References
----------
.. [1] Lee et al. 2005. An Empirical Evaluation of Models of Text Document Similarity.
       https://escholarship.org/uc/item/48g155nq

    N)
interfacesmatutilsutilsc                   ,    e Zd ZdZddZd Zd Zd ZdS )	LogEntropyModela  Objects of this class realize the transformation between word-document co-occurrence matrix (int)
    into a locally/globally weighted matrix (positive floats).

    This is done by a log entropy normalization, optionally normalizing the resulting documents to unit length.
    The following formulas explain how o compute the log entropy weight for term :math:`i` in document :math:`j`:

    .. math::

        local\_weight_{i,j} = log(frequency_{i,j} + 1)

        P_{i,j} = \frac{frequency_{i,j}}{\sum_j frequency_{i,j}}

        global\_weight_i = 1 + \frac{\sum_j P_{i,j} * log(P_{i,j})}{log(number\_of\_documents + 1)}

        final\_weight_{i,j} = local\_weight_{i,j} * global\_weight_i

    Examples
    --------
    .. sourcecode:: pycon

        >>> from gensim.models import LogEntropyModel
        >>> from gensim.test.utils import common_texts
        >>> from gensim.corpora import Dictionary
        >>>
        >>> dct = Dictionary(common_texts)  # fit dictionary
        >>> corpus = [dct.doc2bow(row) for row in common_texts]  # convert to BoW format
        >>> model = LogEntropyModel(corpus)  # fit model
        >>> vector = model[corpus[1]]  # apply model to document

    Tc                 p    || _         d| _        d| _        i | _        ||                     |           dS dS )a4  

        Parameters
        ----------
        corpus : iterable of iterable of (int, int)
            Input corpus in BoW format.
        normalize : bool, optional
            If True, the resulted log entropy weighted vector will be normalized to length of 1,
            If False - do nothing.

        r   N)	normalizen_docsn_wordsentr
initialize)selfcorpusr	   s      >lib/python3.11/site-packages/gensim/models/logentropy_model.py__init__zLogEntropyModel.__init__;   sJ     #	 	$OOF#####	$ 	$    c                 @    | j         j        d| j        d| j        dS )Nz<n_docs=z
, n_words=>)	__class____name__r
   r   )r   s    r   __str__zLogEntropyModel.__str__N   s*    .2n.E.E.Et{{{TXT`T`T`aar   c                    t                               d           i }d\  }}t          |          D ]_\  }}|dz  dk    rt                               d|           |t          |          z  }|D ]!\  }}|                    |d          |z   ||<   "`|dz   | _        || _        t                               d| j        t          |          | j                   t                               d           d}t          |          D ]v\  }}|D ]n\  }	}
t          |
          ||	         z  t          j
        t          |
          ||	         z            z  }| j                            |	d	          |z   | j        |	<   ow||k    rt          d
          t                               d           | j        D ]7}	d| j        |	         t          j
        | j        dz             z  z   | j        |	<   8dS )a#  Calculates the global weighting for all terms in a given corpus and transforms the simple
        count representation into the log entropy normalized space.

        Parameters
        ----------
        corpus : iterable of iterable of (int, int)
            Corpus is BoW format

        zcalculating counts)r   i'  r   z!PROGRESS: processing document #%i   z]calculating global log entropy weights for %i documents and %i features (%i matrix non-zeros)ziterating over corpusg        z;LogEntropyModel doesn't support generators as training dataziterating over keysN)loggerinfo	enumeratelengetr
   r   debugfloatmathlogr   
ValueError)r   r   	glob_freqglob_num_wordsdoc_nobowterm_id
term_countdoc_no2keyfreqps               r   r   zLogEntropyModel.initializeQ   s    	()))	!&$V,, 	L 	LKFC~" I?HHHc#hh&N'* L L#%.]]7A%>%>%K	'""L qj% 	kKY	
 	
 	
 	,--- %f-- 	= 	=LGS  = =	T4[[9S>1TXeDkkIVYN>Z5[5[[!%sC!8!81!<	#= f 	\Z[[[*+++9 	L 	LC3$(4;?2K2K!KKDIcNN	L 	Lr   c                      t          j        |          \  }}|r                     |          S  fd|D             } j        rt	          j        |          }|S )a&  Get log entropy representation of the input vector and/or corpus.

        Parameters
        ----------
        bow : list of (int, int)
            Document in BoW format.

        Returns
        -------
        list of (int, float)
            Log-entropy vector for passed `bow`.

        c                     g | ]B\  }}|j         v |t          j        |d z             j                             |          z  fCS )r   )r   r"   r#   r   ).0r)   tfr   s      r   
<listcomp>z/LogEntropyModel.__getitem__.<locals>.<listcomp>   s`     
 
 
$)#
dhrAv&&w)?)??@
 
 
r   )r   	is_corpus_applyr	   r   unitvec)r   r(   r4   vectors   `   r   __getitem__zLogEntropyModel.__getitem__}   s{     --	3 	$;;s###
 
 
 
"
 
 

 > 	.%f--Fr   N)T)r   
__module____qualname____doc__r   r   r   r8    r   r   r   r      sf         >$ $ $ $&b b b*L *L *LX    r   r   )r;   loggingr"   gensimr   r   r   	getLoggerr   r   TransformationABCr   r<   r   r   <module>rA      s       . . . . . . . . . .		8	$	$} } } } }j2 } } } } }r   