
    cS                         d Z ddlZddlmZ ddlZddlZddlmZm	Z	m
Z
 ddlmZ  ej        e          Zd Zdd	Zd
 Zd Zd Z ed          dd            Z G d dej                  ZdS )zThis module implements functionality related to the `Term Frequency - Inverse Document Frequency
<https://en.wikipedia.org/wiki/Tf%E2%80%93idf>`_ class of bag-of-words vector space models.

    N)partial)
interfacesmatutilsutils)
deprecatedc                    t          | t                    rtt          j        d|           r_t          j        d|           }t	          d                    |                    d          |                    d                              t          | t                    rt          |           dk    rt	          d| z             | \  }}}|d	vr"t	          d
                    |                    |dvr"t	          d                    |                    |dvr"t	          d                    |                    |dk    rd}|dk    rd}|dk    rd}||z   |z   S )a  Check the validity of `smartirs` parameters.

    Parameters
    ----------
    smartirs : str
        `smartirs` or SMART (System for the Mechanical Analysis and Retrieval of Text)
        Information Retrieval System, a mnemonic scheme for denoting tf-idf weighting
        variants in the vector space model. The mnemonic for representing a combination
        of weights takes the form ddd, where the letters represents the term weighting of the document vector.
        for more information visit `SMART Information Retrieval System
        <https://en.wikipedia.org/wiki/SMART_Information_Retrieval_System>`_.

    Returns
    -------
    str of (local_letter, global_letter, normalization_letter)

    local_letter : str
        Term frequency weighing, one of:
            * `b` - binary,
            * `t` or `n` - raw,
            * `a` - augmented,
            * `l` - logarithm,
            * `d` - double logarithm,
            * `L` - log average.
    global_letter : str
        Document frequency weighting, one of:
            * `x` or `n` - none,
            * `f` - idf,
            * `t` - zero-corrected idf,
            * `p` - probabilistic idf.
    normalization_letter : str
        Document normalization, one of:
            * `x` or `n` - none,
            * `c` - cosine,
            * `u` - pivoted unique,
            * `b` - pivoted character length.

    Raises
    ------
    ValueError
        If `smartirs` is not a string of length 3 or one of the decomposed value
        doesn't fit the list of permissible values.
    z...\....z(?P<ddd>...)\.(?P<qqq>...)zThe notation {ddd}.{qqq} specifies two term-weighting schemes, one for collection documents ({ddd}) and one for queries ({qqq}). You must train two separate tf-idf models.dddqqq)r	   r
      z"Expected a string of length 3 got btnaldLz=Expected term frequency weight to be one of 'btnaldL', got {}xnftpzGExpected inverse document frequency weight to be one of 'xnftp', got {}xncubz:Expected normalization weight to be one of 'xncub', got {}tnx)
isinstancestrrematch
ValueErrorformatgrouplen)smartirsr   w_tfw_dfw_ns        8lib/python3.11/site-packages/gensim/models/tfidfmodel.pyresolve_weightsr      s   X (C   	
RXk8%D%D 	
6AA99?KK&&KK&& :@ : :
 
 	
 h$$ JH(: J=HIIIOD$9 gX__`deefff7 qbiijnooppp
' cU\\]`aabbb s{ s{ 
cz $;           @        c                 z    |t          j        t          |          | z            t          j        |          z  z   S )a  Compute inverse-document-frequency for a term with the given document frequency `docfreq`:
    :math:`idf = add + log_{log\_base} \frac{totaldocs}{docfreq}`

    Parameters
    ----------
    docfreq : {int, float}
        Document frequency.
    totaldocs : int
        Total number of documents.
    log_base : float, optional
        Base of logarithm.
    add : float, optional
        Offset.

    Returns
    -------
    float
        Inverse document frequency.

    )nplogfloat)docfreq	totaldocslog_baseadds       r   df2idfr+   i   s5    * i((7233bfX6F6FFFFr    c                 H      fd|                                 D             S )a`  Pre-compute the inverse document frequency mapping for all terms.

    Parameters
    ----------
    wglobal : function
        Custom function for calculating the "global" weighting function.
        See for example the SMART alternatives under :func:`~gensim.models.tfidfmodel.smartirs_wglobal`.
    dfs : dict
        Dictionary mapping `term_id` into how many documents did that term appear in.
    total_docs : int
        Total number of documents.

    Returns
    -------
    dict of (int, float)
        Inverse document frequencies in the format `{term_id_1: idfs_1, term_id_2: idfs_2, ...}`.

    c                 0    i | ]\  }}| |          S  r.   ).0termiddf
total_docswglobals      r   
<dictcomp>z#precompute_idfs.<locals>.<dictcomp>   s+    JJJ
FGGB
++JJJr    )items)r3   dfsr2   s   ` `r   precompute_idfsr7      s,    * KJJJJciikkJJJJr    c                    |dk    r| S |dk    rdt          j        |           z   S |dk    r,dt          j        dt          j        |           z             z   S |dk    rdd| z  |                     d          z  z   S |d	k    r(|                     d
                              d          S |dk    rCdt          j        |           z   dt          j        |                     d                    z   z  S dS )a<  Calculate local term weight for a term using the weighting scheme specified in `local_scheme`.

    Parameters
    ----------
    tf : int
        Term frequency.
    local : {'b', 'n', 'a', 'l', 'd', 'L'}
        Local transformation scheme.

    Returns
    -------
    float
        Calculated local weight.

    r   l   dag      ?r   )axisbboolintLN)r$   log2maxastypemean)tflocal_schemes     r   smartirs_wlocalrH      s     s B			 	B272;;		 B271rwr{{?++++		 BcBhQ/00		 Byy  ''...		 BBGBKKAQ(@(@$@AAB Br    c                     |dk    rdS |dk    rt          j        d|z  | z            S |dk    rt          j        |dz   | z            S |dk    r+t          dt          j        d|z  | z
  | z                      S dS )az  Calculate global document weight based on the weighting scheme specified in `global_scheme`.

    Parameters
    ----------
    docfreq : int
        Document frequency.
    totaldocs : int
        Total number of documents.
    global_scheme : {'n', 'f', 't', 'p'}
        Global transformation scheme.

    Returns
    -------
    float
        Calculated global weight.

    r         ?fr   pr   N)r$   rB   rC   )r'   r(   global_schemes      r   smartirs_wglobalrN      s    $  Fs	#	 FwsY0111	#	 Fw	C72333	#	 F1bgsY8GCDDEEEF Fr    z!Function will be removed in 4.0.0Fc                     |dk    r!|rt          j        | |          \  }}| |fS | S |dk    rt          j        | |          S dS )a  Normalize a vector using the normalization scheme specified in `norm_scheme`.

    Parameters
    ----------
    x : numpy.ndarray
        The tf-idf vector.
    norm_scheme : {'n', 'c'}
        Document length normalization scheme.
    return_norm : bool, optional
        Return the length of `x` as well?

    Returns
    -------
    numpy.ndarray
        Normalized array.
    float (only if return_norm is set)
        Norm of `x`.
    r   return_normcN)r   unitvec)r   norm_schemerQ   _lengths        r   smartirs_normalizerW      sn    ( c < 	 (DDDIAvf9H		 <{;;;;< <r    c            	       j     e Zd ZdZdddej        eddddf	dZe fd            Z	d Z
d Zdd
Z xZS )
TfidfModela  Objects of this class realize the transformation between word-document co-occurrence matrix (int)
    into a locally/globally weighted TF-IDF matrix (positive floats).

    Examples
    --------
    .. sourcecode:: pycon

        >>> import gensim.downloader as api
        >>> from gensim.models import TfidfModel
        >>> from gensim.corpora import Dictionary
        >>>
        >>> dataset = api.load("text8")
        >>> dct = Dictionary(dataset)  # fit dictionary
        >>> corpus = [dct.doc2bow(line) for line in dataset]  # convert corpus to BoW format
        >>>
        >>> model = TfidfModel(corpus)  # fit model
        >>> vector = model[corpus[0]]  # apply model to the first corpus document

    NTg      ?c
                     | _         |||c _         _         _        d\   _         _         _        |t          |          nd _        |	 _	        | _
        d _        |rA j        \  }
}}t          t          |
           _        t          t          |           _        |r|rt                              d           |j        |j        c _         _        |j                                         _        |j                                         _        d |                                D              _        t-           j         j         j                   _        |s| _         n|r                     |           n	 |sdS  j
         |dv rt                              d	           dS |dv r.t1           j                  rt                              d
           |dv r |s|st                              d           dS |dk    rd j        z   j        z   _
        dS |dk    r?dt3           fd|                                D                       z   j        z   _
        dS dS )u  Compute TF-IDF by multiplying a local component (term frequency) with a global component
        (inverse document frequency), and normalizing the resulting documents to unit length.
        Formula for non-normalized weight of term :math:`i` in document :math:`j` in a corpus of :math:`D` documents

        .. math:: weight_{i,j} = frequency_{i,j} * log_2 \frac{D}{document\_freq_{i}}

        or, more generally

        .. math:: weight_{i,j} = wlocal(frequency_{i,j}) * wglobal(document\_freq_{i}, D)

        so you can plug in your own custom :math:`wlocal` and :math:`wglobal` functions.

        Parameters
        ----------
        corpus : iterable of iterable of (int, int), optional
            Input corpus
        id2word : {dict, :class:`~gensim.corpora.Dictionary`}, optional
            Mapping token - id, that was used for converting input data to bag of words format.
        dictionary : :class:`~gensim.corpora.Dictionary`
            If `dictionary` is specified, it must be a `corpora.Dictionary` object and it will be used.
            to directly construct the inverse document frequency mapping (then `corpus`, if specified, is ignored).
        wlocals : callable, optional
            Function for local weighting, default for `wlocal` is :func:`~gensim.utils.identity`
            (other options: :func:`numpy.sqrt`, `lambda tf: 0.5 + (0.5 * tf / tf.max())`, etc.).
        wglobal : callable, optional
            Function for global weighting, default is :func:`~gensim.models.tfidfmodel.df2idf`.
        normalize : {bool, callable}, optional
            Normalize document vectors to unit euclidean length? You can also inject your own function into `normalize`.
        smartirs : str, optional
            SMART (System for the Mechanical Analysis and Retrieval of Text) Information Retrieval System,
            a mnemonic scheme for denoting tf-idf weighting variants in the vector space model.
            The mnemonic for representing a combination of weights takes the form XYZ,
            for example 'ntc', 'bpn' and so on, where the letters represents the term weighting of the document vector.

            Term frequency weighing:
                * `b` - binary,
                * `t` or `n` - raw,
                * `a` - augmented,
                * `l` - logarithm,
                * `d` - double logarithm,
                * `L` - log average.

            Document frequency weighting:
                * `x` or `n` - none,
                * `f` - idf,
                * `t` - zero-corrected idf,
                * `p` - probabilistic idf.

            Document normalization:
                * `x` or `n` - none,
                * `c` - cosine,
                * `u` - pivoted unique,
                * `b` - pivoted character length.

            Default is 'nfc'.
            For more information visit `SMART Information Retrieval System
            <https://en.wikipedia.org/wiki/SMART_Information_Retrieval_System>`_.
        pivot : float or None, optional
            In information retrieval, TF-IDF is biased against long documents [1]_. Pivoted document length
            normalization solves this problem by changing the norm of a document to `slope * old_norm + (1.0 -
            slope) * pivot`.

            You can either set the `pivot` by hand, or you can let Gensim figure it out automatically with the following
            two steps:

                * Set either the `u` or `b` document normalization in the `smartirs` parameter.
                * Set either the `corpus` or `dictionary` parameter. The `pivot` will be automatically determined from
                  the properties of the `corpus` or `dictionary`.

            If `pivot` is None and you don't follow steps 1 and 2, then pivoted document length normalization will be
            disabled. Default is None.

            See also the blog post at https://rare-technologies.com/pivoted-document-length-normalisation/.
        slope : float, optional
            In information retrieval, TF-IDF is biased against long documents [1]_. Pivoted document length
            normalization solves this problem by changing the norm of a document to `slope * old_norm + (1.0 -
            slope) * pivot`.

            Setting the `slope` to 0.0 uses only the `pivot` as the norm, and setting the `slope` to 1.0 effectively
            disables pivoted document length normalization. Singhal [2]_ suggests setting the `slope` between 0.2 and
            0.3 for best results. Default is 0.25.

            See also the blog post at https://rare-technologies.com/pivoted-document-length-normalisation/.

        References
        ----------
        .. [1] Singhal, A., Buckley, C., & Mitra, M. (1996). `Pivoted Document Length
           Normalization <http://singhal.info/pivoted-dln.pdf>`_. *SIGIR Forum*, 51, 176–184.
        .. [2] Singhal, A. (2001). `Modern information retrieval: A brief overview <http://singhal.info/ieee2001.pdf>`_.
           *IEEE Data Eng. Bull.*, 24(4), 35–43.

        )NNNN-q=)rG   )rM   z_constructor received both corpus and explicit inverse document frequencies; ignoring the corpusc                 4    i | ]\  }}|t          |          S r.   )r   )r/   r0   terms      r   r4   z'TfidfModel.__init__.<locals>.<dictcomp>{  s$    WWWLFDfc$iiWWWr    ubz0constructor received pivot; ignoring smartirs[2]z1constructor received smartirs; ignoring normalizezBconstructor received no corpus or dictionary; ignoring smartirs[2]urJ   r>   c              3   T   K   | ]"}j         |         j        |         d z   z  V  #dS rJ   N)cfs	term_lens)r/   r0   selfs     r   	<genexpr>z&TfidfModel.__init__.<locals>.<genexpr>  sL       # #FL DN6$:S$@A# # # # # #r    )id2wordwlocalr3   	normalizenum_docsnum_nnzidfsr   r   slopepivotepsr   rH   rN   loggerwarningrb   copyr6   r5   rc   r7   
initializecallablesumkeys)rd   corpusrf   
dictionaryrg   r3   rh   r   rm   rl   n_tfn_dfn_ns   `            r   __init__zTfidfModel.__init__  s   | 4:GY1T\4>1A.t|TY5=S111t

 	I"mOD$!/EEEDK"#34HHHDL 	  u   +5*=z?Q'DM4<!~**,,DH!~**,,DHWWJDTDTDVDVWWWDN'dhNNDI *) 	OOF####   	F: 	d{ SQRRRF$; 	P8DN33 	PNNNOOO$; 	z 	& 	NN_`````CZ 	t|+dm;DJJJCZ 	s # # # #PZP_P_PaPa# # #     DJJJ	 	r    c                 R    t          t          |           j        |i |}t          |d          sGd|_        t
                              d| j                   t
                              d|j                   t          |d          sGd|_        t
                              d| j                   t
                              d|j                   t          |d	          sGd|_	        t
                              d
| j                   t
                              d|j	                   |S )zLoad a previously saved TfidfModel class. Handles backwards compatibility from
        older TfidfModel versions which did not use pivoted document normalization.

        rm   Nz,older version of %s loaded without pivot argzSetting pivot to %s.rl   g?z,older version of %s loaded without slope argzSetting slope to %s.r   z/older version of %s loaded without smartirs argzSetting smartirs to %s.)
superrY   loadhasattrrm   ro   info__name__rl   r   )clsargskwargsmodel	__class__s       r   r~   zTfidfModel.load  s    ,j#&&+T<V<<ug&& 	=EKKKFUUUKK.<<<ug&& 	=EKKKFUUUKK.<<<uj)) 	C!ENKKI3<XXXKK15>BBBr    c                 @    | j         j        d| j        d| j        dS )Nz
<num_docs=z
, num_nnz=>)r   r   ri   rj   )rd   s    r   __str__zTfidfModel.__str__  s*    040G0G0GX\XdXdXdeer    c           
      ^   t                               d           i }d\  }}t          |          D ]_\  }}|dz  dk    rt                               d|           |t          |          z  }|D ]!\  }}|                    |d          dz   ||<   "`|dz   | _        || _        d| _        || _        d| _	        t          | j        | j        | j                  | _        |                     dd	| j         d
|r$t          |                                          dz   nd d| j         d           dS )zCompute inverse document weights, which will be used to modify term frequencies for documents.

        Parameters
        ----------
        corpus : iterable of iterable of (int, int)
            Input corpus.

        zcollecting document frequencies)r   i'  r   z!PROGRESS: processing document #%ir:   Nrr   zcalculated IDF weights for z documents and z features (z matrix non-zeros))msg)ro   r   	enumerater   getri   rj   rb   r6   term_lengthsr7   r3   rk   add_lifecycle_eventrC   ru   )rd   rv   r6   numnnzdocnobowr0   rU   s           r   rr   zTfidfModel.initialize  sr    	5666#F++ 	5 	5JE3u}! H?GGGc#hhF  5 5	!ggfa0014F5 	 #DL$(DMJJ	  ?dm ? ?cfLmCPSPXPXPZPZOO^_L_L_lm ? ?"l? ? ? 	! 	
 	
 	
 	
 	
r    r[   c                 P    | _         t          j        |          \  }}|r                     |          S g g }}|D ]/\  }}|                    |           |                    |           0                     t          j        |                    } fdt          ||          D             } j	        r j	        d         }	|	dk    s|	dv r* j
        # j
        t          j        |d          \  }
}|}n|	dk    r6 j
        t          j        |d          \  }
}nt          j        |          }n|	d	k    rt          j        |dd
          \  }
}n|	dk    rt           fd|D                       }nl j        du rt          j         _        n j        du rt          j         _         j
                             |d          \  }
}n                     |          } j
         fd|D             }n,d j        z
   j
        z   j        |z  z    fd|D             }|S )a  Get the tf-idf representation of an input vector and/or corpus.

        bow : {list of (int, int), iterable of iterable of (int, int)}
            Input document in the `sparse Gensim bag-of-words format
            <https://radimrehurek.com/gensim/intro.html#core-concepts>`_,
            or a streamed corpus of such documents.
        eps : float
            Threshold value, will remove all position that have tfidf-value less than `eps`.

        Returns
        -------
        vector : list of (int, float)
            TfIdf vector, if `bow` is a single document
        :class:`~gensim.interfaces.TransformedCorpus`
            TfIdf corpus, if `bow` is a corpus.

        c                     g | ]U\  }}t          j                            |d                     j        k    6||j                            |          z  fVS )r"   )absrk   r   rn   )r/   r0   rF   rd   s      r   
<listcomp>z*TfidfModel.__getitem__.<locals>.<listcomp>  sl     
 
 
SvWZA[A[=\=\_c_g=g
R$)--///0
 
 
r       r   r^   NTrP   rR   r_   unique)rQ   normr>   c              3   D   K   | ]\  }}|j         |         d z   z  V  dS ra   )rc   )r/   r0   freqrd   s      r   re   z)TfidfModel.__getitem__.<locals>.<genexpr>
  s9      ]]QUtt~f'='CD]]]]]]r    Fc                 N    g | ]!\  }}t          |          j        k    ||f"S r.   )r   rn   )r/   r0   weightrd   s      r   r   z*TfidfModel.__getitem__.<locals>.<listcomp>  s:    hhhQTU[Q\Q\_c_gQghFF+hhhr    r:   c                     g | ]A\  }}t          |t                    z            j        k    -||t                    z  fBS r.   )r   r&   rn   )r/   r0   r   pivoted_normrd   s      r   r   z*TfidfModel.__getitem__.<locals>.<listcomp>  sa       "FFvl 3 3344tx?%"5"556  r    )rn   r   	is_corpus_applyappendrg   r$   arrayzipr   rm   r   rS   rt   rh   identityrl   )rd   r   rn   r   termid_arraytf_arrayr0   rF   vectorrz   rU   old_normnorm_vectorr   s   `            @r   __getitem__zTfidfModel.__getitem__  s   $ --	3 	$;;s###
 "$Rh 	  	 JFB'''OOB;;rx1122
 
 
 
!,99
 
 
 = 	5-"Ccz ^cTk ^dj ^: M"*"26t"L"L"LKAx$ ^: ;"*"26t"L"L"LKAxx"*"26":":KK ^&.v4hWWW88 ^]]]]Y\]]]]]~% 0!)!15( 0!&z 5"nnVnFF88"nnV44: 	hhhh+hhhKK
Ndj84:;PPL    &,  K
 r    )r[   )r   
__module____qualname____doc__r   r   r+   r{   classmethodr~   r   rr   r   __classcell__)r   s   @r   rY   rY      s         & #DT%.4$dRVQ Q Q Qf     [(f f f!
 !
 !
FL L L L L L L Lr    rY   )r!   r"   )F)r   logging	functoolsr   r   numpyr$   gensimr   r   r   gensim.utilsr   	getLoggerr   ro   r   r+   r7   rH   rN   rW   TransformationABCrY   r.   r    r   <module>r      sS   
        				     . . . . . . . . . . # # # # # # 
	8	$	$L L L^G G G G0K K K0B B B<F F F8 /00< < < 10<:n n n n n- n n n n nr    