
    cB                         d Z ddlmZmZ ddlmZ ddlZddlZddlm	Z	m
Z
 ddlZ ej        e          Z G d de	j        e          Z G d	 d
e          Z G d de          Z G d de          ZdS )a  This module implements functionality related to the `Okapi Best Matching
<https://en.wikipedia.org/wiki/Okapi_BM25>`_ class of bag-of-words vector space models.

Robertson and Zaragoza [1]_ describe the original algorithm and its modifications.

.. [1] Robertson S., Zaragoza H. (2015). `The Probabilistic Relevance Framework: BM25 and
   Beyond, <http://www.staff.city.ac.uk/~sb317/papers/foundations_bm25_review.pdf>`_.

    )ABCMetaabstractmethod)defaultdictN)
interfacesutilsc                   L    e Zd ZdZddZed             Zed             Zd ZdS )BM25ABCzObjects of this abstract class realize the transformation between word-document co-occurrence
    matrix (int) into a BM25 matrix (positive floats). Concrete subclasses of this abstract class
    implement different BM25 scoring functions.

    Nc                 "   d\  | _         | _        |rx|rt                              d           t	          |j                                                  }||j        z  | _         |                     |j	        |j                  | _        dS |rt          d           }d}d}|D ]E}|t          |          z  }t          d |D                       D ]}||xx         dz  cc<   |dz  }F||z  | _         |                     ||          | _        dS dS )a  Pre-compute the average length of a document and inverse term document frequencies,
        which will be used to weight term frequencies for the documents.

        Parameters
        ----------
        corpus : iterable of iterable of (int, int) or None, optional
            An input corpus, which will be used to compute the average length of a document and
            inverse term document frequencies. If None, then `dictionary` will be used to compute
            the statistics. If both `corpus` and `dictionary` are None, the statistics will be left
            unintialized. Default is None.
        dictionary : :class:`~gensim.corpora.Dictionary`
            An input dictionary, which will be used to compute the average length of a document and
            inverse term document frequencies.  If None, then `corpus` will be used to compute the
            statistics. If both `corpus` and `dictionary` are None, the statistics will be left
            unintialized. Default is None.

        Attributes
        ----------
        avgdl : float
            The average length of a document.
        idfs : dict of (int, float)
            A mapping from term ids to inverse term document frequencies.

        NNzDconstructor received both corpus and dictionary; ignoring the corpusc                      dS )Nr    r       7lib/python3.11/site-packages/gensim/models/bm25model.py<lambda>z"BM25ABC.__init__.<locals>.<lambda>A   s    a r   r   c              3       K   | ]	\  }}|V  
d S Nr   ).0term_id_s      r   	<genexpr>z#BM25ABC.__init__.<locals>.<genexpr>F   s&      "A"Azw7"A"A"A"A"A"Ar      N)avgdlidfsloggerwarningsumcfsvaluesnum_docsprecompute_idfsdfsr   lenset)selfcorpus
dictionary
num_tokensr!   r   bowr   s           r   __init__zBM25ABC.__init__    s5   2 !+
DI 	 gefffZ^224455J#j&99DJ,,Z^Z=PQQDIII 	ii((CJH  c#hh&
""A"AS"A"A"AAA & &GLLLA%LLLLA#h.DJ,,S(;;DIIIDr   c                     dS )a  Precompute inverse term document frequencies, which will be used to weight term frequencies
        for the documents.

        Parameters
        ----------
        dfs : dict of (int, int)
            A mapping from term ids to term document frequencies.
        num_docs : int
            The total number of documents in the training corpus.

        Returns
        -------
        idfs : dict of (int, float)
            A mapping from term ids to inverse term document frequencies.

        Nr   )r$   r!   r   s      r   r    zBM25ABC.precompute_idfsN   s	    $ 	r   c                     dS )a  Compute vector space weights for a set of terms in a document.

        Parameters
        ----------
        num_tokens : int
            The number of tokens in the document.
        term_frequencies : ndarray
            1D array of term frequencies.
        idfs : ndarray
            1D array of inverse term document frequencies.

        Returns
        -------
        term_weights : ndarray
            1D array of vector space weights.

        Nr   )r$   r'   term_frequenciesr   s       r   get_term_weightszBM25ABC.get_term_weightsb   s	    & 	r   c                    t          j        |          \  }}|r|                     |          S t          d |D                       }g g g }}}|D ]^\  }}|                    |           |                    |           |                    | j                            |          pd           _t          j        |          t          j        |          }}| 	                    |||          }	d t          ||	          D             }
|
S )Nc              3       K   | ]	\  }}|V  
d S r   r   )r   r   freqs      r   r   z&BM25ABC.__getitem__.<locals>.<genexpr>|   s&      77-'4777777r   g        c                 6    g | ]\  }}|t          |          fS r   )float)r   r   weights      r   
<listcomp>z'BM25ABC.__getitem__.<locals>.<listcomp>   s7     
 
 
 eFmm$
 
 
r   )r   	is_corpus_applyr   appendr   getnparrayr-   zip)r$   r(   r5   r'   term_idsr,   r   r   term_frequencyterm_weightsvectors              r   __getitem__zBM25ABC.__getitem__w   s!   --	3 	$;;s###77377777
+-r2D"'* 	7 	7#G^OOG$$$##N333KK	g..5#6666!#*:!;!;RXd^^$,,Z9I4PP
 
 8\**
 
 

 r   r   )	__name__
__module____qualname____doc__r)   r   r    r-   r@   r   r   r   r	   r	      sx         
, , , ,\   ^&   ^(    r   r	   )	metaclassc                   0     e Zd ZdZd	 fd	Zd Zd Z xZS )
OkapiBM25Modelav  The original Okapi BM25 scoring function of Robertson et al. [2]_.

    Examples
    --------
    .. sourcecode:: pycon

        >>> from gensim.corpora import Dictionary
        >>> from gensim.models import OkapiBM25Model
        >>> from gensim.test.utils import common_texts
        >>>
        >>> dictionary = Dictionary(common_texts)  # fit dictionary
        >>> model = OkapiBM25Model(dictionary=dictionary)  # fit model
        >>>
        >>> corpus = [dictionary.doc2bow(line) for line in common_texts]  # convert corpus to BoW format
        >>> vector = model[corpus[0]]  # apply model to the first corpus document

    References
    ----------
    .. [2] Robertson S. E., Walker S., Jones S., Hancock-Beaulieu M. M., Gatford M. (1995).
       `Okapi at TREC-3 <http://research.microsoft.com/pubs/67649/okapi_trec3.pdf>`_.
       *NIST Special Publication 500-226*.

    N      ?      ?      ?c                 x    |||c| _         | _        | _        t                                          ||           dS )u
  Pre-compute the average length of a document and inverse term document frequencies,
        which will be used to weight term frequencies for the documents.

        Parameters
        ----------
        corpus : iterable of iterable of (int, int) or None, optional
            An input corpus, which will be used to compute the average length of a document and
            inverse term document frequencies. If None, then `dictionary` will be used to compute
            the statistics. If both `corpus` and `dictionary` are None, the statistics will be left
            unintialized. Default is None.
        dictionary : :class:`~gensim.corpora.Dictionary`
            An input dictionary, which will be used to compute the average length of a document and
            inverse term document frequencies.  If None, then `corpus` will be used to compute the
            statistics. If both `corpus` and `dictionary` are None, the statistics will be left
            unintialized. Default is None.
        k1 : float
            A positive tuning parameter that determines the impact of the term frequency on its BM25
            weight. Singhal [5]_ suggests to set `k1` between 1.0 and 2.0. Default is 1.5.
        b : float
            A tuning parameter between 0.0 and 1.0 that determines the document length
            normalization: 1.0 corresponds to full document normalization, while 0.0 corresponds to
            no length normalization. Singhal [5]_ suggests to set `b` to 0.75, which is the default.
        epsilon : float
            A positive tuning parameter that lower-bounds an inverse document frequency.
            Defaults to 0.25.

        Attributes
        ----------
        k1 : float
            A positive tuning parameter that determines the impact of the term frequency on its BM25
            weight. Singhal [3]_ suggests to set `k1` between 1.0 and 2.0. Default is 1.5.
        b : float
            A tuning parameter between 0.0 and 1.0 that determines the document length
            normalization: 1.0 corresponds to full document normalization, while 0.0 corresponds to
            no length normalization. Singhal [3]_ suggests to set `b` to 0.75, which is the default.
        epsilon : float
            A positive tuning parameter that lower-bounds an inverse document frequency.
            Defaults to 0.25.

        References
        ----------
        .. [3] Singhal, A. (2001). `Modern information retrieval: A brief overview
           <http://singhal.info/ieee2001.pdf>`_. *IEEE Data Eng. Bull.*, 24(4), 35–43.

        N)k1bepsilonsuperr)   )r$   r%   r&   rL   rM   rN   	__class__s         r   r)   zOkapiBM25Model.__init__   s<    \ )+Aw%,,,,,r   c                 X   d}t                      }g }|                                D ]\\  }}t          j        ||z
  dz             t          j        |dz             z
  }|||<   ||z  }|dk     r|                    |           ]|t          |          z  }	| j        |	z  }
|D ]}|
||<   |S )Nr         ?)dictitemsmathlogr7   r"   rN   )r$   r!   r   idf_sumr   negative_idfsr   r0   idfaverage_idfepss              r   r    zOkapiBM25Model.precompute_idfs   s    vv YY[[ 	. 	.MGT(8d?S011DHTCZ4H4HHCDMsNGQw .$$W---D		)l[($ 	  	 GDMMr   c                 x    ||| j         dz   z  || j         d| j        z
  | j        |z  | j        z  z   z  z   z  z  }|S Nr   rL   rM   r   r$   r'   r,   r   r>   s        r   r-   zOkapiBM25Model.get_term_weights   e    /47Q;?!1DGq46zDF@JMKMQZMX @X 5Y "Y Z [ r   )NNrH   rI   rJ   rA   rB   rC   rD   r)   r    r-   __classcell__rP   s   @r   rG   rG      sf         ./- /- /- /- /- /-b  $      r   rG   c                   0     e Zd ZdZd fd	Zd Zd Z xZS )	LuceneBM25Modelu  The scoring function of Apache Lucene 8 [4]_.

    Examples
    --------
    .. sourcecode:: pycon

        >>> from gensim.corpora import Dictionary
        >>> from gensim.models import LuceneBM25Model
        >>> from gensim.test.utils import common_texts
        >>>
        >>> dictionary = Dictionary(common_texts)  # fit dictionary
        >>> corpus = [dictionary.doc2bow(line) for line in common_texts]  # convert corpus to BoW format
        >>>
        >>> model = LuceneBM25Model(dictionary=dictionary)  # fit model
        >>> vector = model[corpus[0]]  # apply model to the first corpus document

    References
    ----------
    .. [4] Kamphuis, C., de Vries, A. P., Boytsov, L., Lin, J. (2020). Which
       BM25 Do You Mean? `A Large-Scale Reproducibility Study of Scoring Variants
       <https://doi.org/10.1007/978-3-030-45442-5_4>`_. In: Advances in Information Retrieval.
       28–34.

    NrH   rI   c                 j    ||c| _         | _        t                                          ||           dS a   Pre-compute the average length of a document and inverse term document frequencies,
        which will be used to weight term frequencies for the documents.

        Parameters
        ----------
        corpus : iterable of iterable of (int, int) or None, optional
            An input corpus, which will be used to compute the average length of a document and
            inverse term document frequencies. If None, then `dictionary` will be used to compute
            the statistics. If both `corpus` and `dictionary` are None, the statistics will be left
            unintialized. Default is None.
        dictionary : :class:`~gensim.corpora.Dictionary`
            An input dictionary, which will be used to compute the average length of a document and
            inverse term document frequencies.  If None, then `corpus` will be used to compute the
            statistics. If both `corpus` and `dictionary` are None, the statistics will be left
            unintialized. Default is None.
        k1 : float
            A positive tuning parameter that determines the impact of the term frequency on its BM25
            weight. Singhal [5]_ suggests to set `k1` between 1.0 and 2.0. Default is 1.5.
        b : float
            A tuning parameter between 0.0 and 1.0 that determines the document length
            normalization: 1.0 corresponds to full document normalization, while 0.0 corresponds to
            no length normalization. Singhal [5]_ suggests to set `b` to 0.75, which is the default.

        Attributes
        ----------
        k1 : float
            A positive tuning parameter that determines the impact of the term frequency on its BM25
            weight. Singhal [3]_ suggests to set `k1` between 1.0 and 2.0. Default is 1.5.
        b : float
            A tuning parameter between 0.0 and 1.0 that determines the document length
            normalization: 1.0 corresponds to full document normalization, while 0.0 corresponds to
            no length normalization. Singhal [3]_ suggests to set `b` to 0.75, which is the default.

        NrL   rM   rO   r)   r$   r%   r&   rL   rM   rP   s        r   r)   zLuceneBM25Model.__init__
  5    F a,,,,,r   c                     t                      }|                                D ]9\  }}t          j        |dz             t          j        |dz             z
  }|||<   :|S )Ng      ?rR   rS   rT   rU   rV   r$   r!   r   r   r   r0   rY   s          r   r    zLuceneBM25Model.precompute_idfs0  s\    vv YY[[ 	  	 MGT(8c>**TXdSj-A-AACDMMr   c                 b    |||| j         d| j        z
  | j        |z  | j        z  z   z  z   z  z  }|S r]   r^   r_   s        r   r-   z LuceneBM25Model.get_term_weights7  sZ    /!1DGq46zDF@JMKMQZMX @X 5Y "Y Z [ r   NNrH   rI   ra   rc   s   @r   re   re      sf         0$- $- $- $- $- $-L        r   re   c                   0     e Zd ZdZd fd	Zd Zd Z xZS )	AtireBM25Modelu  The scoring function of Trotman et al. [5]_.

    Examples
    --------
    .. sourcecode:: pycon

        >>> from gensim.corpora import Dictionary
        >>> from gensim.models import AtireBM25Model
        >>> from gensim.test.utils import common_texts
        >>>
        >>> dictionary = Dictionary(common_texts)  # fit dictionary
        >>> corpus = [dictionary.doc2bow(line) for line in common_texts]  # convert corpus to BoW format
        >>>
        >>> model = AtireBM25Model(dictionary=dictionary)  # fit model
        >>> vector = model[corpus[0]]  # apply model to the first corpus document

    References
    ----------
    .. [5] Trotman, A., Jia X., Crane M., `Towards an Efficient and Effective Search Engine
       <http://www.cs.otago.ac.nz/homepages/andrew/involvement/2012-SIGIR-OSIR.pdf#page=45>`_,
       In: SIGIR 2012 Workshop on Open Source Information Retrieval. 40–47.

    NrH   rI   c                 j    ||c| _         | _        t                                          ||           dS rg   rh   ri   s        r   r)   zAtireBM25Model.__init__V  rj   r   c                     t                      }|                                D ]3\  }}t          j        |          t          j        |          z
  }|||<   4|S r   rl   rm   s          r   r    zAtireBM25Model.precompute_idfs|  sR    vv YY[[ 	  	 MGT(8$$tx~~5CDMMr   c                 x    ||| j         dz   z  || j         d| j        z
  | j        |z  | j        z  z   z  z   z  z  }|S r]   r^   r_   s        r   r-   zAtireBM25Model.get_term_weights  r`   r   ro   ra   rc   s   @r   rq   rq   >  sf         .$- $- $- $- $- $-L        r   rq   )rD   abcr   r   collectionsr   loggingrU   gensimr   r   numpyr9   	getLoggerrA   r   TransformationABCr	   rG   re   rq   r   r   r   <module>r|      sg    ( ' ' ' ' ' ' ' # # # # # #   $ $ $ $ $ $ $ $     
	8	$	$r r r r rj*g r r r rj_ _ _ _ _W _ _ _DJ J J J Jg J J JZI I I I IW I I I I Ir   