
    c                     6   d Z ddlZddlZddlmZ ddlZddlmZ ddl	Z	ddl
mZmZ  ej        e          Z ed          Z ed                                          Zd Zd	 Zd
 Z G d dej                  Z G d de          Z G d de          ZeZdS )a	  
Automatically detect common phrases -- aka multi-word expressions, word n-gram collocations -- from
a stream of sentences.

Inspired by:

* `Mikolov, et. al: "Distributed Representations of Words and Phrases and their Compositionality"
  <https://arxiv.org/abs/1310.4546>`_
* `"Normalized (Pointwise) Mutual Information in Collocation Extraction" by Gerlof Bouma
  <https://svn.spraakdata.gu.se/repos/gerlof/pub/www/Docs/npmi-pfd.pdf>`_


Examples
--------
.. sourcecode:: pycon

    >>> from gensim.test.utils import datapath
    >>> from gensim.models.word2vec import Text8Corpus
    >>> from gensim.models.phrases import Phrases, ENGLISH_CONNECTOR_WORDS
    >>>
    >>> # Create training corpus. Must be a sequence of sentences (e.g. an iterable or a generator).
    >>> sentences = Text8Corpus(datapath('testcorpus.txt'))
    >>> # Each sentence must be a list of string tokens:
    >>> first_sentence = next(iter(sentences))
    >>> print(first_sentence[:10])
    ['computer', 'human', 'interface', 'computer', 'response', 'survey', 'system', 'time', 'user', 'interface']
    >>>
    >>> # Train a toy phrase model on our training corpus.
    >>> phrase_model = Phrases(sentences, min_count=1, threshold=1, connector_words=ENGLISH_CONNECTOR_WORDS)
    >>>
    >>> # Apply the trained phrases model to a new, unseen sentence.
    >>> new_sentence = ['trees', 'graph', 'minors']
    >>> phrase_model[new_sentence]
    ['trees_graph', 'minors']
    >>> # The toy model considered "trees graph" a single phrase => joined the two
    >>> # tokens into a single "phrase" token, using our selected `_` delimiter.
    >>>
    >>> # Apply the trained model to each sentence of a corpus, using the same [] syntax:
    >>> for sent in phrase_model[sentences]:
    ...     pass
    >>>
    >>> # Update the model with two new sentences on the fly.
    >>> phrase_model.add_vocab([["hello", "world"], ["meow"]])
    >>>
    >>> # Export the trained model = use less RAM, faster processing. Model updates no longer possible.
    >>> frozen_model = phrase_model.freeze()
    >>> # Apply the frozen model; same results as before:
    >>> frozen_model[new_sentence]
    ['trees_graph', 'minors']
    >>>
    >>> # Save / load models.
    >>> frozen_model.save("/tmp/my_phrase_model.pkl")
    >>> model_reloaded = Phrases.load("/tmp/my_phrase_model.pkl")
    >>> model_reloaded[['trees', 'graph', 'minors']]  # apply the reloaded model to a sentence
    ['trees_graph', 'minors']

    N)log)getfullargspec)utils
interfacesz-infz; a an the  for of with without at from to in on by  and or c                 V    | |z  }|dk    rt           S ||z
  t          |          z  |z  S )aK  Bigram scoring function, based on the original `Mikolov, et. al: "Distributed Representations
    of Words and Phrases and their Compositionality" <https://arxiv.org/abs/1310.4546>`_.

    Parameters
    ----------
    worda_count : int
        Number of occurrences for first word.
    wordb_count : int
        Number of occurrences for second word.
    bigram_count : int
        Number of co-occurrences for phrase "worda_wordb".
    len_vocab : int
        Size of vocabulary.
    min_count: int
        Minimum collocation count threshold.
    corpus_word_count : int
        Not used in this particular scoring technique.

    Returns
    -------
    float
        Score for given phrase. Can be negative.

    Notes
    -----
    Formula: :math:`\frac{(bigram\_count - min\_count) * len\_vocab }{ (worda\_count * wordb\_count)}`.

    r   )NEGATIVE_INFINITYfloat)worda_countwordb_countbigram_count	len_vocab	min_countcorpus_word_countdenoms          5lib/python3.11/site-packages/gensim/models/phrases.pyoriginal_scorerr   Y   s;    : +%Ez !  9$e4y@@    c                     ||k    r\t          |          }| |z  }||z  }||z  }	 t          |||z  z            t          |           z  S # t          $ r
 t          cY S w xY wt          S )a  Calculation NPMI score based on `"Normalized (Pointwise) Mutual Information in Colocation Extraction"
    by Gerlof Bouma <https://svn.spraakdata.gu.se/repos/gerlof/pub/www/Docs/npmi-pfd.pdf>`_.

    Parameters
    ----------
    worda_count : int
        Number of occurrences for first word.
    wordb_count : int
        Number of occurrences for second word.
    bigram_count : int
        Number of co-occurrences for phrase "worda_wordb".
    len_vocab : int
        Not used.
    min_count: int
        Ignore all bigrams with total collected count lower than this value.
    corpus_word_count : int
        Total number of words in the corpus.

    Returns
    -------
    float
        If bigram_count >= min_count, return the collocation score, in the range -1 to 1.
        Otherwise return -inf.

    Notes
    -----
    Formula: :math:`\frac{ln(prop(word_a, word_b) / (prop(word_a)*prop(word_b)))}{ -ln(prop(word_a, word_b)}`,
    where :math:`prob(word) = \frac{word\_count}{corpus\_word\_count}`

    )r	   r   
ValueErrorr   )	r
   r   r   r   r   r   papbpabs	            r   npmi_scorerr   |   s    > y  !!"344,,,,..	%sb2g''3s88)33 	% 	% 	%$$$$	%
 ! s   %A A A c                     t          |           }|}	 t          |          }t          j        |g|          }n# t          $ r d| fcY S w xY wt          |t                    rd|fS || u rd|fS d| fS )a  Check whether `obj` is a single document or an entire corpus.

    Parameters
    ----------
    obj : object

    Return
    ------
    (bool, object)
        2-tuple ``(is_single_document, new_obj)`` tuple, where `new_obj`
        yields the same sequence as the original `obj`.

    Notes
    -----
    `obj` is a single document if it is an iterable of strings. It is a corpus if it is an iterable of documents.

    TF)iternext	itertoolschainStopIteration
isinstancestr)objobj_iter	temp_iterpeeks       r   
_is_singler&      s    $ CyyHIH~~?D6844   Sy $ X~C h#:s   %9 A
	A
c                   P     e Zd ZdZd Zd Zd Zd Zd Ze	 fd            Z
 xZS )_PhrasesTransformationz
    Abstract base class for :class:`~gensim.models.phrases.Phrases` and
    :class:`~gensim.models.phrases.FrozenPhrases`.

    c                 .    t          |          | _        d S )N)	frozensetconnector_words)selfr+   s     r   __init__z_PhrasesTransformation.__init__   s    (99r   c                      t          d          )zScore a single phrase candidate.

        Returns
        -------
        (str, float)
            2-tuple of ``(delimiter-joined phrase, phrase score)`` for a phrase,
            or ``(None, None)`` if not a phrase.
        z*ABC: override this method in child classes)NotImplementedError)r,   word_aword_b
in_betweens       r   score_candidatez&_PhrasesTransformation.score_candidate   s     ""NOOOr   c              #     K   dg }}|D ]m}|| j         vrD|r=|                     |||          \  }}|||fV  dg }}4|dfV  |D ]}|dfV  	|g }}J|g }}O|r|                    |           g|dfV  n|r|dfV  |D ]
}|dfV  	dS dS )a  Analyze a sentence, concatenating any detected phrases into a single token.

        Parameters
        ----------
        sentence : iterable of str
            Token sequence representing the sentence to be analyzed.

        Yields
        ------
        (str, {float, None})
            Iterate through the input sentence tokens and yield 2-tuples of:
            - ``(concatenated_phrase_tokens, score)`` for token sequences that form a phrase.
            - ``(word, None)`` if the token is not a part of a phrase.

        N)r+   r3   append)r,   sentencestart_tokenr2   wordphrasescorews           r   analyze_sentencez'_PhrasesTransformation.analyze_sentence   s@       #'Z 	% 	%D4// %  7$($8$8dJ$W$WMFE 	;$em+++26Z *4////!+ * *A"#T'MMMM26Z /3BKK %%%d++++ *$$$$ 	t####  g	 	 r   c                     t          |          \  }}|s|                     |          S d |                     |          D             S )a  Convert the input sequence of tokens ``sentence`` into a sequence of tokens where adjacent
        tokens are replaced by a single token if they form a bigram collocation.

        If `sentence` is an entire corpus (iterable of sentences rather than a single
        sentence), return an iterable that converts each of the corpus' sentences
        into phrases on the fly, one after another.

        Parameters
        ----------
        sentence : {list of str, iterable of list of str}
            Input sentence or a stream of sentences.

        Return
        ------
        {list of str, iterable of list of str}
            Sentence with phrase tokens joined by ``self.delimiter``, if input was a single sentence.
            A generator of such sentences if input was a corpus.

s        c                     g | ]\  }}|S  r?   ).0token_s      r   
<listcomp>z6_PhrasesTransformation.__getitem__.<locals>.<listcomp>.  s    FFF(%FFFr   )r&   _applyr<   )r,   r6   	is_singles      r   __getitem__z"_PhrasesTransformation.__getitem__  sT    ( )22	8 	) ;;x(((FFd&;&;H&E&EFFFFr   c                 X    i }|D ]$}|                      |          D ]\  }}||||<   %|S )a  Get all unique phrases (multi-word expressions) that appear in ``sentences``, and their scores.

        Parameters
        ----------
        sentences : iterable of list of str
            Text corpus.

        Returns
        -------
        dict(str, float)
           Unique phrases found in ``sentences``, mapped to their scores.

        Example
        -------
        .. sourcecode:: pycon

            >>> from gensim.test.utils import datapath
            >>> from gensim.models.word2vec import Text8Corpus
            >>> from gensim.models.phrases import Phrases, ENGLISH_CONNECTOR_WORDS
            >>>
            >>> sentences = Text8Corpus(datapath('testcorpus.txt'))
            >>> phrases = Phrases(sentences, min_count=1, threshold=0.1, connector_words=ENGLISH_CONNECTOR_WORDS)
            >>>
            >>> for phrase, score in phrases.find_phrases(sentences).items():
            ...     print(phrase, score)
        )r<   )r,   	sentencesresultr6   r9   r:   s         r   find_phrasesz#_PhrasesTransformation.find_phrases0  sW    6 ! 	+ 	+H!%!6!6x!@!@ + + +%*F6N+ r   c                   
  t          t          |           j        |i |
	 t          
di           }t	          t          |                                                    \  }}t          |t                    r&
fd|                                D             
_	        n:t          |t                    r%
fd|                                D             
_	        n# t          $ r Y nw xY wt          
d          sFt                              d| j                   t                              d           t          
_        t          
d          rt          
j        t"                    rĉ
j        dk    rGt                              d| j                   t                              d	           t          
_        nr
j        d
k    rGt                              d| j                   t                              d           t$          
_        n t'          d| j         d
j         d          t          
d          sRt          
d          r
j        
_        
`n3t                              d| j                   t-                      
_        t          
d          sAt                              d| j                   t                              d           d
_        t          
dd          rt	          t          
j                            }t          |t"                    st                              d| j        t5          
j                             t                              d           i }
j                                        D ]\  }}	|	|t#          |d          <   |
_        t          
j        t"                    st#          
j        d          
_        
S )a  Load a previously saved :class:`~gensim.models.phrases.Phrases` /
        :class:`~gensim.models.phrases.FrozenPhrases` model.

        Handles backwards compatibility from older versions which did not support pluggable scoring functions.

        Parameters
        ----------
        args : object
            See :class:`~gensim.utils.SaveLoad.load`.
        kwargs : object
            See :class:`~gensim.utils.SaveLoad.load`.

        phrasegramsc                 v    i | ]5\  }}t          j                            |          d           |d         6S )utf8encoding   r!   	delimiterjoinr@   keyvalmodels      r   
<dictcomp>z/_PhrasesTransformation.load.<locals>.<dictcomp>i  sP     % % % S ,,S11FCCCSV% % %r   c                 j    i | ]/\  }}t          j                            |          d           |0S )rN   rO   rR   rU   s      r   rY   z/_PhrasesTransformation.load.<locals>.<dictcomp>n  sL     % % % S ,,S11FCCCS% % %r   scoringz3older version of %s loaded without scoring functionzEsetting pluggable scoring method to original_scorer for compatibilitydefaultz;older version of %s loaded with "default" scoring parameterz;setting scoring method to original_scorer for compatibilitynpmiz8older version of %s loaded with "npmi" scoring parameterz7setting scoring method to npmi_scorer for compatibilityzfailed to load z model, unknown scoring ""r+   common_termszCloaded older version of %s, setting connector_words to an empty setr   z4older version of %s loaded without corpus_word_countzFsetting corpus_word_count to 0, do not use it in your scoring functionr   vocabNz6old version of %s loaded, upgrading %i words in memoryz<re-save the loaded model to avoid this upgrade in the futurerN   rO   )superr(   loadgetattrr   r   itemsr    tuplerL   r   hasattrloggerwarning__name__r   r[   r!   r   r   r_   r+   r*   r   r`   infolenrS   )clsargskwargsrL   	componentr:   r8   r`   rV   valuerX   	__class__s             @r   rb   z_PhrasesTransformation.loadR  s    8,c227HHH	!%;;K#D):):)<)<$=$=>>Iu%'' 
% % % %$/$5$5$7$7% % %!! Iu-- % % % %$/$5$5$7$7% % %!  	 	 	D	
 ui(( 	,NNPRUR^___NNbccc+EM5)$$ 	p%--- 
p=I- 	pNN#`bebnoooNN#`aaa$3EMM]f, pNN#]_b_klllNN#\]]]$/EMM$%ns|%n%n^c^k%n%n%nooo u/00 	4un-- 4(-(:%&&dfifrsss(1%u122 	(NNQSVS_```NNcddd&'E# 5'4(( 	$U[))**DdC(( $TVYVbdghmhsdtdtuuuZ[[["'+"3"3"5"5 = =JC7<E#cF33344#%/3// 	D!%/FCCCEOs   B7C 
C)(C))ri   
__module____qualname____doc__r-   r3   r<   rF   rJ   classmethodrb   __classcell__rq   s   @r   r(   r(      s         
: : :	P 	P 	P0 0 0dG G G8     D P P P P [P P P P Pr   r(   c            	       x     e Zd ZdZddddddd e            f fd		Zd
 Zed             Zd Z	d Z
d Zd Z xZS )Phrasesz+Detect phrases based on collocation counts.N   g      $@i ZbrB   i'  r\   c	                    t                                          |           |dk    rt          d          |dk    rdk    rt          d          dk    r|dk     s|dk    rt          d	          t          t                    r/dk    rt
          n!dk    rt          nt          d
 d          g d}	t                    r*fd|	D             }
|
s| _        nt          d|
           || _	        || _
        || _        i | _        d| _        || _        || _        d| _        	 t#          j        t#          j        | j                             n5# t"          j        $ r# t#          j        d| j        j         d          w xY w|]t/          j                    }|                     |           |                     dd|  dt/          j                    |z
  dd           dS dS )a  

        Parameters
        ----------
        sentences : iterable of list of str, optional
            The `sentences` iterable can be simply a list, but for larger corpora, consider a generator that streams
            the sentences directly from disk/network, See :class:`~gensim.models.word2vec.BrownCorpus`,
            :class:`~gensim.models.word2vec.Text8Corpus` or :class:`~gensim.models.word2vec.LineSentence`
            for such examples.
        min_count : float, optional
            Ignore all words and bigrams with total collected count lower than this value.
        threshold : float, optional
            Represent a score threshold for forming the phrases (higher means fewer phrases).
            A phrase of words `a` followed by `b` is accepted if the score of the phrase is greater than threshold.
            Heavily depends on concrete scoring-function, see the `scoring` parameter.
        max_vocab_size : int, optional
            Maximum size (number of tokens) of the vocabulary. Used to control pruning of less common words,
            to keep memory under control. The default of 40M needs about 3.6GB of RAM. Increase/decrease
            `max_vocab_size` depending on how much available memory you have.
        delimiter : str, optional
            Glue character used to join collocation tokens.
        scoring : {'default', 'npmi', function}, optional
            Specify how potential phrases are scored. `scoring` can be set with either a string that refers to a
            built-in scoring function, or with a function with the expected parameter names.
            Two built-in scoring functions are available by setting `scoring` to a string:

            #. "default" - :func:`~gensim.models.phrases.original_scorer`.
            #. "npmi" - :func:`~gensim.models.phrases.npmi_scorer`.
        connector_words : set of str, optional
            Set of words that may be included within a phrase, without affecting its scoring.
            No phrase can start nor end with a connector word; a phrase may contain any number of
            connector words in the middle.

            **If your texts are in English, set** ``connector_words=phrases.ENGLISH_CONNECTOR_WORDS``.

            This will cause phrases to include common English articles, prepositions and
            conjuctions, such as `bank_of_america` or `eye_of_the_beholder`.

            For other languages or specific applications domains, use custom ``connector_words``
            that make sense there: ``connector_words=frozenset("der die das".split())`` etc.

        Examples
        --------
        .. sourcecode:: pycon

            >>> from gensim.test.utils import datapath
            >>> from gensim.models.word2vec import Text8Corpus
            >>> from gensim.models.phrases import Phrases, ENGLISH_CONNECTOR_WORDS
            >>>
            >>> # Load corpus and train a model.
            >>> sentences = Text8Corpus(datapath('testcorpus.txt'))
            >>> phrases = Phrases(sentences, min_count=1, threshold=1, connector_words=ENGLISH_CONNECTOR_WORDS)
            >>>
            >>> # Use the model to detect phrases in a new sentence.
            >>> sent = [u'trees', u'graph', u'minors']
            >>> print(phrases[sent])
            [u'trees_graph', u'minors']
            >>>
            >>> # Or transform multiple sentences at once.
            >>> sents = [[u'trees', u'graph', u'minors'], [u'graph', u'minors']]
            >>> for phrase in phrases[sents]:
            ...     print(phrase)
            [u'trees_graph', u'minors']
            [u'graph_minors']
            >>>
            >>> # Export a FrozenPhrases object that is more efficient but doesn't allow any more training.
            >>> frozen_phrases = phrases.freeze()
            >>> print(frozen_phrases[sent])
            [u'trees_graph', u'minors']

        Notes
        -----

        The ``scoring="npmi"`` is more robust when dealing with common words that form part of common bigrams, and
        ranges from -1 to 1, but is slower to calculate than the default ``scoring="default"``.
        The default is the PMI-like scoring as described in `Mikolov, et. al: "Distributed
        Representations of Words and Phrases and their Compositionality" <https://arxiv.org/abs/1310.4546>`_.

        To use your own custom ``scoring`` function, pass in a function with the following signature:

        * ``worda_count`` - number of corpus occurrences in `sentences` of the first token in the bigram being scored
        * ``wordb_count`` - number of corpus occurrences in `sentences` of the second token in the bigram being scored
        * ``bigram_count`` - number of occurrences in `sentences` of the whole bigram
        * ``len_vocab`` - the number of unique tokens in `sentences`
        * ``min_count`` - the `min_count` setting of the Phrases class
        * ``corpus_word_count`` - the total number of tokens (non-unique) in `sentences`

        The scoring function must accept all these parameters, even if it doesn't use them in its scoring.

        The scoring function **must be pickleable**.

        )r+   r   zmin_count should be at least 1r\   z0threshold should be positive for default scoringr]   rQ   z5threshold should be between -1 and 1 for npmi scoringzunknown scoring method string z
 specifiedr
   r   r   r   r   r   c                 B    g | ]}|t                    d          v|S )r   )
getargspec)r@   paramr[   s     r   rC   z$Phrases.__init__.<locals>.<listcomp>"  s2    ```ET[I\I\]^I_<_`u```r   z-scoring function missing expected parameters zCustom scoring function in z must be pickle-ableNcreatedzbuilt  in .2fsmsg)ra   r-   r   r    r!   r   r   callabler[   r   	thresholdmax_vocab_sizer`   
min_reducerS   progress_perr   pickleloadsdumpsPickleErrorrq   ri   time	add_vocabadd_lifecycle_event)r,   rH   r   r   r   rS   r   r[   r+   scoring_paramsmissingstartrq   s          `    r   r-   zPhrases.__init__  su   B 	999> 	?=>>>> 	Qg2 	QOPPPf 	V)b. 	VIM 	VTUUU
 gs## 	W)# W)F" W% !U'!U!U!UVVV
 
 
 G 	\````.```G \& !ZQX!Z!Z[[["",
"(!"	rLdl334444! 	r 	r 	r$%p4>CZ%p%p%pqqq	r  	cIKKENN9%%%$$Y4aT4a4aty{{UZGZ4a4a4a4a$bbbbb	c 	cs   0+E 2Fc                 l    d| j         j        t          | j                  | j        | j        | j        fz  S )Nz;%s<%i vocab, min_count=%s, threshold=%s, max_vocab_size=%s>)rq   ri   rk   r`   r   r   r   r,   s    r   __str__zPhrases.__str__<  s5    LN#S__dnND/P
 
 	
r   c           	         d\  }}}i }t                               d           t          |           D ]\  }}	||z  dk    r*t                               d||t          |                     dg }}
|	D ]}||vrl|                    |d          dz   ||<   |
It          j        |
g||g          }|                    |          }|                    |d          dz   ||<   |g }}
n|
|                    |           |dz  }t          |          |k    rt          j
        ||           |dz  }t                               dt          |          ||dz              |||fS )z@Collect unigram and bigram counts from the `sentences` iterable.)r|   r   rQ   z%collecting all words and their countsr   z?PROGRESS: at sentence #%i, processed %i words and %i word typesNrQ   zWcollected %i token types (unigram + bigrams) from a corpus of %i words and %i sentences)rg   rj   	enumeraterk   getr   r   rT   r5   r   prune_vocab)rH   r   rS   r+   r   sentence_nototal_wordsr   r`   r6   r7   r2   r8   phrase_tokensjoined_phrase_tokens                  r   _learn_vocabzPhrases._learn_vocabB  s    08,[*;<<<%.y%9%9 	  	 !K\)Q. Uc%jj   '+BK  
! 
!. ,"'))D!"4"4q"8E$K" [(1zTXSY(Z(Z.7nn].K.K+5:YY?RTU5V5VYZ5Z12.2BKK  ,%%d+++q 5zzN*  !%444a
eJJ[1_	
 	
 	
 5+--r   c                 |   |                      || j        | j        | j        | j                  \  }}}| xj        |z  c_        | j        rt                              dt          |          |            t          | j        |          | _        |                                D ]+\  }}| j                            |d          |z   | j        |<   ,t          | j                  | j        k    r/t          j        | j        | j                   | xj        dz  c_        n|| _        t                              d|            dS )a~  Update model parameters with new `sentences`.

        Parameters
        ----------
        sentences : iterable of list of str
            Text corpus to update this model's parameters from.

        Example
        -------
        .. sourcecode:: pycon

            >>> from gensim.test.utils import datapath
            >>> from gensim.models.word2vec import Text8Corpus
            >>> from gensim.models.phrases import Phrases, ENGLISH_CONNECTOR_WORDS
            >>>
            >>> # Train a phrase detector from a text corpus.
            >>> sentences = Text8Corpus(datapath('testcorpus.txt'))
            >>> phrases = Phrases(sentences, connector_words=ENGLISH_CONNECTOR_WORDS)  # train model
            >>> assert len(phrases.vocab) == 37
            >>>
            >>> more_sentences = [
            ...     [u'the', u'mayor', u'of', u'new', u'york', u'was', u'there'],
            ...     [u'machine', u'learning', u'can', u'be', u'new', u'york', u'sometimes'],
            ... ]
            >>>
            >>> phrases.add_vocab(more_sentences)  # add new sentences to model
            >>> assert len(phrases.vocab) == 60

        )r   rS   r   r+   zmerging %i counts into %sr   rQ   z	merged %sN)r   r   rS   r   r+   r   r`   rg   rj   rk   maxr   rd   r   r   r   )r,   rH   r   r`   r   r8   counts          r   r   zPhrases.add_vocabe  s1   F *.):):d&9T^*D<P *; *
 *
&
E;
 	+-: 	KK3SZZFFF!$/:>>DO${{}} C Ce#':>>$#:#:U#B
4  4:!44 %!$*do>>>1$ DJK&&&&&r   c                    | j                             |d          }|dk    rdS | j                             |d          }|dk    rdS | j                            |g|z   |gz             }| j                             |d          }|dk    rdS |                     |||t          | j                   | j        | j                  }|| j        k    rdS ||fS )Nr   NNr}   )	r`   r   rS   rT   r[   rk   r   r   r   )	r,   r0   r1   r2   
word_a_cnt
word_b_cntr9   
phrase_cntr:   s	            r   r3   zPhrases.score_candidate  s    Z^^FA..
? 	:Z^^FA..
? 	:$$fX
%:fX%EFFZ^^FA..
? 	:"
$*ooSWSi  
 
 DN" 	:u}r   c                      t          |           S )a  
        Return an object that contains the bare minimum of information while still allowing
        phrase detection. See :class:`~gensim.models.phrases.FrozenPhrases`.

        Use this "frozen model" to dramatically reduce RAM footprint if you don't plan to
        make any further changes to your `Phrases` model.

        Returns
        -------
        :class:`~gensim.models.phrases.FrozenPhrases`
            Exported object that's smaller, faster, but doesn't support model updates.

        )FrozenPhrasesr   s    r   freezezPhrases.freeze  s     T"""r   c                     i | j         }}|D ]e}|                    | j                  }t          |          dk     r0|                     |d         |d         |dd                   \  }}||||<   f|S )zExtract all found phrases.

        Returns
        ------
        dict(str, float)
            Mapping between phrases and their scores.

           r   r|   rQ   )r`   splitrS   rk   r3   )r,   rI   source_vocabrA   unigramsr9   r:   s          r   export_phraseszPhrases.export_phrases  s      "4:! 	' 	'E{{4>22H8}}q   00!hrlHUVWYUYN[[MFE '!&vr   )ri   rr   rs   rt   r*   r-   r   staticmethodr   r   r3   r   r   rv   rw   s   @r   ry   ry     s        55 !A#syy{{Qc Qc Qc Qc Qc Qcf
 
 
  .  . \ .D5' 5' 5'n  2# # #       r   ry   c                   $    e Zd ZdZd Zd Zd ZdS )r   az  Minimal state & functionality exported from a trained :class:`~gensim.models.phrases.Phrases` model.

    The goal of this class is to cut down memory consumption of `Phrases`, by discarding model state
    not strictly needed for the phrase detection task.

    Use this instead of `Phrases` if you do not need to update the bigram statistics with new documents any more.

    c           
      x   |j         | _         |j        | _        |j        | _        |j        | _        |j        | _        t
                              d|           t          j                    }|                                | _	        | 
                    dd|  d| dt          j                    |z
  dd           d	S )
a  

        Parameters
        ----------
        phrases_model : :class:`~gensim.models.phrases.Phrases`
            Trained phrases instance, to extract all phrases from.

        Notes
        -----
        After the one-time initialization, a :class:`~gensim.models.phrases.FrozenPhrases` will be much
        smaller and faster than using the full :class:`~gensim.models.phrases.Phrases` model.

        Examples
        ----------
        .. sourcecode:: pycon

            >>> from gensim.test.utils import datapath
            >>> from gensim.models.word2vec import Text8Corpus
            >>> from gensim.models.phrases import Phrases, ENGLISH_CONNECTOR_WORDS
            >>>
            >>> # Load corpus and train a model.
            >>> sentences = Text8Corpus(datapath('testcorpus.txt'))
            >>> phrases = Phrases(sentences, min_count=1, threshold=1, connector_words=ENGLISH_CONNECTOR_WORDS)
            >>>
            >>> # Export a FrozenPhrases object that is more efficient but doesn't allow further training.
            >>> frozen_phrases = phrases.freeze()
            >>> print(frozen_phrases[sent])
            [u'trees_graph', u'minors']

        zexporting phrases from %sr   z	exported z from r   r   r   r   N)r   r   rS   r[   r+   rg   rj   r   r   rL   r   )r,   phrases_modelr   s      r   r-   zFrozenPhrases.__init__  s    > '0&0&0$,,</???	(7799  0uD0u0u0u0u[_[d[f[fin[n0u0u0u0u vvvvvr   c                 `    d| j         j        t          | j                  | j        | j        fz  S )Nz*%s<%i phrases, min_count=%s, threshold=%s>)rq   ri   rk   rL   r   r   r   s    r   r   zFrozenPhrases.__str__  s1    ;N#S)9%:%:DNDN?
 
 	
r   c                     | j                             |g|z   |gz             }| j                            |t                    }|| j        k    r||fS dS )Nr   )rS   rT   rL   r   r   r   )r,   r0   r1   r2   r9   r:   s         r   r3   zFrozenPhrases.score_candidate  s\    $$fX
%:fX%EFF $$V->??4>! 	!5= zr   N)ri   rr   rs   rt   r-   r   r3   r?   r   r   r   r     sO         'w 'w 'wR
 
 

    r   r   )rt   loggingr   mathr   r   inspectr   r   r   gensimr   r   	getLoggerri   rg   r	   r   r*   r   ENGLISH_CONNECTOR_WORDSr   r   r&   TransformationABCr(   ry   r   Phraserr?   r   r   <module>r      s  8 8t             0 0 0 0 0 0  $ $ $ $ $ $ $ $ 
	8	$	$E&MM 
 $) UWW	   A  A  AF+! +! +!\! ! !HU U U U UZ9 U U Upp p p p p$ p p pf	= = = = =* = = =@ r   