U
    Lep                  	   @   s2  d Z ddlZddlZddlmZmZmZ ddlmZ ddl	m
Z
 ddlmZ ddlmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZ ddlmZ ddlmZmZ eddddddddgZG dd dZG dd dZG dd dZ G dd dZ!G dd de!Z"d d! Z#e$d"kr e#  dddddgZ%dS )#a  
This module brings together a variety of NLTK functionality for
text analysis, and provides simple, interactive interfaces.
Functionality includes: concordancing, collocation discovery,
regular expression search over tokenized strings, and
distributional similarity.
    N)Counterdefaultdict
namedtuple)reduce)log)BigramCollocationFinder)MLE)padded_everygram_pipeline)BigramAssocMeasures	f_measure)ConditionalFreqDist)FreqDist)sent_tokenize)LazyConcatenation	tokenwrapConcordanceLineleftqueryrightoffset
left_printright_printlinec                   @   sT   e Zd ZdZedd Zdddd fddZd	d
 Zdd ZdddZ	dddZ
dS )ContextIndexa  
    A bidirectional index between words and their 'contexts' in a text.
    The context of a word is usually defined to be the words that occur
    in a fixed window around the word; but other definitions may also
    be used by providing a custom context function.
    c                 C   sH   |dkr| |d    nd}|t| d kr<| |d    nd}||fS )z;One left token and one right token, normalized to lowercaser      *START**END*)lowerlen)tokensir   r    r!   h/mounts/lovelace/software/anaconda3/envs/qiime2-amplicon-2024.2/lib/python3.8/site-packages/nltk/text.py_default_context.   s    $zContextIndex._default_contextNc                 C   s   | S Nr!   xr!   r!   r"   <lambda>5       zContextIndex.<lambda>c                    sv   |_ _|r|_nj_ r6 fddD tfddtD _tfddtD _d S )Nc                    s   g | ]} |r|qS r!   r!   ).0t)filterr!   r"   
<listcomp>=   s      z)ContextIndex.__init__.<locals>.<listcomp>c                 3   s(   | ] \}}  | |fV  qd S r$   )_key_context_funcr)   r    wselfr   r!   r"   	<genexpr>>   s    z(ContextIndex.__init__.<locals>.<genexpr>c                 3   s(   | ] \}}  | |fV  qd S r$   )r.   r-   r/   r1   r!   r"   r3   A   s    )r-   _tokensr.   r#   CFD	enumerate_word_to_contexts_context_to_words)r2   r   Zcontext_funcr+   keyr!   )r+   r2   r   r"   __init__5   s    
zContextIndex.__init__c                 C   s   | j S )zw
        :rtype: list(str)
        :return: The document that this context index was
            created from.
        r4   r2   r!   r!   r"   r   E   s    zContextIndex.tokensc                 C   sF   |  |}t| j| }i }| j D ]\}}t|t|||< q&|S )z
        Return a dictionary mapping from words to 'similarity scores,'
        indicating how often these two words occur in the same
        context.
        )r-   setr7   itemsr   )r2   wordZword_contextsscoresr0   Z
w_contextsr!   r!   r"   word_similarity_dictM   s    
z!ContextIndex.word_similarity_dict   c                 C   sv   t t}| j| | D ]D}| j| D ]4}||kr&||  | j| | | j| |  7  < q&qt||jddd | S )NT)r9   reverse)r   intr7   r-   r8   sortedget)r2   r?   nr@   cr0   r!   r!   r"   similar_words\   s    
zContextIndex.similar_wordsFc                    s   fddD fddD fddt tD }ttj |rf|rftddn& spt S t fddD }|S d	S )
a  
        Find contexts where the specified words can all appear; and
        return a frequency distribution mapping each context to the
        number of times that context was used.

        :param words: The words used to seed the similarity search
        :type words: str
        :param fail_on_unknown: If true, then raise a value error if
            any of the given words do not occur at all in the index.
        c                    s   g | ]}  |qS r!   )r-   r)   r0   r<   r!   r"   r,   q   s     z0ContextIndex.common_contexts.<locals>.<listcomp>c                    s   g | ]}t  j| qS r!   )r=   r7   rJ   r<   r!   r"   r,   r   s     c                    s   g | ]} | s| qS r!   r!   )r)   r    )contextswordsr!   r"   r,   s   s      z%The following word(s) were not found: c                 3   s*   | ]"}j | D ]}| kr|V  qqd S r$   )r7   r)   r0   rH   )commonr2   r!   r"   r3   {   s
       z/ContextIndex.common_contexts.<locals>.<genexpr>N)ranger   r   r=   intersection
ValueErrorjoinr   )r2   rL   Zfail_on_unknownemptyfdr!   )rO   rK   r2   rL   r"   common_contextsf   s    zContextIndex.common_contexts)rB   )F)__name__
__module____qualname____doc__staticmethodr#   r:   r   rA   rI   rV   r!   r!   r!   r"   r   &   s   


r   c                   @   sL   e Zd ZdZdd fddZdd Zdd	 Zd
d ZdddZdddZ	dS )ConcordanceIndexzs
    An index that can be used to look up the offset locations at which
    a given word occurs in a document.
    c                 C   s   | S r$   r!   r%   r!   r!   r"   r'      r(   zConcordanceIndex.<lambda>c                 C   sF   || _ || _tt| _t|D ]"\}}| |}| j| | qdS )a  
        Construct a new concordance index.

        :param tokens: The document (list of tokens) that this
            concordance index was created from.  This list can be used
            to access the context of a given word occurrence.
        :param key: A function that maps each token to a normalized
            version that will be used as a key in the index.  E.g., if
            you use ``key=lambda s:s.lower()``, then the index will be
            case-insensitive.
        N)r4   r-   r   list_offsetsr6   append)r2   r   r9   indexr?   r!   r!   r"   r:      s    

zConcordanceIndex.__init__c                 C   s   | j S )z{
        :rtype: list(str)
        :return: The document that this concordance index was
            created from.
        r;   r<   r!   r!   r"   r      s    zConcordanceIndex.tokensc                 C   s   |  |}| j| S )z
        :rtype: list(int)
        :return: A list of the offset positions at which the given
            word occurs.  If a key function was specified for the
            index, then given word's key will be looked up.
        )r-   r^   r2   r?   r!   r!   r"   offsets   s    
zConcordanceIndex.offsetsc                 C   s   dt | jt | jf S )Nz+<ConcordanceIndex for %d tokens (%d types)>)r   r4   r^   r<   r!   r!   r"   __repr__   s    zConcordanceIndex.__repr__P   c              	      s@  t |tr|}n|g}|td| d d }|d }g }| |d }t|dd D ].\ } fdd| |D }t||}qZ|r<|D ] d| j  t|  }	| jt	d |   }
| j t|  |  }d|
| d }d|d| }d||	|g}t
|
|	| |||}|| q|S )	z
        Find all concordance lines given the query word.

        Provided with a list of words, these will be found as a phrase.
        rM         r   r   Nc                    s   h | ]}|  d  qS )r   r!   )r)   r   r    r!   r"   	<setcomp>   s     z4ConcordanceIndex.find_concordance.<locals>.<setcomp>)
isinstancer]   r   rS   rb   r6   rE   rQ   r4   maxr   r_   )r2   r?   widthphraseZ
half_widthcontextconcordance_listrb   Zword_offsetsZ
query_wordZleft_contextZright_contextr   r   Z
line_printconcordance_liner!   rg   r"   find_concordance   s:    
	z!ConcordanceIndex.find_concordance   c                 C   sl   | j ||d}|std nLt|t|}td| dt| d t|d| D ]\}}t|j qTdS )a  
        Print concordance lines given the query word.
        :param word: The target word or phrase (a list of strings)
        :type word: str or list
        :param lines: The number of lines to display (default=25)
        :type lines: int
        :param width: The width of each line, in characters (default=80)
        :type width: int
        :param save: The option to save the concordance.
        :type save: bool
        )rk   z
no matcheszDisplaying z of z	 matches:N)rp   printminr   r6   r   )r2   r?   rk   linesrn   r    ro   r!   r!   r"   print_concordance   s    
z"ConcordanceIndex.print_concordanceN)rd   )rd   rq   )
rW   rX   rY   rZ   r:   r   rb   rc   rp   ru   r!   r!   r!   r"   r\      s   

,r\   c                   @   s    e Zd ZdZdd Zdd ZdS )TokenSearchera  
    A class that makes it easier to use regular expressions to search
    over tokenized strings.  The tokenized string is converted to a
    string where tokens are marked with angle brackets -- e.g.,
    ``'<the><window><is><still><open>'``.  The regular expression
    passed to the ``findall()`` method is modified to treat angle
    brackets as non-capturing parentheses, in addition to matching the
    token boundaries; and to have ``'.'`` not match the angle brackets.
    c                 C   s   d dd |D | _d S )N c                 s   s   | ]}d | d V  qdS )<>Nr!   rJ   r!   r!   r"   r3     s     z)TokenSearcher.__init__.<locals>.<genexpr>)rS   _rawr1   r!   r!   r"   r:     s    zTokenSearcher.__init__c                 C   s~   t dd|}t dd|}t dd|}t dd|}t || j}|D ] }|dsJ|drJtd	qJd
d |D }|S )a  
        Find instances of the regular expression in the text.
        The text is a list of tokens, and a regexp pattern to match
        a single token must be surrounded by angle brackets.  E.g.

        >>> from nltk.text import TokenSearcher
        >>> from nltk.book import text1, text5, text9
        >>> text5.findall("<.*><.*><bro>")
        you rule bro; telling you bro; u twizted bro
        >>> text1.findall("<a>(<.*>)<man>")
        monied; nervous; dangerous; white; white; white; pious; queer; good;
        mature; white; Cape; great; wise; wise; butterless; white; fiendish;
        pale; furious; better; certain; complete; dismasted; younger; brave;
        brave; brave; brave
        >>> text9.findall("<th.*>{3,}")
        thread through those; the thought that; that the thing; the thing
        that; that that thing; through these than through; them that the;
        through the thick; them that they; thought that the

        :param regexp: A regular expression
        :type regexp: str
        z\srw   rx   z(?:<(?:ry   z)>)z	(?<!\\)\.z[^>]z$Bad regexp for TokenSearcher.findallc                 S   s   g | ]}|d d  dqS )r   z><splitr)   hr!   r!   r"   r,   0  s     z)TokenSearcher.findall.<locals>.<listcomp>)resubfindallrz   
startswithendswithrR   )r2   regexphitsr   r!   r!   r"   r   
  s    
zTokenSearcher.findallN)rW   rX   rY   rZ   r:   r   r!   r!   r!   r"   rv      s   
rv   c                   @   s   e Zd ZdZdZd6ddZdd Zdd	 Zd7ddZd8ddZ	d9ddZ
d:ddZdd Zdd Zdd Zd;ddZd<ddZd d! Zd=d#d$Zd>d'd(Zd)d* Zd+d, Zd-d. Zed/Zd0d1 Zd2d3 Zd4d5 ZdS )?Texta  
    A wrapper around a sequence of simple (string) tokens, which is
    intended to support initial exploration of texts (via the
    interactive console).  Its methods perform a variety of analyses
    on the text's contexts (e.g., counting, concordancing, collocation
    discovery), and display the results.  If you wish to write a
    program which makes use of these analyses, then you should bypass
    the ``Text`` class, and use the appropriate analysis function or
    class directly instead.

    A ``Text`` is typically initialized from a given document or
    corpus.  E.g.:

    >>> import nltk.corpus
    >>> from nltk.text import Text
    >>> moby = Text(nltk.corpus.gutenberg.words('melville-moby_dick.txt'))

    TNc                 C   s   | j rt|}|| _|r || _ndd|dd krb|dd d}ddd |d| D | _n"ddd |dd	 D d
 | _dS )zv
        Create a Text object.

        :param tokens: The source text.
        :type tokens: sequence of str
        ]NrB   rM   c                 s   s   | ]}t |V  qd S r$   strr)   tokr!   r!   r"   r3   ]  s     z Text.__init__.<locals>.<genexpr>r   c                 s   s   | ]}t |V  qd S r$   r   r   r!   r!   r"   r3   _  s        z...)_COPY_TOKENSr]   r   namer`   rS   )r2   r   r   endr!   r!   r"   r:   N  s     zText.__init__c                 C   s
   | j | S r$   )r   )r2   r    r!   r!   r"   __getitem__e  s    zText.__getitem__c                 C   s
   t | jS r$   )r   r   r<   r!   r!   r"   __len__h  s    zText.__len__O   rq   c                 C   s.   d| j krt| jdd d| _| j|||S )a  
        Prints a concordance for ``word`` with the specified context window.
        Word matching is not case-sensitive.

        :param word: The target word or phrase (a list of strings)
        :type word: str or list
        :param width: The width of each line, in characters (default=80)
        :type width: int
        :param lines: The number of lines to display (default=25)
        :type lines: int

        :seealso: ``ConcordanceIndex``
        _concordance_indexc                 S   s   |   S r$   r   sr!   r!   r"   r'     r(   z"Text.concordance.<locals>.<lambda>r9   )__dict__r\   r   r   ru   r2   r?   rk   rt   r!   r!   r"   concordanceo  s    
 zText.concordancec                 C   s4   d| j krt| jdd d| _| j||d| S )a  
        Generate a concordance for ``word`` with the specified context window.
        Word matching is not case-sensitive.

        :param word: The target word or phrase (a list of strings)
        :type word: str or list
        :param width: The width of each line, in characters (default=80)
        :type width: int
        :param lines: The number of lines to display (default=25)
        :type lines: int

        :seealso: ``ConcordanceIndex``
        r   c                 S   s   |   S r$   r   r   r!   r!   r"   r'     r(   z'Text.concordance_list.<locals>.<lambda>r   N)r   r\   r   r   rp   r   r!   r!   r"   rn     s    
 zText.concordance_listrB   re   c                    s   d| j kr| j|kr| j|ks|| _|| _ddlm} |d t| j|}|	d |
 fdd t }t||j|| _| jS )a  
        Return collocations derived from the text, ignoring stopwords.

            >>> from nltk.book import text4
            >>> text4.collocation_list()[:2]
            [('United', 'States'), ('fellow', 'citizens')]

        :param num: The maximum number of collocations to return.
        :type num: int
        :param window_size: The number of tokens spanned by a collocation (default=2)
        :type window_size: int
        :rtype: list(tuple(str, str))
        _collocationsr   )	stopwordsenglishre   c                    s   t | dk p|   kS )N   )r   r   )r0   Zignored_wordsr!   r"   r'     r(   z'Text.collocation_list.<locals>.<lambda>)r   Z_numZ_window_sizenltk.corpusr   rL   r   Z
from_wordsr   Zapply_freq_filterZapply_word_filterr
   r]   ZnbestZlikelihood_ratior   )r2   numwindow_sizer   finderZbigram_measuresr!   r   r"   collocation_list  s$    

zText.collocation_listc                 C   s*   dd |  ||D }tt|dd dS )a  
        Print collocations derived from the text, ignoring stopwords.

            >>> from nltk.book import text4
            >>> text4.collocations() # doctest: +NORMALIZE_WHITESPACE
            United States; fellow citizens; years ago; four years; Federal
            Government; General Government; American people; Vice President; God
            bless; Chief Justice; one another; fellow Americans; Old World;
            Almighty God; Fellow citizens; Chief Magistrate; every citizen; Indian
            tribes; public debt; foreign nations


        :param num: The maximum number of collocations to print.
        :type num: int
        :param window_size: The number of tokens spanned by a collocation (default=2)
        :type window_size: int
        c                 S   s   g | ]\}}|d  | qS rM   r!   r)   Zw1Zw2r!   r!   r"   r,     s    z%Text.collocations.<locals>.<listcomp>; )	separatorN)r   rr   r   )r2   r   r   Zcollocation_stringsr!   r!   r"   collocations  s    
zText.collocationsc                 C   s   | j |S )zJ
        Count the number of times this word appears in the text.
        )r   countra   r!   r!   r"   r     s    z
Text.countc                 C   s   | j |S )zQ
        Find the index of the first occurrence of the word in the text.
        )r   r`   ra   r!   r!   r"   r`     s    z
Text.indexc                 C   s   t d S r$   )NotImplementedError)r2   methodr!   r!   r"   readability  s    zText.readabilityc                    s   d| j kr$t| jdd dd d| _ | jj krt  t fdd D }dd	 |	|D }t
t| nt
d
 dS )a~  
        Distributional similarity: find other words which appear in the
        same contexts as the specified word; list most similar words first.

        :param word: The word used to seed the similarity search
        :type word: str
        :param num: The number of words to generate (default=20)
        :type num: int
        :seealso: ContextIndex.similar_words()
        _word_context_indexc                 S   s   |   S r$   )isalphar%   r!   r!   r"   r'     r(   zText.similar.<locals>.<lambda>c                 S   s   |   S r$   r   r   r!   r!   r"   r'     r(   )r+   r9   c                 3   s0   | ](}| D ]}| kr|ks|V  qqd S r$   r!   rN   rK   Zwcir?   r!   r"   r3     s   
  zText.similar.<locals>.<genexpr>c                 S   s   g | ]\}}|qS r!   r!   r)   r0   _r!   r!   r"   r,      s     z Text.similar.<locals>.<listcomp>z
No matchesN)r   r   r   r   r   r7   Z
conditionsr=   r   most_commonrr   r   )r2   r?   r   rU   rL   r!   r   r"   similar  s     
  zText.similarc              
   C   s   d| j krt| jdd d| _zJ| j|d}|s<td n*dd ||D }ttd	d
 |D  W n* tk
r } zt| W 5 d}~X Y nX dS )aY  
        Find contexts where the specified words appear; list
        most frequent common contexts first.

        :param words: The words used to seed the similarity search
        :type words: str
        :param num: The number of words to generate (default=20)
        :type num: int
        :seealso: ContextIndex.common_contexts()
        r   c                 S   s   |   S r$   r   r   r!   r!   r"   r'     r(   z&Text.common_contexts.<locals>.<lambda>r   TzNo common contexts were foundc                 S   s   g | ]\}}|qS r!   r!   r   r!   r!   r"   r,     s     z(Text.common_contexts.<locals>.<listcomp>c                 s   s   | ]\}}|d  | V  qdS )r   Nr!   r   r!   r!   r"   r3     s     z'Text.common_contexts.<locals>.<genexpr>N)	r   r   r   r   rV   rr   r   r   rR   )r2   rL   r   rU   Zranked_contextser!   r!   r"   rV     s    
 
zText.common_contextsc                 C   s   ddl m} || | dS )z
        Produce a plot showing the distribution of the words through the text.
        Requires pylab to be installed.

        :param words: The words to be plotted
        :type words: list(str)
        :seealso: nltk.draw.dispersion_plot()
        r   )dispersion_plotN)Z	nltk.drawr   )r2   rL   r   r!   r!   r"   r   !  s    	zText.dispersion_plotr   c                 C   s(   t ||\}}t|d}||| |S )N)order)r	   r   fit)r2   Ztokenized_sentsrG   Z
train_dataZpadded_sentsmodelr!   r!   r"   _train_default_ngram_lm.  s    
zText._train_default_ngram_lmd   *   c           	      C   s   dd t d| jD | _t| dsFtdtjd | j| jdd| _	g }|d	ksZt
d
t||k rt| j	j|||dD ](\}}|dkrq||dkr q|| q||d7 }qZ|rd|d nd}|t|d|  }t| |S )a  
        Print random text, generated using a trigram language model.
        See also `help(nltk.lm)`.

        :param length: The length of text to generate (default=100)
        :type length: int

        :param text_seed: Generation can be conditioned on preceding context.
        :type text_seed: list(str)

        :param random_seed: A random seed or an instance of `random.Random`. If provided,
            makes the random sampling part of generation reproducible. (default=42)
        :type random_seed: int
        c                 S   s   g | ]}| d qS r   r|   )r)   sentr!   r!   r"   r,   D  s    z!Text.generate.<locals>.<listcomp>rM   _trigram_modelzBuilding ngram index...)filer   )rG   r   z!The `length` must be more than 0.)	text_seedrandom_seedz<s>z</s>r   rw   N)r   rS   r   Z_tokenized_sentshasattrrr   sysstderrr   r   AssertionErrorr   r6   generater_   r   )	r2   lengthr   r   Zgenerated_tokensidxtokenprefixZ
output_strr!   r!   r"   r   4  s:    
   
zText.generatec                 G   s   |   j| S )zc
        See documentation for FreqDist.plot()
        :seealso: nltk.prob.FreqDist.plot()
        )vocabplot)r2   argsr!   r!   r"   r   b  s    z	Text.plotc                 C   s   d| j krt| | _| jS )z.
        :seealso: nltk.prob.FreqDist
        _vocab)r   r   r   r<   r!   r!   r"   r   i  s    

z
Text.vocabc                 C   s@   d| j krt| | _| j|}dd |D }tt|d dS )a  
        Find instances of the regular expression in the text.
        The text is a list of tokens, and a regexp pattern to match
        a single token must be surrounded by angle brackets.  E.g.

        >>> from nltk.book import text1, text5, text9
        >>> text5.findall("<.*><.*><bro>")
        you rule bro; telling you bro; u twizted bro
        >>> text1.findall("<a>(<.*>)<man>")
        monied; nervous; dangerous; white; white; white; pious; queer; good;
        mature; white; Cape; great; wise; wise; butterless; white; fiendish;
        pale; furious; better; certain; complete; dismasted; younger; brave;
        brave; brave; brave
        >>> text9.findall("<th.*>{3,}")
        thread through those; the thought that; that the thing; the thing
        that; that that thing; through these than through; them that the;
        through the thick; them that they; thought that the

        :param regexp: A regular expression
        :type regexp: str
        _token_searcherc                 S   s   g | ]}d  |qS r   )rS   r~   r!   r!   r"   r,     s     z Text.findall.<locals>.<listcomp>r   N)r   rv   r   r   rr   r   )r2   r   r   r!   r!   r"   r   r  s
    

zText.findallz\w+|[\.\!\?]c                 C   s   |d }|dkr*| j || s*|d8 }q|dkr:|| nd}|d }|t|k rl| j || sl|d7 }qF|t|kr|| nd}||fS )z
        One left & one right token, both case-normalized.  Skip over
        non-sentence-final punctuation.  Used by the ``ContextIndex``
        that is created for ``similar()`` and ``common_contexts()``.
        r   r   r   r   )_CONTEXT_REmatchr   )r2   r   r    jr   r   r!   r!   r"   _context  s    

zText._contextc                 C   s
   d| j  S Nz
<Text: %s>r   r<   r!   r!   r"   __str__  s    zText.__str__c                 C   s
   d| j  S r   r   r<   r!   r!   r"   rc     s    zText.__repr__)N)r   rq   )r   rq   )rB   re   )rB   re   )rB   )rB   )r   )r   Nr   )rW   rX   rY   rZ   r   r:   r   r   r   rn   r   r   r   r`   r   r   rV   r   r   r   r   r   r   r   compiler   r   r   rc   r!   r!   r!   r"   r   4  s0   



#

"


.	"
r   c                   @   s0   e Zd ZdZdd Zdd Zdd Zdd	 Zd
S )TextCollectiona;  A collection of texts, which can be loaded with list of texts, or
    with a corpus consisting of one or more texts, and which supports
    counting, concordancing, collocation discovery, etc.  Initialize a
    TextCollection as follows:

    >>> import nltk.corpus
    >>> from nltk.text import TextCollection
    >>> from nltk.book import text1, text2, text3
    >>> gutenberg = TextCollection(nltk.corpus.gutenberg)
    >>> mytexts = TextCollection([text1, text2, text3])

    Iterating over a TextCollection produces all the tokens of all the
    texts in order.
    c                    s@   t  dr  fdd  D   | _t| t  i | _d S )NrL   c                    s   g | ]}  |qS r!   )rL   )r)   fsourcer!   r"   r,     s     z+TextCollection.__init__.<locals>.<listcomp>)r   Zfileids_textsr   r:   r   
_idf_cache)r2   r   r!   r   r"   r:     s
    
zTextCollection.__init__c                 C   s   | |t| S )z"The frequency of the term in text.)r   r   r2   termtextr!   r!   r"   tf  s    zTextCollection.tfc                    sj   | j  }|dkrft fdd| jD }t| jdkrBtd|rXtt| j| nd}|| j  < |S )zThe number of texts in the corpus divided by the
        number of texts that the term appears in.
        If a term does not appear in the corpus, 0.0 is returned.Nc                    s   g | ]} |krd qS )Tr!   )r)   r   r   r!   r"   r,     s      z&TextCollection.idf.<locals>.<listcomp>r   z+IDF undefined for empty document collectiong        )r   rF   r   r   rR   r   )r2   r   idfmatchesr!   r   r"   r     s    
zTextCollection.idfc                 C   s   |  ||| | S r$   )r   r   r   r!   r!   r"   tf_idf  s    zTextCollection.tf_idfN)rW   rX   rY   rZ   r:   r   r   r   r!   r!   r!   r"   r     s
   r   c                  C   s   ddl m}  t| jdd}t| t  td |d t  td |d t  td |  t  td |dd	d
dg t  td |	d t  td td|d  td|dd  td|
 d  d S )Nr   )brownnews)
categorieszConcordance:zDistributionally similar words:zCollocations:zDispersion plot:reportZsaidZ	announcedzVocabulary plot:2   z	Indexing:ztext[3]:r   z
text[3:5]:   ztext.vocab()['news']:)r   r   r   rL   rr   r   r   r   r   r   r   )r   r   r!   r!   r"   demo  s.    


r   __main__)&rZ   r   r   collectionsr   r   r   	functoolsr   mathr   Znltk.collocationsr   Znltk.lmr   Znltk.lm.preprocessingr	   Znltk.metricsr
   r   Znltk.probabilityr   r5   r   Znltk.tokenizer   Z	nltk.utilr   r   r   r   r\   rv   r   r   r   rW   __all__r!   r!   r!   r"   <module>	   sD   [{8   .
