
"""
This module brings together a variety of NLTK functionality for
text analysis, and provides simple, interactive interfaces.
Functionality includes: concordancing, collocation discovery,
regular expression search over tokenized strings, and
distributional similarity.
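
A typical interactive session looks roughly like the following (an
illustrative sketch, not a tested doctest; it assumes the relevant corpus
data has already been downloaded with ``nltk.download()``):

    >>> from nltk.corpus import gutenberg
    >>> from nltk.text import Text
    >>> moby = Text(gutenberg.words('melville-moby_dick.txt'))  # doctest: +SKIP
    >>> moby.concordance('monstrous', lines=2)                  # doctest: +SKIP
    >>> moby.similar('whale', num=10)                           # doctest: +SKIP
    >>> moby.collocations(num=10)                               # doctest: +SKIP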
    N)Counterdefaultdict
namedtuple)reduce)log)BigramCollocationFinder)MLE)padded_everygram_pipeline)BigramAssocMeasures	f_measure)ConditionalFreqDist)FreqDist)sent_tokenize)LazyConcatenation	tokenwrapConcordanceLine)leftqueryrightoffset
left_printright_printlinec                   T    e Zd ZdZed             Zddd fdZd Zd Zdd	Z	ddZ
dS )ContextIndexa  
    """
    A bidirectional index between words and their 'contexts' in a text.
    The context of a word is usually defined to be the words that occur
    in a fixed window around the word; but other definitions may also
    be used by providing a custom context function.
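
    Example (an illustrative sketch; the token list here is hypothetical):

        >>> tokens = ['The', 'cat', 'sat', 'on', 'the', 'mat', '.']
        >>> index = ContextIndex(tokens, key=lambda s: s.lower())   # doctest: +SKIP
        >>> index.similar_words('cat')                              # doctest: +SKIP
        >>> index.common_contexts(['cat', 'mat'])                   # doctest: +SKIP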
    """

    @staticmethod
    def _default_context(tokens, i):
        """One left token and one right token, normalized to lowercase"""
        left = tokens[i - 1].lower() if i != 0 else "*START*"
        right = tokens[i + 1].lower() if i != len(tokens) - 1 else "*END*"
        return (left, right)

    def __init__(self, tokens, context_func=None, filter=None, key=lambda x: x):
        self._key = key
        self._tokens = tokens
        if context_func:
            self._context_func = context_func
        else:
            self._context_func = self._default_context
        if filter:
            tokens = [t for t in tokens if filter(t)]
        self._word_to_contexts = CFD(
            (self._key(w), self._context_func(tokens, i)) for i, w in enumerate(tokens)
        )
        self._context_to_words = CFD(
            (self._context_func(tokens, i), self._key(w)) for i, w in enumerate(tokens)
        )

    def tokens(self):
        """
        :rtype: list(str)
        :return: The document that this context index was
            created from.
        """
        return self._tokens

    def word_similarity_dict(self, word):
        """
        Return a dictionary mapping from words to 'similarity scores,'
        indicating how often these two words occur in the same
        context.
        """
        word = self._key(word)
        word_contexts = set(self._word_to_contexts[word])

        scores = {}
        for w, w_contexts in self._word_to_contexts.items():
            scores[w] = f_measure(word_contexts, set(w_contexts))

        return scores

    def similar_words(self, word, n=20):
        scores = defaultdict(int)
        for c in self._word_to_contexts[self._key(word)]:
            for w in self._context_to_words[c]:
                if w != word:
                    scores[w] += (
                        self._context_to_words[c][word] * self._context_to_words[c][w]
                    )
        return sorted(scores, key=scores.get, reverse=True)[:n]

    def common_contexts(self, words, fail_on_unknown=False):
        """
        Find contexts where the specified words can all appear; and
        return a frequency distribution mapping each context to the
        number of times that context was used.

        :param words: The words used to seed the similarity search
        :type words: str
        :param fail_on_unknown: If true, then raise a value error if
            any of the given words do not occur at all in the index.
        """
        words = [self._key(w) for w in words]
        contexts = [set(self._word_to_contexts[w]) for w in words]
        empty = [words[i] for i in range(len(words)) if not contexts[i]]
        common = reduce(set.intersection, contexts)
        if empty and fail_on_unknown:
            raise ValueError("The following word(s) were not found:", " ".join(words))
        elif not common:
            # nothing in common -- just return an empty freqdist.
            return FreqDist()
        else:
            fd = FreqDist(
                c for w in words for c in self._word_to_contexts[w] if c in common
            )
            return fd


class ConcordanceIndex:
    """
    An index that can be used to look up the offset locations at which
    a given word occurs in a document.
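
    Example (an illustrative sketch; ``tokens`` stands for any list of words,
    e.g. the output of a corpus reader's ``words()`` method):

        >>> ci = ConcordanceIndex(tokens, key=lambda s: s.lower())  # doctest: +SKIP
        >>> ci.offsets('monstrous')                                 # doctest: +SKIP
        >>> ci.print_concordance('monstrous', width=60, lines=3)    # doctest: +SKIP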
    """

    def __init__(self, tokens, key=lambda x: x):
        """
        Construct a new concordance index.

        :param tokens: The document (list of tokens) that this
            concordance index was created from.  This list can be used
            to access the context of a given word occurrence.
        :param key: A function that maps each token to a normalized
            version that will be used as a key in the index.  E.g., if
            you use ``key=lambda s:s.lower()``, then the index will be
            case-insensitive.
        """
        self._tokens = tokens
        """The document (list of tokens) that this concordance index
           was created from."""

        self._key = key
        """Function mapping each token to an index key (or None)."""

        self._offsets = defaultdict(list)
        """Dictionary mapping words (or keys) to lists of offset indices."""

        # Initialize the index (self._offsets)
        for index, word in enumerate(tokens):
            word = self._key(word)
            self._offsets[word].append(index)

    def tokens(self):
        """
        :rtype: list(str)
        :return: The document that this concordance index was
            created from.
        """
        return self._tokens

    def offsets(self, word):
        """
        :rtype: list(int)
        :return: A list of the offset positions at which the given
            word occurs.  If a key function was specified for the
            index, then given word's key will be looked up.
        """
        word = self._key(word)
        return self._offsets[word]

    def __repr__(self):
        return "<ConcordanceIndex for %d tokens (%d types)>" % (
            len(self._tokens),
            len(self._offsets),
        )

    def find_concordance(self, word, width=80):
        """
        Find all concordance lines given the query word.

        Provided with a list of words, these will be found as a phrase.
        """
        if isinstance(word, list):
            phrase = word
        else:
            phrase = [word]

        half_width = (width - len(" ".join(phrase)) - 2) // 2
        context = width // 4  # approx number of words of context

        # Find the instances of the word to create the ConcordanceLine
        concordance_list = []
        offsets = self.offsets(phrase[0])
        for i, word in enumerate(phrase[1:]):
            word_offsets = {offset - i - 1 for offset in self.offsets(word)}
            offsets = sorted(word_offsets.intersection(offsets))
        if offsets:
            for i in offsets:
                query_word = " ".join(self._tokens[i : i + len(phrase)])
                # Find the context of the query word.
                left_context = self._tokens[max(0, i - context) : i]
                right_context = self._tokens[i + len(phrase) : i + context]
                # Create the pretty lines with the query_word in the middle.
                left_print = " ".join(left_context)[-half_width:]
                right_print = " ".join(right_context)[:half_width]
                # The WYSIWYG line of the concordance.
                line_print = " ".join([left_print, query_word, right_print])
                # Create the ConcordanceLine
                concordance_line = ConcordanceLine(
                    left_context,
                    query_word,
                    right_context,
                    i,
                    left_print,
                    right_print,
                    line_print,
                )
                concordance_list.append(concordance_line)
        return concordance_list

    def print_concordance(self, word, width=80, lines=25):
        """
        Print concordance lines given the query word.
        :param word: The target word or phrase (a list of strings)
        :type word: str or list
        :param lines: The number of lines to display (default=25)
        :type lines: int
        :param width: The width of each line, in characters (default=80)
        :type width: int
        :param save: The option to save the concordance.
        :type save: bool
        """
        concordance_list = self.find_concordance(word, width=width)

        if not concordance_list:
            print("no matches")
        else:
            lines = min(lines, len(concordance_list))
            print(f"Displaying {lines} of {len(concordance_list)} matches:")
            for i, concordance_line in enumerate(concordance_list[:lines]):
                print(concordance_line.line)


class TokenSearcher:
    """
    A class that makes it easier to use regular expressions to search
    over tokenized strings.  The tokenized string is converted to a
    string where tokens are marked with angle brackets -- e.g.,
    ``'<the><window><is><still><open>'``.  The regular expression
    passed to the ``findall()`` method is modified to treat angle
    brackets as non-capturing parentheses, in addition to matching the
    token boundaries; and to have ``'.'`` not match the angle brackets.
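
    Example (an illustrative sketch; ``tokens`` is any list of word strings):

        >>> searcher = TokenSearcher(tokens)        # doctest: +SKIP
        >>> searcher.findall("<a>(<.*>)<man>")      # doctest: +SKIP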
    """

    def __init__(self, tokens):
        self._raw = "".join("<" + w + ">" for w in tokens)

    def findall(self, regexp):
        """
        Find instances of the regular expression in the text.
        The text is a list of tokens, and a regexp pattern to match
        a single token must be surrounded by angle brackets.  E.g.

        >>> from nltk.text import TokenSearcher
        >>> from nltk.book import text1, text5, text9
        >>> text5.findall("<.*><.*><bro>")
        you rule bro; telling you bro; u twizted bro
        >>> text1.findall("<a>(<.*>)<man>")
        monied; nervous; dangerous; white; white; white; pious; queer; good;
        mature; white; Cape; great; wise; wise; butterless; white; fiendish;
        pale; furious; better; certain; complete; dismasted; younger; brave;
        brave; brave; brave
        >>> text9.findall("<th.*>{3,}")
        thread through those; the thought that; that the thing; the thing
        that; that that thing; through these than through; them that the;
        through the thick; them that they; thought that the

        :param regexp: A regular expression
        :type regexp: str
        """
        # preprocess the regular expression
        regexp = re.sub(r"\s", "", regexp)
        regexp = re.sub(r"<", "(?:<(?:", regexp)
        regexp = re.sub(r">", ")>)", regexp)
        regexp = re.sub(r"(?<!\\)\.", "[^>]", regexp)

        # perform the search
        hits = re.findall(regexp, self._raw)

        # sanity check
        for h in hits:
            if not h.startswith("<") and h.endswith(">"):
                raise ValueError("Bad regexp for TokenSearcher.findall")

        # postprocess the output
        hits = [h[1:-1].split("><") for h in hits]
        return hits


class Text:
    """
    A wrapper around a sequence of simple (string) tokens, which is
    intended to support initial exploration of texts (via the
    interactive console).  Its methods perform a variety of analyses
    on the text's contexts (e.g., counting, concordancing, collocation
    discovery), and display the results.  If you wish to write a
    program which makes use of these analyses, then you should bypass
    the ``Text`` class, and use the appropriate analysis function or
    class directly instead.

    A ``Text`` is typically initialized from a given document or
    corpus.  E.g.:

    >>> import nltk.corpus
    >>> from nltk.text import Text
    >>> moby = Text(nltk.corpus.gutenberg.words('melville-moby_dick.txt'))
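
    The analysis methods are then called directly on that object (an
    illustrative sketch; output is omitted because it depends on the
    downloaded corpus data):

    >>> moby.concordance('monstrous', lines=2)    # doctest: +SKIP
    >>> moby.similar('whale', num=10)             # doctest: +SKIP
    >>> moby.common_contexts(['whale', 'ship'])   # doctest: +SKIP
    >>> moby.dispersion_plot(['whale', 'Ahab'])   # doctest: +SKIP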

    """

    # Copying the tokens up front defeats lazy corpus loading, but makes
    # repeated analyses on the same text noticeably faster.
    _COPY_TOKENS = True

    def __init__(self, tokens, name=None):
        """
        Create a Text object.

        :param tokens: The source text.
        :type tokens: sequence of str
        """
        if self._COPY_TOKENS:
            tokens = list(tokens)
        self.tokens = tokens

        if name:
            self.name = name
        elif "]" in tokens[:20]:
            end = tokens[:20].index("]")
            self.name = " ".join(str(tok) for tok in tokens[1:end])
        else:
            self.name = " ".join(str(tok) for tok in tokens[:8]) + "..."

    # ////////////////////////////////////////////////////////////
    # Support item & slice access
    # ////////////////////////////////////////////////////////////

    def __getitem__(self, i):
        return self.tokens[i]

    def __len__(self):
        return len(self.tokens)

    # ////////////////////////////////////////////////////////////
    # Interactive console methods
    # ////////////////////////////////////////////////////////////

    def concordance(self, word, width=79, lines=25):
        """
        Prints a concordance for ``word`` with the specified context window.
        Word matching is not case-sensitive.

        :param word: The target word or phrase (a list of strings)
        :type word: str or list
        :param width: The width of each line, in characters (default=80)
        :type width: int
        :param lines: The number of lines to display (default=25)
        :type lines: int

        :seealso: ``ConcordanceIndex``
        """
        if "_concordance_index" not in self.__dict__:
            self._concordance_index = ConcordanceIndex(
                self.tokens, key=lambda s: s.lower()
            )

        return self._concordance_index.print_concordance(word, width, lines)

    def concordance_list(self, word, width=79, lines=25):
        """
        Generate a concordance for ``word`` with the specified context window.
        Word matching is not case-sensitive.

        :param word: The target word or phrase (a list of strings)
        :type word: str or list
        :param width: The width of each line, in characters (default=80)
        :type width: int
        :param lines: The number of lines to display (default=25)
        :type lines: int

        :seealso: ``ConcordanceIndex``
        """
        if "_concordance_index" not in self.__dict__:
            self._concordance_index = ConcordanceIndex(
                self.tokens, key=lambda s: s.lower()
            )
        return self._concordance_index.find_concordance(word, width)[:lines]

    def collocation_list(self, num=20, window_size=2):
        """
        Return collocations derived from the text, ignoring stopwords.

            >>> from nltk.book import text4
            >>> text4.collocation_list()[:2]
            [('United', 'States'), ('fellow', 'citizens')]

        :param num: The maximum number of collocations to return.
        :type num: int
        :param window_size: The number of tokens spanned by a collocation (default=2)
        :type window_size: int
        :rtype: list(tuple(str, str))
        """
        if not (
            "_collocations" in self.__dict__
            and self._num == num
            and self._window_size == window_size
        ):
            self._num = num
            self._window_size = window_size

            from nltk.corpus import stopwords

            ignored_words = stopwords.words("english")
            finder = BigramCollocationFinder.from_words(self.tokens, window_size)
            finder.apply_freq_filter(2)
            finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
            bigram_measures = BigramAssocMeasures()
            self._collocations = list(
                finder.nbest(bigram_measures.likelihood_ratio, num)
            )
        return self._collocations

    def collocations(self, num=20, window_size=2):
        """
        Print collocations derived from the text, ignoring stopwords.

            >>> from nltk.book import text4
            >>> text4.collocations() # doctest: +NORMALIZE_WHITESPACE
            United States; fellow citizens; years ago; four years; Federal
            Government; General Government; American people; Vice President; God
            bless; Chief Justice; one another; fellow Americans; Old World;
            Almighty God; Fellow citizens; Chief Magistrate; every citizen; Indian
            tribes; public debt; foreign nations


        :param num: The maximum number of collocations to print.
        :type num: int
        :param window_size: The number of tokens spanned by a collocation (default=2)
        :type window_size: int
        """
        collocation_strings = [
            w1 + " " + w2 for w1, w2 in self.collocation_list(num, window_size)
        ]
        print(tokenwrap(collocation_strings, separator="; "))

    def count(self, word):
        """
        Count the number of times this word appears in the text.
        """
        return self.tokens.count(word)

    def index(self, word):
        """
        Find the index of the first occurrence of the word in the text.
        """
        return self.tokens.index(word)

    def readability(self, method):
        raise NotImplementedError

    def similar(self, word, num=20):
        """
        Distributional similarity: find other words which appear in the
        same contexts as the specified word; list most similar words first.

        :param word: The word used to seed the similarity search
        :type word: str
        :param num: The number of words to generate (default=20)
        :type num: int
        :seealso: ContextIndex.similar_words()
        """
        if "_word_context_index" not in self.__dict__:
            self._word_context_index = ContextIndex(
                self.tokens, filter=lambda x: x.isalpha(), key=lambda s: s.lower()
            )

        word = word.lower()
        wci = self._word_context_index._word_to_contexts
        if word in wci.conditions():
            contexts = set(wci[word])
            fd = Counter(
                w
                for w in wci.conditions()
                for c in wci[w]
                if c in contexts and not w == word
            )
            words = [w for w, _ in fd.most_common(num)]
            print(tokenwrap(words))
        else:
            print("No matches")

    def common_contexts(self, words, num=20):
        """
        Find contexts where the specified words appear; list
        most frequent common contexts first.

        :param words: The words used to seed the similarity search
        :type words: str
        :param num: The number of words to generate (default=20)
        :type num: int
        :seealso: ContextIndex.common_contexts()
        """
        if "_word_context_index" not in self.__dict__:
            self._word_context_index = ContextIndex(
                self.tokens, key=lambda s: s.lower()
            )

        try:
            fd = self._word_context_index.common_contexts(words, True)
            if not fd:
                print("No common contexts were found")
            else:
                ranked_contexts = [w for w, _ in fd.most_common(num)]
                print(tokenwrap(w1 + "_" + w2 for w1, w2 in ranked_contexts))

        except ValueError as e:
            print(e)

    def dispersion_plot(self, words):
        """
        Produce a plot showing the distribution of the words through the text.
        Requires pylab to be installed.

        :param words: The words to be plotted
        :type words: list(str)
        :seealso: nltk.draw.dispersion_plot()
        """
        from nltk.draw import dispersion_plot

        dispersion_plot(self, words)

    def _train_default_ngram_lm(self, tokenized_sents, n=3):
        train_data, padded_sents = padded_everygram_pipeline(n, tokenized_sents)
        model = MLE(order=n)
        model.fit(train_data, padded_sents)
        return model

    def generate(self, length=100, text_seed=None, random_seed=42):
        """
        Print random text, generated using a trigram language model.
        See also `help(nltk.lm)`.

        :param length: The length of text to generate (default=100)
        :type length: int

        :param text_seed: Generation can be conditioned on preceding context.
        :type text_seed: list(str)

        :param random_seed: A random seed or an instance of `random.Random`. If provided,
            makes the random sampling part of generation reproducible. (default=42)
        :type random_seed: int
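
        Example (an illustrative sketch, assuming ``text`` is any ``Text``
        instance; the generated output depends on the underlying tokens and
        is therefore not shown):

            >>> text.generate(length=20, text_seed=['The'], random_seed=3)  # doctest: +SKIP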
        """
        # Create the model when using it the first time.
        self._tokenized_sents = [
            sent.split(" ") for sent in sent_tokenize(" ".join(self.tokens))
        ]
        if not hasattr(self, "_trigram_model"):
            print("Building ngram index...", file=sys.stderr)
            self._trigram_model = self._train_default_ngram_lm(
                self._tokenized_sents, n=3
            )

        generated_tokens = []

        assert length > 0, "The `length` must be more than 0."
        while len(generated_tokens) < length:
            for idx, token in enumerate(
                self._trigram_model.generate(
                    length, text_seed=text_seed, random_seed=random_seed
                )
            ):
                if token == "<s>":
                    continue
                if token == "</s>":
                    break
                generated_tokens.append(token)
            random_seed += 1

        prefix = " ".join(text_seed) + " " if text_seed else ""
        output_str = prefix + tokenwrap(generated_tokens[:length])
        print(output_str)
        return output_str

    def plot(self, *args):
        """
        See documentation for FreqDist.plot()
        :seealso: nltk.prob.FreqDist.plot()
        """
        return self.vocab().plot(*args)

    def vocab(self):
        """
        :seealso: nltk.prob.FreqDist
        """
        if "_vocab" not in self.__dict__:
            self._vocab = FreqDist(self)
        return self._vocab

    def findall(self, regexp):
        """
        Find instances of the regular expression in the text.
        The text is a list of tokens, and a regexp pattern to match
        a single token must be surrounded by angle brackets.  E.g.

        >>> from nltk.book import text1, text5, text9
        >>> text5.findall("<.*><.*><bro>")
        you rule bro; telling you bro; u twizted bro
        >>> text1.findall("<a>(<.*>)<man>")
        monied; nervous; dangerous; white; white; white; pious; queer; good;
        mature; white; Cape; great; wise; wise; butterless; white; fiendish;
        pale; furious; better; certain; complete; dismasted; younger; brave;
        brave; brave; brave
        >>> text9.findall("<th.*>{3,}")
        thread through those; the thought that; that the thing; the thing
        that; that that thing; through these than through; them that the;
        through the thick; them that they; thought that the

        :param regexp: A regular expression
        :type regexp: str
        """
        if "_token_searcher" not in self.__dict__:
            self._token_searcher = TokenSearcher(self)

        hits = self._token_searcher.findall(regexp)
        hits = [" ".join(h) for h in hits]
        print(tokenwrap(hits, "; "))

    # ////////////////////////////////////////////////////////////
    # Helper Methods
    # ////////////////////////////////////////////////////////////

    _CONTEXT_RE = re.compile(r"\w+|[\.\!\?]")

    def _context(self, tokens, i):
        """
        One left & one right token, both case-normalized.  Skip over
        non-sentence-final punctuation.  Used by the ``ContextIndex``
        that is created for ``similar()`` and ``common_contexts()``.
        """
        # Left context
        j = i - 1
        while j >= 0 and not self._CONTEXT_RE.match(tokens[j]):
            j -= 1
        left = tokens[j] if j != 0 else "*START*"

        # Right context
        j = i + 1
        while j < len(tokens) and not self._CONTEXT_RE.match(tokens[j]):
            j += 1
        right = tokens[j] if j != len(tokens) else "*END*"

        return (left, right)

    # ////////////////////////////////////////////////////////////
    # String Display
    # ////////////////////////////////////////////////////////////

    def __str__(self):
        return "<Text: %s>" % self.name

    def __repr__(self):
        return "<Text: %s>" % self.name


class TextCollection(Text):
    """A collection of texts, which can be loaded with list of texts, or
    with a corpus consisting of one or more texts, and which supports
    counting, concordancing, collocation discovery, etc.  Initialize a
    TextCollection as follows:

    >>> import nltk.corpus
    >>> from nltk.text import TextCollection
    >>> from nltk.book import text1, text2, text3
    >>> gutenberg = TextCollection(nltk.corpus.gutenberg)
    >>> mytexts = TextCollection([text1, text2, text3])

    Iterating over a TextCollection produces all the tokens of all the
    texts in order.
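
    Term weighting follows a standard tf-idf scheme (an illustrative sketch;
    ``text1`` is one of the ``nltk.book`` texts loaded above):

    >>> mytexts.tf('monstrous', text1)       # doctest: +SKIP
    >>> mytexts.idf('monstrous')             # doctest: +SKIP
    >>> mytexts.tf_idf('monstrous', text1)   # doctest: +SKIP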
    """

    def __init__(self, source):
        if hasattr(source, "words"):  # bridge to the text corpus reader
            source = [source.words(f) for f in source.fileids()]

        self._texts = source
        Text.__init__(self, LazyConcatenation(source))
        self._idf_cache = {}

    def tf(self, term, text):
        """The frequency of the term in text."""
        return text.count(term) / len(text)

    def idf(self, term):
        """The number of texts in the corpus divided by the
        number of texts that the term appears in.
        If a term does not appear in the corpus, 0.0 is returned."""
        # idf values are cached for performance.
        idf = self._idf_cache.get(term)
        if idf is None:
            matches = len([True for text in self._texts if term in text])
            if len(self._texts) == 0:
                raise ValueError("IDF undefined for empty document collection")
            idf = log(len(self._texts) / matches) if matches else 0.0
            self._idf_cache[term] = idf
        return idf

    def tf_idf(self, term, text):
        return self.tf(term, text) * self.idf(term)


def demo():
    from nltk.corpus import brown

    text = Text(brown.words(categories="news"))
    print(text)
    print()
    print("Concordance:")
    text.concordance("news")
    print()
    print("Distributionally similar words:")
    text.similar("news")
    print()
    print("Collocations:")
    text.collocations()
    print()
    print("Dispersion plot:")
    text.dispersion_plot(["news", "report", "said", "announced"])
    print()
    print("Vocabulary plot:")
    text.plot(50)
    print()
    print("Indexing:")
    print("text[3]:", text[3])
    print("text[3:5]:", text[3:5])
    print("text.vocab()['news']:", text.vocab()["news"])


if __name__ == "__main__":
    demo()

__all__ = [
    "ContextIndex",
    "ConcordanceIndex",
    "TokenSearcher",
    "Text",
    "TextCollection",
]