
    c8                     F   d Z ddlZddlZddlZddlmZ ddlmZ  eg d          Z	 ej
        d ej        ej                  z  ej                  Z ej
        dej                  Z ej
        dej                  Z ej
        d	ej                  Z ej
        d
ej                  Z ej
        dej                  Z ej
        dej                  Zd#dZd#dZd Zd Zd$dZd$dZd Zd Zd Zd Zd ZeZ d%dZ!d Z"d eeeeeeegZ#e#fdZ$d  Z%d! Z&d" Z'dS )&a  This module contains methods for parsing and preprocessing strings.

Examples
--------

.. sourcecode:: pycon

    >>> from gensim.parsing.preprocessing import remove_stopwords, preprocess_string
    >>> remove_stopwords("Better late than never, but better never late.")
    u'Better late never, better late.'
    >>>
    >>> preprocess_string("<i>Hel 9lo</i> <b>Wo9 rld</b>! Th3     weather_is really g00d today, isn't it?")
    [u'hel', u'rld', u'weather', u'todai', u'isn']

    N)utils)PorterStemmer(Q  allsixjustlessbeingindeedovermoveanywayfournotownthroughusingfiftywheremillonlyfindbeforeonewhosesystemhow	somewheremuchthickshowhadenoughshouldtomustwhomseeming
yourselvesunderourstwohasmight
thereafterlatterlydothemhisaroundthangetverydenonecannoteveryuntheyfrontduringthusnowhimnorname	regardingseveral	hereafterdidalwayswhodidnwhitherthissomeoneeithereachbecome	thereuponsometimesidetowardsthereintwelvebecauseoftentenourdoingkmegsomebackusedupgonamelycomputerarefurtherbeyond	ourselvesyetoutevenwillwhatstillforbottomminesincepleasefortyperits
everythingbehinddoesvariousabovebetweenitneitherseemedeveracrossshesomehowbewefullneversixtyhoweverhere	otherwisewere	whereuponnowherealthoughfoundalonerealongquitefifteenbybothaboutlastwouldanythingviamanycouldthenceputagainstkeepetcamountbecameltdhenceontoorconamongalreadyco
afterwardsformerlywithinseemsintootherswhilewhateverexceptdownherseveryonedoneleastanotherwhoevermoreovercouldnt
throughoutanyhowyourselfthreefromherfewtogethertopthereduebeennextanyoneelevencrycall	thereforeinterestthenthru
themselveshundredreallysincereemptymorehimself	elsewheremostlyonfireambecomingherebyamongstelsepart
everywheretookgherselfformerthosehememyselfmadetwentythesewasbillcantusuntilbesidesneverthelessbelowanywhereninecanwhetherofyourtowardmysay	somethingand
whereafterwhenevergivealmostwhereverisdescribe
beforehandhereindoesnanasitselfathaveinseemwhenceieanyfillagainhasntinctherebythinnoperhapslatter	meanwhilewhendetailsamewhereinbesidealsothatothertakewhichbecomesyouifnobodyunlesswhereasseethoughmayafteruponmosthereuponeightbutseriousnothingsuchwhyoffadonwherebythirdiwholenoone	sometimeswellamoungstyourstheirratherwithoutsofivethefirstwithmakeoncez([%s])+z	<([^>]+)>z[0-9]+z\Wz([a-z]+)([0-9]+))flagsz([0-9]+)([a-z]+)z(\s)+c                     t          j        |           } d                    t          |                                 |                    S )aF  Remove :const:`~gensim.parsing.preprocessing.STOPWORDS` from `s`.

    Parameters
    ----------
    s : str
    stopwords : iterable of str, optional
        Sequence of stopwords
        If None - using :const:`~gensim.parsing.preprocessing.STOPWORDS`

    Returns
    -------
    str
        Unicode string without `stopwords`.

    Examples
    --------
    .. sourcecode:: pycon

        >>> from gensim.parsing.preprocessing import remove_stopwords
        >>> remove_stopwords("Better late than never, but better never late.")
        u'Better late never, better late.'

     )r   
to_unicodejoinremove_stopword_tokenssplit)s	stopwordss     <lib/python3.11/site-packages/gensim/parsing/preprocessing.pyremove_stopwordsr`  G   s9    0 	A88*17799i@@AAA    c                 2    t           fd| D             S )ar  Remove stopword tokens using list `stopwords`.

    Parameters
    ----------
    tokens : iterable of str
        Sequence of tokens.
    stopwords : iterable of str, optional
        Sequence of stopwords
        If None - using :const:`~gensim.parsing.preprocessing.STOPWORDS`

    Returns
    -------
    list of str
        List of tokens without `stopwords`.

    Nc                     g | ]}|v|	S  rd  ).0tokenr^  s     r_  
<listcomp>z*remove_stopword_tokens.<locals>.<listcomp>v   s#    @@@ei)?@E@@@ra  )	STOPWORDS)tokensr^  s    `r_  r[  r[  c   s,    "  	@@@@v@@@@ra  c                 `    t          j        |           } t                              d|           S )a<  Replace ASCII punctuation characters with spaces in `s` using :const:`~gensim.parsing.preprocessing.RE_PUNCT`.

    Parameters
    ----------
    s : str

    Returns
    -------
    str
        Unicode string without punctuation characters.

    Examples
    --------
    .. sourcecode:: pycon

        >>> from gensim.parsing.preprocessing import strip_punctuation
        >>> strip_punctuation("A semicolon is a stronger break than a comma, but not as much as a full stop!")
        u'A semicolon is a stronger break than a comma  but not as much as a full stop '

    rX  )r   rY  RE_PUNCTsubr]  s    r_  strip_punctuationrn  y   s(    * 	A<<Qra  c                 `    t          j        |           } t                              d|           S )a  Remove tags from `s` using :const:`~gensim.parsing.preprocessing.RE_TAGS`.

    Parameters
    ----------
    s : str

    Returns
    -------
    str
        Unicode string without tags.

    Examples
    --------
    .. sourcecode:: pycon

        >>> from gensim.parsing.preprocessing import strip_tags
        >>> strip_tags("<i>Hello</i> <b>World</b>!")
        u'Hello World!'

     )r   rY  RE_TAGSrl  rm  s    r_  
strip_tagsrr     s(    * 	A;;r1ra     c                     t          j        |           } d                    t          |                                 |                    S )a  Remove words with length lesser than `minsize` from `s`.

    Parameters
    ----------
    s : str
    minsize : int, optional

    Returns
    -------
    str
        Unicode string without short words.

    Examples
    --------
    .. sourcecode:: pycon

        >>> from gensim.parsing.preprocessing import strip_short
        >>> strip_short("salut les amis du 59")
        u'salut les amis'
        >>>
        >>> strip_short("one two three four five six seven eight nine ten", minsize=5)
        u'three seven eight'

    rX  )r   rY  rZ  remove_short_tokensr\  )r]  minsizes     r_  strip_shortrw     s9    2 	A88'		7;;<<<ra  c                      fd| D             S )a$  Remove tokens shorter than `minsize` chars.

    Parameters
    ----------
    tokens : iterable of str
        Sequence of tokens.
    minsize : int, optimal
        Minimal length of token (include).

    Returns
    -------
    list of str
        List of tokens without short tokens.
    c                 :    g | ]}t          |          k    |S rd  )len)re  rf  rv  s     r_  rg  z'remove_short_tokens.<locals>.<listcomp>   s)    ???eUw)>?E???ra  rd  )ri  rv  s    `r_  ru  ru     s       @???v????ra  c                 `    t          j        |           } t                              d|           S )a  Remove digits from `s` using :const:`~gensim.parsing.preprocessing.RE_NUMERIC`.

    Parameters
    ----------
    s : str

    Returns
    -------
    str
        Unicode  string without digits.

    Examples
    --------
    .. sourcecode:: pycon

        >>> from gensim.parsing.preprocessing import strip_numeric
        >>> strip_numeric("0text24gensim365test")
        u'textgensimtest'

    rp  )r   rY  
RE_NUMERICrl  rm  s    r_  strip_numericr}     s(    * 	A>>"a   ra  c                 `    t          j        |           } t                              d|           S )a6  Remove non-alphabetic characters from `s` using :const:`~gensim.parsing.preprocessing.RE_NONALPHA`.

    Parameters
    ----------
    s : str

    Returns
    -------
    str
        Unicode string with alphabetic characters only.

    Notes
    -----
    Word characters - alphanumeric & underscore.

    Examples
    --------
    .. sourcecode:: pycon

        >>> from gensim.parsing.preprocessing import strip_non_alphanum
        >>> strip_non_alphanum("if-you#can%read$this&then@this#method^works")
        u'if you can read this then this method works'

    rX  )r   rY  RE_NONALPHArl  rm  s    r_  strip_non_alphanumr     s(    2 	A??3"""ra  c                 `    t          j        |           } t                              d|           S )aP  Remove repeating whitespace characters (spaces, tabs, line breaks) from `s`
    and turns tabs & line breaks into spaces using :const:`~gensim.parsing.preprocessing.RE_WHITESPACE`.

    Parameters
    ----------
    s : str

    Returns
    -------
    str
        Unicode string without repeating in a row whitespace characters.

    Examples
    --------
    .. sourcecode:: pycon

        >>> from gensim.parsing.preprocessing import strip_multiple_whitespaces
        >>> strip_multiple_whitespaces("salut" + '\r' + " les" + '\n' + "         loulous!")
        u'salut les loulous!'

    rX  )r   rY  RE_WHITESPACErl  rm  s    r_  strip_multiple_whitespacesr    s*    , 	AS!$$$ra  c                     t          j        |           } t                              d|           } t                              d|           S )a  Add spaces between digits & letters in `s` using :const:`~gensim.parsing.preprocessing.RE_AL_NUM`.

    Parameters
    ----------
    s : str

    Returns
    -------
    str
        Unicode string with spaces between digits & letters.

    Examples
    --------
    .. sourcecode:: pycon

        >>> from gensim.parsing.preprocessing import split_alphanum
        >>> split_alphanum("24.0hours7 days365 a1b2c3")
        u'24.0 hours 7 days 365 a 1 b 2 c 3'

    z\1 \2)r   rY  	RE_AL_NUMrl  	RE_NUM_ALrm  s    r_  split_alphanumr  ,  s;    * 	Ah""A==1%%%ra  c                     t          j        |           } t                      d                    fd|                                 D                       S )a  Transform `s` into lowercase and stem it.

    Parameters
    ----------
    text : str

    Returns
    -------
    str
        Unicode lowercased and porter-stemmed version of string `text`.

    Examples
    --------
    .. sourcecode:: pycon

        >>> from gensim.parsing.preprocessing import stem_text
        >>> stem_text("While it is quite useful to be able to search a large collection of documents almost instantly.")
        u'while it is quit us to be abl to search a larg collect of document almost instantly.'

    rX  c              3   B   K   | ]}                     |          V  d S N)stem)re  wordps     r_  	<genexpr>zstem_text.<locals>.<genexpr>]  s-      ::TAFF4LL::::::ra  )r   rY  r   rZ  r\  )textr  s    @r_  	stem_textr  F  sM    * D!!DA88::::TZZ\\::::::ra  utf8strictc                 R    t          j        |                                 ||          S )a  Lowercase `text` and convert to unicode, using :func:`gensim.utils.any2unicode`.

    Parameters
    ----------
    text : str
        Input text.
    encoding : str, optional
        Encoding that will be used for conversion.
    errors : str, optional
        Error handling behaviour, used as parameter for `unicode` function (python2 only).

    Returns
    -------
    str
        Unicode version of `text`.

    See Also
    --------
    :func:`gensim.utils.any2unicode`
        Convert any string to unicode-string.

    )r   rY  lower)r  encodingerrorss      r_  lower_to_unicoder  c  s!    . DJJLL(F;;;ra  c                     d t          j        |                                                               d          D             S )zSplit line by spaces, used in :class:`gensim.corpora.lowcorpus.LowCorpus`.

    Parameters
    ----------
    s : str
        Some line.

    Returns
    -------
    list of str
        List of tokens from `s`.

    c                     g | ]}||S rd  rd  )re  r  s     r_  rg  z"split_on_space.<locals>.<listcomp>  s    LLLTtLDLLLra  rX  )r   rY  stripr\  rm  s    r_  split_on_spacer  }  s>     MLU-a006688>>sCCLLLLra  c                 *    |                                  S r  )r  )xs    r_  <lambda>r    s    aggii ra  c                 r    t          j        |           } |D ]} ||           } |                                 S )a  Apply list of chosen filters to `s`.

    Default list of filters:

    * :func:`~gensim.parsing.preprocessing.strip_tags`,
    * :func:`~gensim.parsing.preprocessing.strip_punctuation`,
    * :func:`~gensim.parsing.preprocessing.strip_multiple_whitespaces`,
    * :func:`~gensim.parsing.preprocessing.strip_numeric`,
    * :func:`~gensim.parsing.preprocessing.remove_stopwords`,
    * :func:`~gensim.parsing.preprocessing.strip_short`,
    * :func:`~gensim.parsing.preprocessing.stem_text`.

    Parameters
    ----------
    s : str
    filters: list of functions, optional

    Returns
    -------
    list of str
        Processed strings (cleaned).

    Examples
    --------
    .. sourcecode:: pycon

        >>> from gensim.parsing.preprocessing import preprocess_string
        >>> preprocess_string("<i>Hel 9lo</i> <b>Wo9 rld</b>! Th3     weather_is really g00d today, isn't it?")
        [u'hel', u'rld', u'weather', u'todai', u'isn']
        >>>
        >>> s = "<i>Hel 9lo</i> <b>Wo9 rld</b>! Th3     weather_is really g00d today, isn't it?"
        >>> CUSTOM_FILTERS = [lambda x: x.lower(), strip_tags, strip_punctuation]
        >>> preprocess_string(s, CUSTOM_FILTERS)
        [u'hel', u'9lo', u'wo9', u'rld', u'th3', u'weather', u'is', u'really', u'g00d', u'today', u'isn', u't', u'it']

    )r   rY  r\  )r]  filtersfs      r_  preprocess_stringr    sA    J 	A  AaDD7799ra  c                     d | D             S )a'  Apply :const:`~gensim.parsing.preprocessing.DEFAULT_FILTERS` to the documents strings.

    Parameters
    ----------
    docs : list of str

    Returns
    -------
    list of list of str
        Processed documents split by whitespace.

    Examples
    --------
    .. sourcecode:: pycon

        >>> from gensim.parsing.preprocessing import preprocess_documents
        >>> preprocess_documents(["<i>Hel 9lo</i> <b>Wo9 rld</b>!", "Th3     weather_is really g00d today, isn't it?"])
        [[u'hel', u'rld'], [u'weather', u'todai', u'isn']]

    c                 ,    g | ]}t          |          S rd  )r  )re  ds     r_  rg  z(preprocess_documents.<locals>.<listcomp>  s!    ///Qa  ///ra  rd  )docss    r_  preprocess_documentsr    s    * 0/$////ra  c                     t          j        | d          5 }|                                cd d d            S # 1 swxY w Y   d S )Nrb)r   openread)pathfins     r_  	read_filer    s    	D$		 3xxzz                 s   7;;c                 >    d t          j         |           D             S )Nc                 ,    g | ]}t          |          S rd  )r  )re  fnames     r_  rg  zread_files.<locals>.<listcomp>  s     ===Ie===ra  )glob)patterns    r_  
read_filesr    s     ==$)G*<*<====ra  r  )rs  )r  r  )(__doc__r   stringr  gensimr   gensim.parsing.porterr   	frozensetrh  compileescapepunctuationUNICODErk  rq  r|  r  r  r  r  r`  r[  rn  rr  rw  ru  r}  r  r  r  r  r  r  r  DEFAULT_FILTERSr  r  r  r  rd  ra  r_  <module>r     s     
			         / / / / / / I     	@ 2:j929V-?#@#@@"*MM
"*\2:
.
.RZ	2:..
bj
++BJ*"*===	BJ*"*===	
8RZ00B B B B8A A A A,     4  2= = = =:@ @ @ @&! ! !2# # #:% % %4& & &4; ; ;4 < < < <4M M M$ %6k9 "1 ( ( ( (V0 0 00  
> > > > >ra  