
    c<                        d Z  G d d          Zedk    rgddlZ e            Zej        dd         D ]KZ ee          5 ZeD ] Z	 e
e                    e	                     !	 ddd           n# 1 swxY w Y   JdS dS )aa  Porter Stemming Algorithm
This is the Porter stemming algorithm, ported to Python from the
version coded up in ANSI C by the author. It may be be regarded
as canonical, in that it follows the algorithm presented in [1]_, see also [2]_

Author - Vivake Gupta (v@nano.com), optimizations and cleanup of the code by Lars Buitinck.

Examples
--------

.. sourcecode:: pycon

    >>> from gensim.parsing.porter import PorterStemmer
    >>>
    >>> p = PorterStemmer()
    >>> p.stem("apple")
    'appl'
    >>>
    >>> p.stem_sentence("Cats and ponies have meeting")
    'cat and poni have meet'
    >>>
    >>> p.stem_documents(["Cats and ponies", "have meeting"])
    ['cat and poni', 'have meet']

.. [1] Porter, 1980, An algorithm for suffix stripping, http://www.cs.odu.edu/~jbollen/IR04/readings/readings5.pdf
.. [2] http://www.tartarus.org/~martin/PorterStemmer

c                   ~    e Zd ZdZd Zd Zd Zd Zd Zd Z	d Z
d	 Zd
 Zd Zd Zd Zd Zd Zd Zd Zd Zd ZdS )PorterStemmera1  Class contains implementation of Porter stemming algorithm.

    Attributes
    --------
    b : str
        Buffer holding a word to be stemmed. The letters are in b[0], b[1] ... ending at b[`k`].
    k : int
        Readjusted downwards as the stemming progresses.
    j : int
        Word length.

    c                 0    d| _         d| _        d| _        d S )N     )bkjselfs    5lib/python3.11/site-packages/gensim/parsing/porter.py__init__zPorterStemmer.__init__.   s        c                 v    | j         |         }|dv rdS |dk    r|dk    p|                     |dz
             S dS )a  Check if b[i] is a consonant letter.

        Parameters
        ----------
        i : int
            Index for `b`.

        Returns
        -------
        bool

        Examples
        --------
        .. sourcecode:: pycon

            >>> from gensim.parsing.porter import PorterStemmer
            >>> p = PorterStemmer()
            >>> p.b = "hi"
            >>> p._cons(1)
            False
            >>> p.b = "meow"
            >>> p._cons(3)
            True

        aeiouFyr      Tr   _cons)r   ichs      r   r   zPorterStemmer._cons3   sS    4 VAY= 	59 	362AE!2!222tr   c                 2   d}	 || j         k    rdS |                     |          sn|dz  })|dz  }d}	 	 || j         k    r|S |                     |          rn|dz  })|dz  }|dz  }	 || j         k    r|S |                     |          sn|dz  })|dz  }d)a  Calculate the number of consonant sequences between 0 and j.

        If c is a consonant sequence and v a vowel sequence, and <..>
        indicates arbitrary presence,

           <c><v>       gives 0
           <c>vc<v>     gives 1
           <c>vcvc<v>   gives 2
           <c>vcvcvc<v> gives 3

        Returns
        -------
        int
            The number of consonant sequences between 0 and j.

        Examples
        --------
        .. sourcecode:: pycon

            >>> from gensim.parsing.porter import PorterStemmer
            >>> p = PorterStemmer()
            >>> p.b = "<bm>aobm<ao>"
            >>> p.j = 11
            >>> p._m()
            2

        r   Tr   )r	   r   )r   r   ns      r   _mzPorterStemmer._mT   s    8 	46z q::a== FA	 	
Q	tv: H::a== Q FAFAtv: Hzz!}} Q FA	r   c                 f     t           fdt           j        dz             D                        S )a  Check if b[0: j + 1] contains a vowel letter.

        Returns
        -------
        bool

        Examples
        --------
        .. sourcecode:: pycon

            >>> from gensim.parsing.porter import PorterStemmer
            >>> p = PorterStemmer()
            >>> p.b = "gnsm"
            >>> p.j = 3
            >>> p._vowelinstem()
            False
            >>> p.b = "gensim"
            >>> p.j = 5
            >>> p._vowelinstem()
            True

        c              3   B   K   | ]}                     |          V  d S N)r   ).0r   r   s     r   	<genexpr>z-PorterStemmer._vowelinstem.<locals>.<genexpr>   s-      @@tzz!}}@@@@@@r   r   )allranger	   r
   s   `r   _vowelinstemzPorterStemmer._vowelinstem   s9    . @@@@eDFQJ.?.?@@@@@@@r   c                 v    |dk    o3| j         |         | j         |dz
           k    o|                     |          S )a5  Check if b[j - 1: j + 1] contain a double consonant letter.

        Parameters
        ----------
        j : int
            Index for `b`

        Returns
        -------
        bool

        Examples
        --------
        .. sourcecode:: pycon

            >>> from gensim.parsing.porter import PorterStemmer
            >>> p = PorterStemmer()
            >>> p.b = "real"
            >>> p.j = 3
            >>> p._doublec(3)
            False
            >>> p.b = "really"
            >>> p.j = 5
            >>> p._doublec(4)
            True

        r   r   r   )r   r	   s     r   _doubleczPorterStemmer._doublec   s7    8 1uEdfQUm3E

1Er   c                     |dk     sE|                      |          r0|                      |dz
            s|                      |dz
            sdS | j        |         dvS )a[  Check if b[j - 2: j + 1] makes the (consonant, vowel, consonant) pattern and also
        if the second 'c' is not 'w', 'x' or 'y'. This is used when trying to restore an 'e' at the end of a short word,
        e.g. cav(e), lov(e), hop(e), crim(e), but snow, box, tray.

        Parameters
        ----------
        i : int
            Index for `b`

        Returns
        -------
        bool

        Examples
        --------
        .. sourcecode:: pycon

            >>> from gensim.parsing.porter import PorterStemmer
            >>> p = PorterStemmer()
            >>> p.b = "lib"
            >>> p.j = 2
            >>> p._cvc(2)
            True
            >>> p.b = "dll"
            >>> p.j = 2
            >>> p._cvc(2)
            False
            >>> p.b = "wow"
            >>> p.j = 2
            >>> p._cvc(2)
            False

           r   Fwxy)r   r   )r   r   s     r   _cvczPorterStemmer._cvc   sg    D q5 	

1 	AE):): 	$**QQRUBSBS 	5vay%%r   c                     |d         | j         | j                 k    rdS t          |          }|| j        dz   k    rdS | j         | j        |z
  dz   | j        dz            |k    rdS | j        |z
  | _        dS )a  Check if b[: k + 1] ends with `s`.

        Parameters
        ----------
        s : str

        Returns
        -------
        bool

        Examples
        --------
        .. sourcecode:: pycon

            >>> from gensim.parsing.porter import PorterStemmer
            >>> p = PorterStemmer()
            >>> p.b = "cowboy"
            >>> p.j = 5
            >>> p.k = 2
            >>> p._ends("cow")
            True

        Fr   T)r   r   lenr	   )r   slengths      r   _endszPorterStemmer._ends   s    0 R5DF46N" 	5QTVaZ  	56$&6/A%dfqj01Q6 	5&tr   c                 |    | j         d| j        dz            |z   | _         t          | j                   dz
  | _        dS )zbAppend `s` to `b`, adjusting `k`.

        Parameters
        ----------
        s : str

        Nr   )r   r	   r*   r   r   r+   s     r   _settozPorterStemmer._setto	  s8     !$q(TVqr   c                 d    |                                  dk    r|                     |           d S d S )Nr   )r   r0   r/   s     r   _rzPorterStemmer._r  s3    7799q= 	KKNNNNN	 	r   c                 p   | j         | j                 dk    rz|                     d          r| xj        dz  c_        nT|                     d          r|                     d           n)| j         | j        dz
           dk    r| xj        dz  c_        |                     d          r,|                                 dk    r| xj        dz  c_        dS dS |                     d	          s|                     d
          r5|                                 r| j        | _        |                     d          r|                     d           dS |                     d          r|                     d           dS |                     d          r|                     d           dS |                     | j                  r+| j         | j        dz
           dvr| xj        dz  c_        dS dS |                                 dk    r5|                     | j                  r|                     d           dS dS dS dS dS )a  Get rid of plurals and -ed or -ing.

           caresses  ->  caress
           ponies    ->  poni
           ties      ->  ti
           caress    ->  caress
           cats      ->  cat

           feed      ->  feed
           agreed    ->  agree
           disabled  ->  disable

           matting   ->  mat
           mating    ->  mate
           meeting   ->  meet
           milling   ->  mill
           messing   ->  mess

           meetings  ->  meet

        r+   ssesr%   iesr   r   eedr   edingatateblbleizizelszeN)	r   r   r-   r0   r   r!   r	   r#   r'   r
   s    r   _step1abzPorterStemmer._step1ab  sR   , 6$&>S  	zz&!! !E"" C    
#s* !::e 	!wwyy1} ! jj 	!$**U"3"3 	!9J9J9L9L 	!VDFzz$ 
!E"""""D!! !E"""""D!! !E"""""tv&& !6$&1*%U2  FFaKFFFF   a !DIIdf$5$5 !C     	! 	! 	! 	!! ! ! !r   c                     |                      d          r2|                                 r | j        d| j                 dz   | _        dS dS dS )zATurn terminal 'y' to 'i' when there is another vowel in the stem.r   Nr   )r-   r!   r   r   r
   s    r   _step1czPorterStemmer._step1cF  sW    ::c?? 	+t0022 	+VGTVG_s*DFFF	+ 	+ 	+ 	+r   c                    | j         | j        dz
           }|dk    rZ|                     d          r|                     d           d+S |                     d          r|                     d           d+S d+S |dk    rZ|                     d          r|                     d	           d+S |                     d
          r|                     d           d+S d+S |dk    r.|                     d          r|                     d           d+S d+S |dk    r|                     d          r|                     d           d+S |                     d          r|                     d           d+S |                     d          r|                     d           d+S |                     d          r|                     d           d+S |                     d          r|                     d           d+S d+S |dk    r|                     d          r|                     d           d+S |                     d          r|                     d           d+S |                     d          r|                     d           d+S d+S |dk    r|                     d          r|                     d           d+S |                     d          r|                     d            d+S |                     d!          r|                     d"           d+S |                     d#          r|                     d           d+S d+S |d$k    r|                     d%          r|                     d           d+S |                     d&          r|                     d            d+S |                     d'          r|                     d           d+S d+S |d(k    r,|                     d)          r|                     d*           d+S d+S d+S ),zMap double suffices to single ones.

        So, -ization ( = -ize plus -ation) maps to -ize etc. Note that the
        string before the suffix must give _m() > 0.

        r   aationalr:   tionaltioncencienceanciancer@   izerr>   lblir<   allialentlienteliousliousoizationationatorr+   alismivenessivefulnessfulousnesstalitiivitibilitiglogilogNr   r   r-   r2   r   r   s     r   _step2zPorterStemmer._step2K  s    VDFQJ9 3	zz)$$  H%%     3Y .	zz&!!  F##     3Y )	zz&!!  3Y &	zz%    F## G$$ E"" G$$  3Y 	zz)$$ G$$ F##  3Y 	zz'"" I&& I&& I&&  3Y 		zz'"" G$$ H%%  3Y 	zz&!! 	 	 r   c                    | j         | j                 }|dk    r|                     d          r|                     d           dS |                     d          r|                     d           dS |                     d          r|                     d           dS dS |dk    r.|                     d	          r|                     d           dS dS |d
k    rZ|                     d          r|                     d           dS |                     d          r|                     d           dS dS |dk    r,|                     d          r|                     d           dS dS dS )z=Deal with -ic-, -full, -ness etc. Similar strategy to _step2.r@   icateicativer   alizerR   r   icitirO   icalr`   r+   nessNri   rj   s     r   _step3zPorterStemmer._step3  s   VDF^9 	zz'"" G$$ G$$  3Y 
	zz'""  3Y 	zz&!! E""  3Y 	zz&!! 	 	 r   c                    | j         | j        dz
           }|dk    r|                     d          sdS n|dk    r.|                     d          s|                     d          sdS n|dk    r|                     d	          sdS n|d
k    r|                     d          sdS n|dk    r.|                     d          s|                     d          sdS nX|dk    r]|                     d          rn;|                     d          rn$|                     d          rn|                     d          rndS |dk    rB|                     d          r| j         | j                 dv rn|                     d          rndS |dk    r|                     d          sdS n|dk    r-|                     d          s|                     d          sdS n\|dk    r|                     d          sdS n>|dk    r|                     d           sdS n |d!k    r|                     d"          sdS ndS |                                 dk    r| j        | _        dS dS )#z2Takes off -ant, -ence etc., in context <c>vcvc<v>.r   rE   rR   NrI   rM   rK   r@   err   rn   rO   ableibler   antementmentrT   rX   ionstour+   ismrb   r:   itiurW   vr^   zr>   )r   r   r-   r	   r   rj   s     r   _step4zPorterStemmer._step4  s   VDFQJ9 2	::d## 3Y /	::f%% djj.@.@ 3Y ,	::d## 3Y )	::d## 3Y &	::f%% djj.@.@ 3Y #	zz%   	G$$ F## E"" 3Y 	zz%   TVDF^t%; D!!  3Y 	::e$$ 3Y 	::e$$ TZZ->-> 3Y 
	::e$$ 3Y 	::e$$ 3Y 	::e$$  F7799q= 	VDFFF	 	r   c                    | j         x}| _        | j        |         dk    rH|                                 }|dk    s|dk    r(|                     |dz
            s| xj         dz  c_         | j        | j                  dk    rD|                     | j                   r,|                                 dk    r| xj         dz  c_         dS dS dS dS )z?Remove a final -e if _m() > 1, and change -ll to -l if m() > 1.r@   r   rO   N)r   r	   r   r   r'   r#   )r   r   rE   s      r   _step5zPorterStemmer._step5  s    VDF6!9 			A1u a 		!a%(8(8 !6$&>S  	T]]46%:%: 	twwyy1} 	FFaKFFFF	 	 	 	 	 	r   c                    |                                 }t          |          dz
  }|dk    r|S || _        || _        |                                  |                                  |                                  |                                  |                                  | 	                                 | j        d| j        dz            S )a  Stem the word `w`.

        Parameters
        ----------
        w : str

        Returns
        -------
        str
            Stemmed version of `w`.

        Examples
        --------

        .. sourcecode:: pycon

            >>> from gensim.parsing.porter import PorterStemmer
            >>> p = PorterStemmer()
            >>> p.stem("ponies")
            'poni'

        r   N)
lowerr*   r   r   rA   rC   rk   rt   r   r   )r   wr   s      r   stemzPorterStemmer.stem  s    . GGIIFFQJ6 	H vktvzk""r   c                 j     d                      fd|                                D                       S )a  Stem the sentence `txt`.

        Parameters
        ----------
        txt : str
            Input sentence.

        Returns
        -------
        str
            Stemmed sentence.

        Examples
        --------
        .. sourcecode:: pycon

            >>> from gensim.parsing.porter import PorterStemmer
            >>> p = PorterStemmer()
            >>> p.stem_sentence("Wow very nice woman with apple")
            'wow veri nice woman with appl'

         c              3   B   K   | ]}                     |          V  d S r   )r   r   xr   s     r   r   z.PorterStemmer.stem_sentence.<locals>.<genexpr>%  s-      ::		!::::::r   )joinsplit)r   txts   ` r   stem_sentencezPorterStemmer.stem_sentence  s4    . xx::::ciikk::::::r   c                        fd|D             S )a  Stem documents.

        Parameters
        ----------
        docs : list of str
            Input documents

        Returns
        -------
        list of str
            Stemmed documents.

        Examples
        --------
        .. sourcecode:: pycon

            >>> from gensim.parsing.porter import PorterStemmer
            >>> p = PorterStemmer()
            >>> p.stem_documents(["Have a very nice weekend", "Have a very nice weekend"])
            ['have a veri nice weekend', 'have a veri nice weekend']

        c                 :    g | ]}                     |          S  )r   r   s     r   
<listcomp>z0PorterStemmer.stem_documents.<locals>.<listcomp>>  s'    444!""1%%444r   r   )r   docss   ` r   stem_documentszPorterStemmer.stem_documents'  s    . 5444t4444r   N)__name__
__module____qualname____doc__r   r   r   r!   r#   r'   r-   r0   r2   rA   rC   rk   rt   r   r   r   r   r   r   r   r   r   r   !   s:          
  B4 4 4lA A A2F F F<$& $& $&L     D	! 	! 	!  ,! ,! ,!\+ + +
; ; ;|  ,7 7 7r  *# *# *#X; ; ;25 5 5 5 5r   r   __main__r   Nr   )r   r   r   syspargvfopeninfilelineprintr   r   r   r   <module>r      s$   <]5 ]5 ]5 ]5 ]5 ]5 ]5 ]5@ z -JJJAXabb\ - -T!WW 	- - -aood++,,,,-	- 	- 	- 	- 	- 	- 	- 	- 	- 	- 	- 	- 	- 	- 	-- -
- -s    $A11A5	8A5	