
    Qd9                         d Z ddlmZ d Zd Zd Zd Zd Zd Zd	 Z	 G d
 d          Z
d Zedk    r e             dS dS )a  Counts Paice's performance statistics for evaluating stemming algorithms.

What is required:
 - A dictionary of words grouped by their real lemmas
 - A dictionary of words grouped by stems from a stemming algorithm

When these are given, Understemming Index (UI), Overstemming Index (OI),
Stemming Weight (SW) and Error-rate relative to truncation (ERRT) are counted.

References:
Chris D. Paice (1994). An evaluation method for stemming algorithms.
In Proceedings of SIGIR, 42--50.
    )sqrtc                 |    t                      }| D ]*}|                    t          | |                              +|S )a5  
    Get original set of words used for analysis.

    :param lemmas: A dictionary where keys are lemmas and values are sets
        or lists of words corresponding to that lemma.
    :type lemmas: dict(str): list(str)
    :return: Set of words that exist as values in the dictionary
    :rtype: set(str)
    )setupdate)lemmaswordslemmas      2lib/python3.11/site-packages/nltk/metrics/paice.pyget_words_from_dictionaryr      sB     EEE ) )S''((((L    c                     i }| D ]?}|d|         }	 ||                              |g           *# t          $ r	 |h||<   Y <w xY w|S )a  Group words by stems defined by truncating them at given length.

    :param words: Set of words used for analysis
    :param cutlength: Words are stemmed by cutting at this length.
    :type words: set(str) or list(str)
    :type cutlength: int
    :return: Dictionary where keys are stems and values are sets of words
    corresponding to that stem.
    :rtype: dict(str): set(str)
    N)r   KeyError)r   	cutlengthstemswordstems        r
   	_truncater   *   sw     E ! !JYJ	!$Kv&&&& 	! 	! 	!&E$KKK	!Ls   /AAc                 x   | d         \  }}| d         \  }}|d         \  }}|d         \  }}	||z
  ||	z
  z  ||z
  ||z
  z  z
  }
|
dk    r"||cxk    r|cxk    r|cxk    rdk    rn nd|	fS ||z  ||z  z
  ||z
  z  ||z
  ||	z  ||z  z
  z  z
  |
z  }||z  ||z  z
  ||	z
  z  ||z
  ||	z  ||z  z
  z  z
  |
z  }||fS )a{  Count intersection between two line segments defined by coordinate pairs.

    :param l1: Tuple of two coordinate pairs defining the first line segment
    :param l2: Tuple of two coordinate pairs defining the second line segment
    :type l1: tuple(float, float)
    :type l2: tuple(float, float)
    :return: Coordinates of the intersection
    :rtype: tuple(float, float)
    r               )l1l2x1y1x2y2x3y3x4y4denominatorxys                r
   _count_intersectionr%   @   s<    UFBUFBUFBUFB7rBw'27rBw*??Kc&&&&r&&&&R&&&&3&&&&& 9 
b27	rBw'27rBwb7H*II	A 
b27	rBw'27rBwb7H*II	A q6Mr   c                 d    	 | d         | d         z  S # t           $ r t          d          cY S w xY w)zGet derivative of the line from (0,0) to given coordinates.

    :param coordinates: A coordinate pair
    :type coordinates: tuple(float, float)
    :return: Derivative; inf if x is zero
    :rtype: float
    r   r   infZeroDivisionErrorfloat)coordinatess    r
   _get_derivativer,   c   sG    1~A..   U||s    //c                     d\  }}|D ]p}t          |           t          ||                   z  }|rGt          |          }t          ||                   }||t          |           |z
  z  z  }||||z
  z  z  }q||fS )a	  Count understemmed and overstemmed pairs for (lemma, stem) pair with common words.

    :param lemmawords: Set or list of words corresponding to certain lemma.
    :param stems: A dictionary where keys are stems and values are sets
    or lists of words corresponding to that stem.
    :type lemmawords: set(str) or list(str)
    :type stems: dict(str): set(str)
    :return: Amount of understemmed and overstemmed pairs contributed by words
    existing in both lemmawords and stems.
    :rtype: tuple(float, float)
    r   r   )r   len)
lemmawordsr   umtwmtr   cutcutcount	stemcounts           r
   _calculate_cutr6   q   s     HC 5 5*ooE$K 0 00 	53xxHE$K((I8s:9::C8y8344C:r   c                     t           fd D                       }d\  }}}} D ]P}t           |                   }|||dz
  z  z  }||||z
  z  z  }t           |         |          \  }	}
||	z  }||
z  }Q|dz  |dz  |dz  |dz  fS )a  Calculate actual and maximum possible amounts of understemmed and overstemmed word pairs.

    :param lemmas: A dictionary where keys are lemmas and values are sets
    or lists of words corresponding to that lemma.
    :param stems: A dictionary where keys are stems and values are sets
    or lists of words corresponding to that stem.
    :type lemmas: dict(str): list(str)
    :type stems: dict(str): set(str)
    :return: Global unachieved merge total (gumt),
    global desired merge total (gdmt),
    global wrongly merged total (gwmt) and
    global desired non-merge total (gdnt).
    :rtype: tuple(float, float, float, float)
    c              3   B   K   | ]}t          |                   V  d S Nr/   ).0r   r   s     r
   	<genexpr>z_calculate.<locals>.<genexpr>   s/      11$Ct111111r   )r   r   r   r   r      )sumr/   r6   )r   r   ngdmtgdntgumtgwmtr	   
lemmacountr1   r2   s   `          r
   
_calculaterE      s      	1111&11111A1D$d  ''
 	
j1n-- 	
a*n-- "&-77S 	 1HdQhq$(33r   c                     	 | |z  }n# t           $ r d}Y nw xY w	 ||z  }n# t           $ r d}Y nw xY w	 ||z  }n5# t           $ r( |dk    rt          d          }nt          d          }Y nw xY w|||fS )a  Count Understemming Index (UI), Overstemming Index (OI) and Stemming Weight (SW).

    :param gumt, gdmt, gwmt, gdnt: Global unachieved merge total (gumt),
    global desired merge total (gdmt),
    global wrongly merged total (gwmt) and
    global desired non-merge total (gdnt).
    :type gumt, gdmt, gwmt, gdnt: float
    :return: Understemming Index (UI),
    Overstemming Index (OI) and
    Stemming Weight (SW).
    :rtype: tuple(float, float, float)
    r   nanr'   r(   )rB   r@   rC   rA   uioisws          r
   _indexesrK      s    D[   D[   "W   99uBB uB B<s'    ! 00: /A,+A,c                   8    e Zd ZdZd Zd Zd Zd
dZd Zd Z	d	S )Paicez7Class for storing lemmas, stems and evaluation metrics.c                     || _         || _        g | _        d\  | _        | _        | _        | _        d\  | _        | _        | _	        d| _
        |                                  dS )al  
        :param lemmas: A dictionary where keys are lemmas and values are sets
            or lists of words corresponding to that lemma.
        :param stems: A dictionary where keys are stems and values are sets
            or lists of words corresponding to that stem.
        :type lemmas: dict(str): list(str)
        :type stems: dict(str): set(str)
        )NNNN)NNNN)r   r   coordsrB   r@   rC   rA   rH   rI   rJ   errtr   )selfr   r   s      r
   __init__zPaice.__init__   sY     
5M2	49di$6!$'	r   c                 P   d| j         z  g}|                    d| j        z             |                    d| j        z             |                    d| j        z             |                    d| j        z             |                    d| j        z             |                    d| j        z             |                    d| j        z             d		                    d
 | j
        D                       }|                    d|z             d	                    |          S )Nz)Global Unachieved Merge Total (GUMT): %s
z&Global Desired Merge Total (GDMT): %s
z'Global Wrongly-Merged Total (GWMT): %s
z*Global Desired Non-merge Total (GDNT): %s
z&Understemming Index (GUMT / GDMT): %s
z%Overstemming Index (GWMT / GDNT): %s
zStemming Weight (OI / UI): %s
z.Error-Rate Relative to Truncation (ERRT): %s
 c                     g | ]}d |z  S )z(%s, %s)r   )r;   items     r
   
<listcomp>z!Paice.__str__.<locals>.<listcomp>   s    JJJd
T 1JJJr   zTruncation line: %s )rB   appendr@   rC   rA   rH   rI   rJ   rP   joinrO   )rQ   textr+   s      r
   __str__zPaice.__str__   s   <tyHI=	IJJJ>JKKKADIMNNN=GHHH<twFGGG5?@@@FRSSShhJJdkJJJKK)K7888wwt}}r   c                     t          ||          }t          | j        |          \  }}}}t          ||||          dd         \  }}	||	fS )ao  Count (UI, OI) when stemming is done by truncating words at 'cutlength'.

        :param words: Words used for the analysis
        :param cutlength: Words are stemmed by cutting them at this length
        :type words: set(str) or list(str)
        :type cutlength: int
        :return: Understemming and overstemming indexes
        :rtype: tuple(int, int)
        Nr=   )r   rE   r   rK   )
rQ   r   r   	truncatedrB   r@   rC   rA   rH   rI   s
             r
   _get_truncation_indexeszPaice._get_truncation_indexes   sV     eY//	!+DK!C!CdD$$dD11"1"5BBxr   r   c                    t          | j                  }t          d |D                       }g }||k    r|                     ||          }||vr|                    |           |dk    r|S t          |          dk    rM|d         dk    rAt          |d                   }t          |d                   }|| j        cxk    r|k    rn n|S |dz  }||k    |S )	a  Count (UI, OI) pairs for truncation points until we find the segment where (ui, oi) crosses the truncation line.

        :param cutlength: Optional parameter to start counting from (ui, oi)
        coordinates gotten by stemming at this length. Useful for speeding up
        the calculations when you know the approximate location of the
        intersection.
        :type cutlength: int
        :return: List of coordinate pairs that define the truncation line
        :rtype: list(tuple(float, float))
        c              3   4   K   | ]}t          |          V  d S r9   r:   )r;   r   s     r
   r<   z4Paice._get_truncation_coordinates.<locals>.<genexpr>  s(      44dD		444444r   r.   r=   r   r   r   )r   r   maxr_   rY   r/   r,   rJ   )rQ   r   r   	maxlengthrO   pairderivative1derivative2s           r
   _get_truncation_coordinatesz!Paice._get_truncation_coordinates	  s    *$+6644e44444	 9$$//yAAD 6!!d###z!! 6{{aDGcMM-fRj99-fRj99 $'8888[88888!MNI) 9$$* r   c                    |                                  | _        d| j        v r0| j        | j        fdk    rt	          d          S t	          d          S | j        | j        fdk    rdS t          d| j        | j        ff| j        dd                   }t          | j        dz  | j        dz  z             }t          |d	         dz  |d
         dz  z             }||z  S )a  Count Error-Rate Relative to Truncation (ERRT).

        :return: ERRT, length of the line from origo to (UI, OI) divided by
        the length of the line from origo to the point defined by the same
        line when extended until the truncation line.
        :rtype: float
        r.   r'   rG   r   )r   r   rb   Nr=   r   r   )ri   rO   rH   rI   r*   r%   r   )rQ   intersectionopots       r
   _errtzPaice._errt0  s     6688$$!Z//U||#U||#GTW++3 +dgtw'($+bcc*:
 
 $'1*twz)**,q/Q&aA)==>>Bwr   c                    t          | j        | j                  \  | _        | _        | _        | _        t          | j        | j        | j        | j                  \  | _        | _	        | _
        |                                 | _        dS )z7Update statistics after lemmas and stems have been set.N)rE   r   r   rB   r@   rC   rA   rK   rH   rI   rJ   rn   rP   )rQ   s    r
   r   zPaice.updateP  sa    5?TZ5X5X2	49di$,TY	49di$X$X!$'JJLL			r   N)r   )
__name__
__module____qualname____doc__rR   r\   r_   ri   rn   r   r   r   r
   rM   rM      s{        AA  "     % % % %N  @! ! ! ! !r   rM   c            
         ddgddgg dd} dgdgg ddgd	gd
}t          d           t          |           D ]>}t          d                    |d                    | |                                        ?t                       t          d           t          |          D ]>}t          d                    |d                    ||                                        ?t                       t	          | |          }t          |           t                       dgdgdgddgdgd	gd}t          d           t          |          D ]>}t          d                    |d                    ||                                        ?t                       ||_        |                                 t          |           dS )zDemonstration of the module.kneelkneltrangeranged)ringrangrung)ru   rw   ry   )rz   rw   rx   ry   r{   )ru   rv   rz   ry   r{   zWords grouped by their lemmas:z{} => {}rT   z+Same words grouped by a stemming algorithm:rz   )ru   rv   rz   rw   ry   r{   z/Counting stats after changing stemming results:N)printsortedformatrZ   rM   r   r   )r   r   r	   r   ps        r
   demor   W  s    7#8$((( F +++ E 

*+++ A Ajsxxu'>'>??@@@@	GGG	
7888u > >jchhuT{&;&;<<====	GGGfeA	!HHH	GGG 8$ E 

;<<<u > >jchhuT{&;&;<<====	GGGAGHHJJJ	!HHHHHr   __main__N)rs   mathr   r   r   r%   r,   r6   rE   rK   rM   r   rp   r   r   r
   <module>r      s               ,     F    2&4 &4 &4R" " "J|! |! |! |! |! |! |! |!~* * *Z zDFFFFF r   