
    Qd@                        d Z ddlZddlmZmZ ddlmZ d Zej	        Z
d ZdZ	 ddlmZ n# e$ r d	 ZY nw xY wdZ	 d
Z	 dZ	  G d de          Z G d de          Z G d de          Z G d de          Z G d d          ZdS )z
Provides scoring functions for a number of association measures through a
generic, abstract implementation in ``NgramAssocMeasures``, and n-specific
``BigramAssocMeasures`` and ``TrigramAssocMeasures``.
    N)ABCMetaabstractmethodreducec                 *    t          j        |           S N)_mathlog2)xs    8lib/python3.11/site-packages/nltk/metrics/association.py<lambda>r      s    %*Q--     c                 $    t          d |           S )Nc                     | |z  S r    )r   ys     r   r   z<lambda>.<locals>.<lambda>   s
    Q r   r   )ss    r   r   r      s    V..22 r   g#B;)fisher_exactc                      t           r   NotImplementedError)_args_kwargss     r   r   r      s    !!r   c                   (   e Zd ZdZdZeed                         Zeed                         Ze	d             Z
ed             Ze	d             Ze	d             Zed	             Ze	d
             Ze	d             Ze	d             Ze	d             ZdS )NgramAssocMeasuresa  
    An abstract class defining a collection of generic association measures.
    Each public method returns a score, taking the following arguments::

        score_fn(count_of_ngram,
                 (count_of_n-1gram_1, ..., count_of_n-1gram_j),
                 (count_of_n-2gram_1, ..., count_of_n-2gram_k),
                 ...,
                 (count_of_1gram_1, ..., count_of_1gram_n),
                 count_of_total_words)

    See ``BigramAssocMeasures`` and ``TrigramAssocMeasures``

    Inheriting classes should define a property _n, and a method _contingency
    which calculates contingency values from marginals in order for all
    association measures defined here to be usable.
    r   c                       t          d          )z>Calculates values of a contingency table from marginal values.?The contingency table is not availablein the general ngram caser   	marginalss    r   _contingencyzNgramAssocMeasures._contingencyB        "P
 
 	
r   c                       t          d          )ACalculates values of contingency table marginals from its values.r   r   )contingencys    r   
_marginalszNgramAssocMeasures._marginalsJ   r#   r   c              #      K   t                    }d t           j                  D             }t          t                              D ]/t	           fd|D                       | j        dz
  z  z  V  0dS )3Calculates expected values for a contingency table.c                     g | ]}d |z  S )   r   ).0is     r   
<listcomp>z7NgramAssocMeasures._expected_values.<locals>.<listcomp>V   s    ...1Q...r   c              3   ~   K   | ]6t          fd t          dj        z            D                       V  7dS )c              3   @   K   | ]}|z  z  k    |         V  d S r   r   )r,   r   contr-   js     r   	<genexpr>z@NgramAssocMeasures._expected_values.<locals>.<genexpr>.<genexpr>]   s9      PPAa!eQ=O=OQ=O=O=O=OPPr      N)sumrange_n)r,   r2   clsr1   r-   s    @r   r3   z6NgramAssocMeasures._expected_values.<locals>.<genexpr>\   si         PPPPPPq#&y)9)9PPPPP     r   r+   N)r5   r6   r7   len_product)r8   r1   n_allbitsr-   s   ``  @r   _expected_valuesz#NgramAssocMeasures._expected_valuesR   s       D		..cf... s4yy!! 	 	A       !     SVaZ(	*   	 	r   c                  8    | t                    | t                   z  S )z Scores ngrams by their frequency)NGRAMTOTALr    s    r   raw_freqzNgramAssocMeasures.raw_freqc   s     )E"222r   c                     |t                    t          |t                             |t                   | j        dz
  z  z  z
  |t                    t
          z   dz  z  S )zScores ngrams using Student's t test with independence hypothesis
        for unigrams, as in Manning and Schutze 5.3.1.
        r+   g      ?)r?   r:   UNIGRAMSr@   r7   _SMALLr8   r!   s     r   	student_tzNgramAssocMeasures.student_th   sS     ey*++y/?CFQJ/OPQu&3./ 	/r   c                      | j         | }|                     |          }t          d t          ||          D                       S )zZScores ngrams using Pearson's chi-square as in Manning and Schutze
        5.3.3.
        c              3   B   K   | ]\  }}||z
  d z  |t           z   z  V  dS )r4   N)rD   r,   obsexps      r   r3   z,NgramAssocMeasures.chi_sq.<locals>.<genexpr>y   s8      UUcC#I!#sV|4UUUUUUr   )r"   r=   r5   zip)r8   r!   r1   expss       r   chi_sqzNgramAssocMeasures.chi_sqr   sK    
  s+##D))UUSt__UUUUUUr   c                      | t                    |                    dd          z  t          | t                             z  S )zScores ngrams using a variant of mutual information. The keyword
        argument power sets an exponent (default 3) for the numerator. No
        logarithm of the result is calculated.
        power   )r?   getr:   rC   )r!   kwargss     r   mi_likezNgramAssocMeasures.mi_like{   s=     6::gq#9#99Hh=
 =
 
 	
r   c                     t          |t                   |t                   | j        dz
  z  z            t          t	          |t
                                       z
  S )z^Scores ngrams by pointwise mutual information, as in Manning and
        Schutze 5.4.
        r+   )_log2r?   r@   r7   r:   rC   rE   s     r   pmizNgramAssocMeasures.pmi   sQ    
 Yu%	%(8SVaZ(HHIIEYx())M
 M
 
 	
r   c           
           | j         | }dt          d t          ||                     |                    D                       z  S )zFScores ngrams using likelihood ratios as in Manning and Schutze 5.3.4.r4   c              3   f   K   | ],\  }}|t          ||t          z   z  t          z             z  V  -d S r   )_lnrD   rI   s      r   r3   z6NgramAssocMeasures.likelihood_ratio.<locals>.<genexpr>   sU       
 
S #cS6\*V3444
 
 
 
 
 
r   )r"   r5   rL   r=   r8   r!   r1   s      r   likelihood_ratioz#NgramAssocMeasures.likelihood_ratio   s`      s+3 
 
c&:&:4&@&@AA
 
 
 
 
 
 	
r   c                     t          |t                             |t                   | j        dz
  z  z  }|t                   t          |t                   |z            dz
  z  S )z1Scores ngrams using the Poisson-Stirling measure.r+   )r:   rC   r@   r7   r?   rV   )r8   r!   rK   s      r   poisson_stirlingz#NgramAssocMeasures.poisson_stirling   sR     y*++y/?CFQJ/OP55)9C)?#@#@1#DEEr   c                 V     | j         | }|d         t          |dd                   z  S )z&Scores ngrams using the Jaccard index.r   Nr   )r"   r5   r[   s      r   jaccardzNgramAssocMeasures.jaccard   s0      s+AwT#2#Y''r   N)__name__
__module____qualname____doc__r7   staticmethodr   r"   r'   classmethodr=   rA   rF   rN   rT   rW   r\   r^   r`   r   r   r   r   r   -   sf        $ 
B
 
 ^ \
 
 
 ^ \
   [  3 3 \3 / / [/ V V [V 
 
 \
 
 
 [
 
 
 [
 F F [F
 ( ( [( ( (r   r   )	metaclassc                       e Zd ZdZdZed             Zed             Zed             Ze	d             Z
e	d             Ze	d             Zed	             Zd
S )BigramAssocMeasuresa  
    A collection of bigram association measures. Each association measure
    is provided as a function with three arguments::

        bigram_score_fn(n_ii, (n_ix, n_xi), n_xx)

    The arguments constitute the marginals of a contingency table, counting
    the occurrences of particular events in a corpus. The letter i in the
    suffix refers to the appearance of the word in question, while x indicates
    the appearance of any word. Thus, for example:

    - n_ii counts ``(w1, w2)``, i.e. the bigram being scored
    - n_ix counts ``(w1, *)``
    - n_xi counts ``(*, w2)``
    - n_xx counts ``(*, *)``, i.e. any bigram

    This may be shown with respect to a contingency table::

                w1    ~w1
             ------ ------
         w2 | n_ii | n_oi | = n_xi
             ------ ------
        ~w2 | n_io | n_oo |
             ------ ------
             = n_ix        TOTAL = n_xx
    r4   c                 >    |\  }}|| z
  }|| z
  }| |||| z
  |z
  |z
  fS )zECalculates values of a bigram contingency table from marginal values.r   )n_iin_ix_xi_tuplen_xxn_ixn_xin_oin_ios          r   r"   z BigramAssocMeasures._contingency   s<     %td{d{dD$+"4t";<<r   c                 .    | || z   || z   f||z   |z   | z   fS )r%   r   )rk   rp   rq   n_oos       r   r'   zBigramAssocMeasures._marginals   s,     td{D4K0$+2Dt2KLLr   c              #      K   t          |           }t          d          D ]0}| |         | |dz           z   | |         | |dz           z   z  |z  V  1dS )r)      r+   r4   N)r5   r6   )r1   rm   r-   s      r   r=   z$BigramAssocMeasures._expected_values   sq       4yyq 	K 	KA7T!a%[(T!WtAE{-BCdJJJJJ	K 	Kr   c                 l     | j         | \  }}}}||z  ||z  z
  dz  ||z   ||z   z  ||z   z  ||z   z  z  S )zdScores bigrams using phi-square, the square of the Pearson correlation
        coefficient.
        r4   )r"   )r8   r!   rk   rq   rp   rs   s         r   phi_sqzBigramAssocMeasures.phi_sq   s]    
 "2!19!=dD$tdTk)a/D[TD[)TD[9TD[I
 	
r   c                 D    |\  }}||                      |||f|          z  S )zScores bigrams using chi-square, i.e. phi-sq multiplied by the number
        of bigrams, as in Manning and Schutze 5.3.3.
        )rw   )r8   rk   rl   rm   rn   ro   s         r   rN   zBigramAssocMeasures.chi_sq   s,    
 %tcjjd|T::::r   c                 X     | j         | \  }}}}t          ||g||ggd          \  }}|S )zScores bigrams using Fisher's Exact Test (Pedersen 1996).  Less
        sensitive to small counts than PMI or Chi Sq, but also more expensive
        to compute. Requires scipy.
        less)alternative)r"   r   )r8   r!   rk   rq   rp   rs   oddspvalues           r   fisherzBigramAssocMeasures.fisher   sE     "2!19!=dD$%d|dD\&BPVWWWvr   c                 "    |\  }}d| z  ||z   z  S )z(Scores bigrams using Dice's coefficient.r4   r   )rk   rl   rm   rn   ro   s        r   dicezBigramAssocMeasures.dice   s      %t4x4$;''r   N)ra   rb   rc   rd   r7   re   r"   r'   r=   rf   rw   rN   r~   r   r   r   r   ri   ri      s         6 
B= = \= M M \M K K \K 
 
 [
 ; ; [; 	 	 [	 ( ( \( ( (r   ri   c                   B    e Zd ZdZdZed             Zed             ZdS )TrigramAssocMeasuresa  
    A collection of trigram association measures. Each association measure
    is provided as a function with four arguments::

        trigram_score_fn(n_iii,
                         (n_iix, n_ixi, n_xii),
                         (n_ixx, n_xix, n_xxi),
                         n_xxx)

    The arguments constitute the marginals of a contingency table, counting
    the occurrences of particular events in a corpus. The letter i in the
    suffix refers to the appearance of the word in question, while x indicates
    the appearance of any word. Thus, for example:

    - n_iii counts ``(w1, w2, w3)``, i.e. the trigram being scored
    - n_ixx counts ``(w1, *, *)``
    - n_xxx counts ``(*, *, *)``, i.e. any trigram
    rQ   c                     |\  }}}|\  }}}	|| z
  }
|| z
  }|| z
  }|	| z
  |
z
  |z
  }|| z
  |
z
  |z
  }|| z
  |z
  |z
  }|| z
  |
z
  |z
  |z
  |z
  |z
  |z
  }| |
||||||fS )zCalculates values of a trigram contingency table (or cube) from
        marginal values.
        >>> TrigramAssocMeasures._contingency(1, (1, 1, 1), (1, 73, 1), 2000)
        (1, 0, 0, 0, 0, 72, 0, 1927)
        r   )n_iiin_iix_tuplen_ixx_tuplen_xxxn_iixn_ixin_xiin_ixxn_xixn_xxin_oiin_ioin_iion_ooin_oion_ioon_ooos                    r   r"   z!TrigramAssocMeasures._contingency  s     !,u +u%-%-%-%-5=EMueUE5%GGr   c                      | \  }}}}}}}}|||z   ||z   ||z   f||z   |z   |z   ||z   |z   |z   ||z   |z   |z   ft          |           fS )zCalculates values of contingency table marginals from its values.
        >>> TrigramAssocMeasures._marginals(1, 0, 0, 0, 0, 72, 0, 1927)
        (1, (1, 1, 1), (1, 73, 1), 2000)
        r5   )	r&   r   r   r   r   r   r   r   r   s	            r   r'   zTrigramAssocMeasures._marginals&  s     BM>ueUE5%U]EEM55=9%-%-%-
 	
 		
r   Nra   rb   rc   rd   r7   re   r"   r'   r   r   r   r   r      s\         & 
BH H \H$ 
 
 \
 
 
r   r   c                   B    e Zd ZdZdZed             Zed             ZdS )QuadgramAssocMeasuresaF  
    A collection of quadgram association measures. Each association measure
    is provided as a function with five arguments::

        trigram_score_fn(n_iiii,
                        (n_iiix, n_iixi, n_ixii, n_xiii),
                        (n_iixx, n_ixix, n_ixxi, n_xixi, n_xxii, n_xiix),
                        (n_ixxx, n_xixx, n_xxix, n_xxxi),
                        n_all)

    The arguments constitute the marginals of a contingency table, counting
    the occurrences of particular events in a corpus. The letter i in the
    suffix refers to the appearance of the word in question, while x indicates
    the appearance of any word. Thus, for example:

    - n_iiii counts ``(w1, w2, w3, w4)``, i.e. the quadgram being scored
    - n_ixxi counts ``(w1, *, *, w4)``
    - n_xxxx counts ``(*, *, *, *)``, i.e. any quadgram
    ru   c                    |\  }}}}|\  }	}
}}}}|\  }}}}|| z
  }|| z
  }|| z
  }|| z
  |z
  |z
  }|| z
  |z
  |z
  }|| z
  |z
  |z
  }|| z
  |z
  |z
  |z
  |z
  |z
  |z
  }|| z
  }|| z
  |z
  |z
  }|
| z
  |z
  |z
  }|| z
  |z
  |z
  |z
  |z
  |z
  |z
  }|	| z
  |z
  |z
  }|| z
  |z
  |z
  |z
  |z
  |z
  |z
  }|| z
  |z
  |z
  |z
  |z
  |z
  |z
  } || z
  |z
  |z
  |z
  |z
  |z
  |z
  |z
  |z
  |z
  |z
  |z
  |z
  |z
  | z
  }!| |||||||||||||| |!fS )zXCalculates values of a quadgram contingency table from
        marginal values.
        r   )"n_iiiin_iiix_tuplen_iixx_tuplen_ixxx_tuplen_xxxxn_iiixn_iixin_ixiin_xiiin_iixxn_ixixn_ixxin_xixin_xxiin_xiixn_ixxxn_xixxn_xxixn_xxxin_oiiin_ioiin_iioin_ooiin_oioin_iooin_oooin_iiion_oiion_ioion_ooion_iioon_oioon_iooon_oooos"                                     r   r"   z"QuadgramAssocMeasures._contingencyP  s>   
 ,8(;G8+7(&&&&6)F2&6)F2&6)F2&6)F2V;fDvMPVV&&6)F2&6)F2&6)F2V;fDvMPVV&6)F2&6)F2V;fDvMPVV&6)F2V;fDvMPVV  	
     	 
      	( !
 	
r   c                     | \  }}}}}}}}}	}
}}}}}}||	z   }||z   }||z   }||z   }||z   |	z   |z   }||z   |	z   |z   }||z   |z   |z   }||z   |z   |z   }||z   |z   |z   }||z   |	z   |
z   }||z   |z   |	z   |z   |z   |z   |z   }||z   |z   |	z   |z   |
z   |z   |z   }||z   |z   |	z   |z   |z   |
z   |z   }||z   |z   |z   |z   |z   |z   |z   }t          |           }|||||f||||||f||||f|fS )a  Calculates values of contingency table marginals from its values.
        QuadgramAssocMeasures._marginals(1, 0, 2, 46, 552, 825, 2577, 34967, 1, 0, 2, 48, 7250, 9031, 28585, 356653)
        (1, (2, 553, 3, 1), (7804, 6, 3132, 1378, 49, 2), (38970, 17660, 100, 38970), 440540)
        r   ) r&   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r;   s                                    r   r'   z QuadgramAssocMeasures._marginals  s   . #	
 &&&&&6)F2&6)F2&6)F2&6)F2&6)F2&6)F2&6)F2V;fDvMPVV&6)F2V;fDvMPVV&6)F2V;fDvMPVV&6)F2V;fDvMPVVK   VVV,VVVVV<VVV,
 	
r   Nr   r   r   r   r   r   9  sZ         ( 
B9
 9
 \9
v 1
 1
 \1
 1
 1
r   r   c                   .    e Zd ZdZd Zed             ZdS )ContingencyMeasureszWraps NgramAssocMeasures classes such that the arguments of association
    measures are contingency table values rather than marginals.
    c                     d|j         j        z   | j         _        t          |          D ]d}|                    d          rt	          ||          }|                    d          s|                     ||          }t          | ||           edS )zAConstructs a ContingencyMeasures given a NgramAssocMeasures classContingency___N)	__class__ra   dir
startswithgetattr_make_contingency_fnsetattr)selfmeasureskvs       r   __init__zContingencyMeasures.__init__  s    "/(2D2M"MX 	  	 A||D!! !$$A<<$$ ;--h::D!Q	  	 r   c                 F      fd}j         |_         j        |_        |S )zFrom an association measure function, produces a new function which
        accepts contingency table values as its arguments.
        c                        j         |   S r   )r'   )r&   r   old_fns    r   resz5ContingencyMeasures._make_contingency_fn.<locals>.res  s    6.8.<==r   )rd   ra   )r   r   r   s   `` r   r   z(ContingencyMeasures._make_contingency_fn  s;    	> 	> 	> 	> 	> 	> n
r   N)ra   rb   rc   rd   r   re   r   r   r   r   r   r     sH         	  	  	  
 
 \
 
 
r   r   )rd   mathr	   abcr   r   	functoolsr   rV   logrZ   r:   rD   scipy.statsr   ImportErrorr?   rC   r@   r   ri   r   r   r   r   r   r   <module>r      s        ' ' ' ' ' ' ' '      i22	"((((((( " " "" " " " "" 	
 ) 7
 9t( t( t( t( t(7 t( t( t( t(nV( V( V( V( V(, V( V( V(r9
 9
 9
 9
 9
- 9
 9
 9
xE
 E
 E
 E
 E
. E
 E
 E
P         s   , 77