
    Qd5                     P    d Z 	 ddlZn# e$ r Y nw xY wddZd Zd Zdd
ZddZdS )a  
Text Segmentation Metrics

1. Windowdiff

Pevzner, L., and Hearst, M., A Critique and Improvement of
  an Evaluation Metric for Text Segmentation,
  Computational Linguistics 28, 19-36


2. Generalized Hamming Distance

Bookstein A., Kulyukin V.A., Raita T.
Generalized Hamming Distance
Information Retrieval 5, 2002, pp 353-375

Baseline implementation in C++
http://digital.cs.usu.edu/~vkulyukin/vkweb/software/ghd/ghd.html

Study describing benefits of Generalized Hamming Distance Versus
WindowDiff for evaluating text segmentation tasks
Begsten, Y.  Quel indice pour mesurer l'efficacite en segmentation de textes ?
TALN 2009


3. Pk text segmentation metric

Beeferman D., Berger A., Lafferty J. (1999)
Statistical Models for Text Segmentation
Machine Learning, 34, 177-210
    N1Fc                    t          |           t          |          k    rt          d          |t          |           k    rt          d          d}t          t          |           |z
  dz             D ]k}t          | |||z                                |          ||||z                                |          z
            }|r||z  }X|t          d|          z  }l|t          |           |z
  dz   z  S )aW  
    Compute the windowdiff score for a pair of segmentations.  A
    segmentation is any sequence over a vocabulary of two items
    (e.g. "0", "1"), where the specified boundary value is used to
    mark the edge of a segmentation.

        >>> s1 = "000100000010"
        >>> s2 = "000010000100"
        >>> s3 = "100000010000"
        >>> '%.2f' % windowdiff(s1, s1, 3)
        '0.00'
        >>> '%.2f' % windowdiff(s1, s2, 3)
        '0.30'
        >>> '%.2f' % windowdiff(s2, s3, 3)
        '0.80'

    :param seg1: a segmentation
    :type seg1: str or list
    :param seg2: a segmentation
    :type seg2: str or list
    :param k: window width
    :type k: int
    :param boundary: boundary value
    :type boundary: str or int or bool
    :param weighted: use the weighted variant of windowdiff
    :type weighted: boolean
    :rtype: float
    z!Segmentations have unequal lengthzCWindow width k should be smaller or equal than segmentation lengthsr            ?)len
ValueErrorrangeabscountmin)seg1seg2kboundaryweightedwdindiffs           9lib/python3.11/site-packages/nltk/metrics/segmentation.py
windowdiffr   1   s   < 4yyCII<===3t99}}Q
 
 	
 
B3t99q=1$%%    DQUO))(33d1q1u9o6K6KH6U6UUVV 	 %KBB#a--BBTQ$%%    c                     t          j        | |f          }|t          j        |          z  |dd d f<   |t          j        |           z  |d d df<   |S )Nr   )npemptyarange)nrowsncolsins_costdel_costmats        r   	_init_matr!   b   sZ    
(E5>
"
"C29U+++C111I29U+++C1IJr   c                 @   t          |          D ]\  }}t          |          D ]x\  }}	|t          ||	z
            z  | ||f         z   }
||	k    r| ||f         }n'||	k    r|| ||dz   f         z   }n|| |dz   |f         z   }t          ||
          | |dz   |dz   f<   yd S )Nr   )	enumerater
   r   )r    rowvcolvr   r   shift_cost_coeffr   rowijcolj
shift_costtcosts               r   _ghd_auxr,   i   s    T?? 7 74  	7 	7GAt)Ct,<,<<s1a4yHJt||AqD	 3q!a%x=0 !3q1uax=0 #E: 6 6CAq1u	77 7r          @r   c                 t   fdt          |           D             }fdt          |          D             }t          |          }t          |          }	|dk    r|	dk    rdS |dk    r|	dk    r||z  S |dk    r|	dk    r|	|z  S t          |	dz   |dz   ||          }
t          |
|||||           |
d         S )av  
    Compute the Generalized Hamming Distance for a reference and a hypothetical
    segmentation, corresponding to the cost related to the transformation
    of the hypothetical segmentation into the reference segmentation
    through boundary insertion, deletion and shift operations.

    A segmentation is any sequence over a vocabulary of two items
    (e.g. "0", "1"), where the specified boundary value is used to
    mark the edge of a segmentation.

    Recommended parameter values are a shift_cost_coeff of 2.
    Associated with a ins_cost, and del_cost equal to the mean segment
    length in the reference segmentation.

        >>> # Same examples as Kulyukin C++ implementation
        >>> ghd('1100100000', '1100010000', 1.0, 1.0, 0.5)
        0.5
        >>> ghd('1100100000', '1100000001', 1.0, 1.0, 0.5)
        2.0
        >>> ghd('011', '110', 1.0, 1.0, 0.5)
        1.0
        >>> ghd('1', '0', 1.0, 1.0, 0.5)
        1.0
        >>> ghd('111', '000', 1.0, 1.0, 0.5)
        3.0
        >>> ghd('000', '111', 1.0, 2.0, 0.5)
        6.0

    :param ref: the reference segmentation
    :type ref: str or list
    :param hyp: the hypothetical segmentation
    :type hyp: str or list
    :param ins_cost: insertion cost
    :type ins_cost: float
    :param del_cost: deletion cost
    :type del_cost: float
    :param shift_cost_coeff: constant used to compute the cost of a shift.
        ``shift cost = shift_cost_coeff * |i - j|`` where ``i`` and ``j``
        are the positions indicating the shift
    :type shift_cost_coeff: float
    :param boundary: boundary value
    :type boundary: str or int or bool
    :rtype: float
    c                 &    g | ]\  }}|k    |S  r0   .0r   valr   s      r   
<listcomp>zghd.<locals>.<listcomp>   "    CCCXa3(??q???r   c                 &    g | ]\  }}|k    |S r0   r0   r1   s      r   r4   zghd.<locals>.<listcomp>   r5   r   r   g        r   )r7   )r#   r   r!   r,   )refhypr   r   r&   r   ref_idxhyp_idx
nref_bound
nhyp_boundr    s        `     r   ghdr>   y   s    \ DCCC3CCCGCCCC3CCCGWJWJQ:??s	aJ!OOH$$	qZ!^^H$$
JNJNHh
G
GCS'7Hh8HIIIv;r   c                    |Bt          t          t          |           |                     |          dz  z                      }d}t	          t          |           |z
  dz             D ]U}| |||z                                |          dk    }||||z                                |          dk    }||k    r|dz  }V|t          |           |z
  dz   z  S )a  
    Compute the Pk metric for a pair of segmentations A segmentation
    is any sequence over a vocabulary of two items (e.g. "0", "1"),
    where the specified boundary value is used to mark the edge of a
    segmentation.

    >>> '%.2f' % pk('0100'*100, '1'*400, 2)
    '0.50'
    >>> '%.2f' % pk('0100'*100, '0'*400, 2)
    '0.50'
    >>> '%.2f' % pk('0100'*100, '0100'*100, 2)
    '0.00'

    :param ref: the reference segmentation
    :type ref: str or list
    :param hyp: the segmentation to evaluate
    :type hyp: str or list
    :param k: window size, if None, set to half of the average reference segment length
    :type boundary: str or int or bool
    :param boundary: boundary value
    :type boundary: str or int or bool
    :rtype: float
    Nr-   r   r   r   )introundr   r   r	   )r8   r9   r   r   errr   rhs           r   pkrE      s    2 	yc#hh#))H"5"5";<==>>
C3s88a<!#$$  AE	N  **Q.AE	N  **Q.661HC#c((Q,$%%r   )r   F)r-   r-   r   r   )Nr   )	__doc__numpyr   ImportErrorr   r!   r,   r>   rE   r0   r   r   <module>rI      s    @	 	 	 	D	+& +& +& +&b  7 7 7 = = = =F"& "& "& "& "& "&s   	 