
    c+                         d Z ddlmZ ddlZddlZddlmZ ddlZddl	m
Z
mZmZmZ ddlmZmZmZ ddlmZ dadada G d d	ej                  Zed
k    r( ej        dej                    ej                     dS dS )a  
Automated test to reproduce the results of Lee et al. (2005)

Lee et al. (2005) compares different models for semantic
similarity and verifies the results with similarity judgements from humans.

As a validation of the gensim implementation we reproduced the results
of  Lee et al. (2005) in this test.

Many thanks to Michael D. Lee (michael.lee@adelaide.edu.au) who provideded us
with his corpus and similarity data.

If you need to reference this dataset, please cite:

Lee, M., Pincombe, B., & Welsh, M. (2005).
An empirical evaluation of models of text document similarity.
Proceedings of the 27th Annual Conference of the Cognitive Science Society
    )with_statementN)partial)corporamodelsutilsmatutils)preprocess_documentspreprocess_stringDEFAULT_FILTERS)datapathc                        e Zd Zd Zd Zd ZdS )TestLeeTestc                 6   t          d          }t          d          }t          d          }t          t          j        d          t          j        |d          5 }t          fd|D                       addd           n# 1 swxY w Y   t          j        |d          5 }t          fd	|D                       addd           n# 1 swxY w Y   t          j        |d          5 }fd
|D             addd           n# 1 swxY w Y   t          j        |d          5 }fd|D             a	ddd           n# 1 swxY w Y   t          j        |          }t          j        |          d         }|t          j        |d                   adS )zsetup lee test corporazlee_background.corzlee.corzsimilarities0-1.txtlatin1)encodingrbc              3   .   K   | ]} |          V  d S N .0liner   s     4lib/python3.11/site-packages/gensim/test/test_lee.py	<genexpr>z$TestLeeTest.setUp.<locals>.<genexpr>7   s+      ,H,HdVVD\\,H,H,H,H,H,H    Nc              3   .   K   | ]} |          V  d S r   r   r   s     r   r   z$TestLeeTest.setUp.<locals>.<genexpr>9   s+      )E)E4&&,,)E)E)E)E)E)Er   c                 ^    g | ])}t           |          t          d d                   *S N)filtersr
   r   r   sr   s     r   
<listcomp>z%TestLeeTest.setUp.<locals>.<listcomp>;   s:    ```YZ+FF1IIsPRs?STTT```r   c                 ^    g | ])}t           |          t          d d                   *S r   r!   r"   s     r   r$   z%TestLeeTest.setUp.<locals>.<listcomp>=   s9    ]]]VW(OCRC<PQQQ]]]r   r      )r   r   r   
to_unicodeopenr	   	bg_corpuscorpus
bg_corpus2corpus2nploadtxtshapetriu_indiceshuman_sim_vector)selfbg_corpus_filecorpus_filesim_filef
sim_matrix
sim_m_sizer   s          @r   setUpzTestLeeTest.setUp,   s    ""677y))122 )H===Z-- 	I,,H,H,H,Ha,H,H,HHHI	I 	I 	I 	I 	I 	I 	I 	I 	I 	I 	I 	I 	I 	I 	IZT** 	Fa))E)E)E)E1)E)E)EEEF	F 	F 	F 	F 	F 	F 	F 	F 	F 	F 	F 	F 	F 	F 	FZ-- 	a````^_```J	a 	a 	a 	a 	a 	a 	a 	a 	a 	a 	a 	a 	a 	a 	aZT** 	^a]]]][\]]]G	^ 	^ 	^ 	^ 	^ 	^ 	^ 	^ 	^ 	^ 	^ 	^ 	^ 	^ 	^ Z))
Xj))!,
%boj!&D&DEsH   BBB'CCC/D

DD*EE	E	c                    d}d}d}|                      t          t                    |           |                      t          t                    |           |                      t          t                    |           dS )z$availability and integrity of corpusi,  2   i  N)assertEquallenr)   r*   r1   )r2   documents_in_bg_corpusdocuments_in_corpuslen_sim_vectors       r   test_corpuszTestLeeTest.test_corpusD   sr    !$ Y)?@@@V&9:::-..?????r   c                    t          j        t                    fdt          D             afdt          D             at	          j        t                    }|t                   }t	          j        |d          }||t                            }t          j        t          t                    t          t                    f          }t          |          D ]4\  }}t          |          D ]\  }}	t          j        ||	          |||f<    5|t          j        t          t                    d                   }
t          j        |
t                    d         }t!          j        d|           |                     |dk               d	S )
zocorrelation with human data > 0.6
        (this is the value which was achieved in the original paper)
        c                 :    g | ]}                     |          S r   doc2bowr   text
dictionarys     r   r$   z(TestLeeTest.test_lee.<locals>.<listcomp>V   s'    DDD$Z''--DDDr   c                 :    g | ]}                     |          S r   rD   rF   s     r   r$   z(TestLeeTest.test_lee.<locals>.<listcomp>W   s'    >>>t*$$T**>>>r      )id2word
num_topicsr&   )r   r&   z!LSI correlation coefficient is %sg333333?N)r   
Dictionaryr)   r*   r   LogEntropyModelLsiModelr-   zerosr=   	enumerater   cossimr0   corrcoefr1   logginginfo
assertTrue)r2   log_entbg_corpus_entlsi
corpus_lsiresipar1jpar2flatcorrH   s               @r   test_leezTestLeeTest.test_leeM   sc    '	22
DDDD)DDD	>>>>v>>> (33	* omZCPPP)
 hFS[[122 ,, 	8 	8GAt$Z00 8 84$OD$77AqD		82?3v;;223k$ 011$78#>>>c	"""""r   N)__name__
__module____qualname__r9   rA   rb   r   r   r   r   r   +   sG        F F F0@ @ @# # # # #r   r   __main__z)%(asctime)s : %(levelname)s : %(message)s)formatlevel)__doc__
__future__r   rT   unittest	functoolsr   numpyr-   gensimr   r   r   r   gensim.parsing.preprocessingr	   r
   r   gensim.test.utilsr   r)   r*   r1   TestCaser   rc   basicConfigDEBUGmainr   r   r   <module>ru      s4   & & % % % % %             3 3 3 3 3 3 3 3 3 3 3 3 a a a a a a a a a a & & & & & &		 @# @# @# @# @#(# @# @# @#F z GJRYR_````HMOOOOO r   