
    cF                         d Z ddlZddlmZ ddlZddlmZ  G d d          Z G d dej	                  Z
 G d	 d
ej	                  ZdS )aA  Produce a translation matrix to translate words from one language to another, using either
a standard nearest neighbour method or a globally corrected neighbour retrieval method [1]_.

This method can be used to augment the existing phrase tables with more candidate translations, or
filter out errors from the translation tables and known dictionaries [2]_. What's more, it also works
for any two sets of named-vectors where there are some paired-guideposts to learn the transformation.

Examples
--------

How to make translation between two set of word-vectors
=======================================================

Initialize two word-vector models

.. sourcecode:: pycon

    >>> from gensim.models import KeyedVectors
    >>> from gensim.test.utils import datapath
    >>>
    >>> model_en = KeyedVectors.load_word2vec_format(datapath("EN.1-10.cbow1_wind5_hs0_neg10_size300_smpl1e-05.txt"))
    >>> model_it = KeyedVectors.load_word2vec_format(datapath("IT.1-10.cbow1_wind5_hs0_neg10_size300_smpl1e-05.txt"))

Define word pairs (that will be used for construction of translation matrix)

.. sourcecode:: pycon

    >>> word_pairs = [
    ...     ("one", "uno"), ("two", "due"), ("three", "tre"), ("four", "quattro"), ("five", "cinque"),
    ...     ("seven", "sette"), ("eight", "otto"),
    ...     ("dog", "cane"), ("pig", "maiale"), ("fish", "cavallo"), ("birds", "uccelli"),
    ...     ("apple", "mela"), ("orange", "arancione"), ("grape", "acino"), ("banana", "banana")
    ... ]

Fit :class:`~gensim.models.translation_matrix.TranslationMatrix`

.. sourcecode:: pycon

    >>> trans_model = TranslationMatrix(model_en, model_it, word_pairs=word_pairs)

Apply model (translate words "dog" and "one")

.. sourcecode:: pycon

    >>> trans_model.translate(["dog", "one"], topn=3)
    OrderedDict([('dog', [u'cane', u'gatto', u'cavallo']), ('one', [u'uno', u'due', u'tre'])])


Save / load model

.. sourcecode:: pycon

    >>> with temporary_file("model_file") as fname:
    ...     trans_model.save(fname)  # save model to file
    ...     loaded_trans_model = TranslationMatrix.load(fname)  # load model


How to make translation between two :class:`~gensim.models.doc2vec.Doc2Vec` models
==================================================================================

Prepare data and models

.. sourcecode:: pycon

    >>> from gensim.test.utils import datapath
    >>> from gensim.test.test_translation_matrix import read_sentiment_docs
    >>> from gensim.models import Doc2Vec
    >>>
    >>> data = read_sentiment_docs(datapath("alldata-id-10.txt"))[:5]
    >>> src_model = Doc2Vec.load(datapath("small_tag_doc_5_iter50"))
    >>> dst_model = Doc2Vec.load(datapath("large_tag_doc_10_iter50"))

Train backward translation

.. sourcecode:: pycon

    >>> model_trans = BackMappingTranslationMatrix(data, src_model, dst_model)
    >>> trans_matrix = model_trans.train(data)


Apply model

.. sourcecode:: pycon

    >>> result = model_trans.infer_vector(dst_model.dv[data[3].tags])


References
----------
.. [1] Dinu, Georgiana, Angeliki Lazaridou, and Marco Baroni. "Improving zero-shot learning by mitigating the
       hubness problem", https://arxiv.org/abs/1412.6568
.. [2] Tomas Mikolov, Ilya Sutskever, Kai Chen, Greg Corrado, and Jeffrey Dean.
       "Distributed Representations of Words and Phrases and their Compositionality", https://arxiv.org/abs/1310.4546

    N)OrderedDict)utilsc                   6    e Zd ZdZd Zedd            Zd ZdS )Spacez3An auxiliary class for storing the the words space.c                 x    || _         || _        i | _        t          | j                  D ]\  }}|| j        |<   dS )z
        Parameters
        ----------
        matrix : iterable of numpy.ndarray
            Matrix that contains word-vectors.
        index2word : list of str
            Words which correspond to the `matrix`.

        N)mat
index2word
word2index	enumerate)selfmatrixr	   idxwords        @lib/python3.11/site-packages/gensim/models/translation_matrix.py__init__zSpace.__init__o   sQ     $ "4?33 	( 	(IC$'DOD!!	( 	(    Nc                 n   g }g }|N|D ]J}|                     |           |                     |j        |                    |                              KnR|j        D ]J}|                     |           |                     |j        |                    |                              Kt	          ||          S )a  Construct a space class for the lexicon, if it's provided.

        Parameters
        ----------
        lang_vec : :class:`~gensim.models.keyedvectors.KeyedVectors`
            Model from which the vectors will be extracted.
        lexicon : list of str, optional
            Words which contains in the `lang_vec`, if `lexicon = None`, the lexicon is all the lang_vec's word.

        Returns
        -------
        :class:`~gensim.models.translation_matrix.Space`
            Object that stored word-vectors

        )appendvectors	get_indexindex_to_keyr   )clslang_veclexiconwordsr   items         r   buildzSpace.build   s    &  		G G GT"""

8+H,>,>t,D,DEFFFFG
 !- G GT"""

8+H,>,>t,D,DEFFFFS%   r   c                     | j         t          j        t          j        t          j        | j                   dd                    z  | _         dS )z"Normalize the word vectors matrix.   T)axiskeepdimsN)r   npsqrtsumsquare)r   s    r   	normalizezSpace.normalize   s<    8bgbfRYtx-@-@qSW&X&X&XYYYr   )N)__name__
__module____qualname____doc__r   classmethodr   r&    r   r   r   r   l   s^        ==( ( ($ ! ! ! [!BZ Z Z Z Zr   r   c                   >     e Zd ZdZd
dZd Z fdZd Zdd	Z xZ	S )TranslationMatrixa0  Objects of this class realize the translation matrix which maps the source language to the target language.
    The main methods are:

    We map it to the other language space by computing z = Wx, then return the
    word whose representation is close to z.

    For details on use, see the tutorial notebook [3]_

    Examples
    --------
    .. sourcecode:: pycon

        >>> from gensim.models import KeyedVectors
        >>> from gensim.test.utils import datapath
        >>> en = datapath("EN.1-10.cbow1_wind5_hs0_neg10_size300_smpl1e-05.txt")
        >>> it = datapath("IT.1-10.cbow1_wind5_hs0_neg10_size300_smpl1e-05.txt")
        >>> model_en = KeyedVectors.load_word2vec_format(en)
        >>> model_it = KeyedVectors.load_word2vec_format(it)
        >>>
        >>> word_pairs = [
        ...     ("one", "uno"), ("two", "due"), ("three", "tre"), ("four", "quattro"), ("five", "cinque"),
        ...     ("seven", "sette"), ("eight", "otto"),
        ...     ("dog", "cane"), ("pig", "maiale"), ("fish", "cavallo"), ("birds", "uccelli"),
        ...     ("apple", "mela"), ("orange", "arancione"), ("grape", "acino"), ("banana", "banana")
        ... ]
        >>>
        >>> trans_model = TranslationMatrix(model_en, model_it)
        >>> trans_model.train(word_pairs)
        >>> trans_model.translate(["dog", "one"], topn=3)
        OrderedDict([('dog', [u'cane', u'gatto', u'cavallo']), ('one', [u'uno', u'due', u'tre'])])


    References
    ----------
    .. [3] https://github.com/RaRe-Technologies/gensim/blob/3.2.0/docs/notebooks/translation_matrix.ipynb

    Nc                    d| _         d| _        || _        || _        t	          j        |          | _        d| _        d| _        d| _	        |?t          |d                   dk    rt          d          |                     |           dS dS )a  
        Parameters
        ----------
        source_lang_vec : :class:`~gensim.models.keyedvectors.KeyedVectors`
            Word vectors for source language.
        target_lang_vec : :class:`~gensim.models.keyedvectors.KeyedVectors`
            Word vectors for target language.
        word_pairs : list of (str, str), optional
            Pairs of words that will be used for training.
        random_state : {None, int, array_like}, optional
            Seed for random state.

        Nr      zBEach training data item must contain two different language words.)source_wordtarget_wordsource_lang_vectarget_lang_vecr   get_random_staterandom_statetranslation_matrixsource_spacetarget_spacelen
ValueErrortrain)r   r3   r4   
word_pairsr6   s        r   r   zTranslationMatrix.__init__   s      ..!2<@@"&   	#:a=!!Q& g !efffJJz"""""	# 	#r   c                 h    t          | \   _         _        t                               j        t           j                             _        t                               j        t           j                             _	         j        
                                  j	        
                                  j        j         fd j        D             ddf         } j	        j         fd j        D             ddf         }t          j                            ||d          d          _        dS )zBuild the translation matrix to map from source space to target space.

        Parameters
        ----------
        word_pairs : list of (str, str), optional
            Pairs of words that will be used for training.

        c                 4    g | ]}j         j        |         S r,   )r8   r
   .0r   r   s     r   
<listcomp>z+TranslationMatrix.train.<locals>.<listcomp>   $    #d#d#d4D$5$@$F#d#d#dr   Nc                 4    g | ]}j         j        |         S r,   )r9   r
   r@   s     r   rB   z+TranslationMatrix.train.<locals>.<listcomp>   rC   r   r   )zipr1   r2   r   r   r3   setr8   r4   r9   r&   r   r"   linalglstsqr7   )r   r=   m1m2s   `   r   r<   zTranslationMatrix.train   s    .1*-=*$*!KK(<c$BR>S>STT!KK(<c$BR>S>STT##%%%##%%%"#d#d#d#dSWSc#d#d#dfgfgfg#gh"#d#d#d#dSWSc#d#d#dfgfgfg#gh"$)//"b""="=a"@r   c                 ~    |                     dddg          |d<    t          t          |           j        |i | dS )zcSave the model to a file. Ignores (doesn't store) the `source_space` and `target_space` attributes.ignorer8   r9   N)getsuperr.   save)r   argskwargs	__class__s      r   rP   zTranslationMatrix.save  sI    !::h0PQQx+&&+T<V<<<<<r   c                 f    t          t          j        |j        | j                  |j                  S )a  Map the source word vector to the target word vector using translation matrix.

        Parameters
        ----------
        words_space : :class:`~gensim.models.translation_matrix.Space`
            `Space` object constructed for the words to be translated.

        Returns
        -------
        :class:`~gensim.models.translation_matrix.Space`
            `Space` object constructed for the mapped words.

        )r   r"   dotr   r7   r	   )r   words_spaces     r   apply_transmatz TranslationMatrix.apply_transmat  s(     RVKOT-DEE{G]^^^r      r   c                 "   t          |t                    r|g}|t          j        d           | j        }|t          j        d           | j        }|r|t          d          t          |j                  }t          |t          |          t          |          z
            }| j                            t          |                    |                    |          }t                              |t          |                              t          |                              }	nt                              ||          }	t                              |          }
|	                                 |
                                 |                     |	          }t)          j        |
j        |j        j                   }|rDt)          j        t)          j        |d          d          }t)          j        ||z   d          }nt)          j        |d          }t3                      }t5          |          D ]S\  }}g }t7          |          D ]7}|||	j        |         f         }|                    |
j        |                    8|||<   T|S )a  Translate the word from the source language to the target language.

        Parameters
        ----------
        source_words : {str, list of str}
            Single word or a list of words to be translated
        topn : int, optional
            Number of words that will be returned as translation for each `source_words`
        gc : int, optional
            Define translation algorithm, if `gc == 0` - use standard NN retrieval,
            otherwise, use globally corrected neighbour retrieval method (as described in [1]_).
        sample_num : int, optional
            Number of words to sample from the source lexicon, if `gc == 1`, then `sample_num` **must** be provided.
        source_lang_vec : :class:`~gensim.models.keyedvectors.KeyedVectors`, optional
            New source language vectors for translation, by default, used the model's source language vector.
        target_lang_vec : :class:`~gensim.models.keyedvectors.KeyedVectors`, optional
            New target language vectors for translation, by default, used the model's target language vector.

        Returns
        -------
        :class:`collections.OrderedDict`
            Ordered dict where each item is `word`: [`translated_word_1`, `translated_word_2`, ...]

        NzfThe parameter source_lang_vec isn't specified, use the model's source language word vector as default.zfThe parameter target_lang_vec isn't specified, use the model's target language word vector as default.zWhen using the globally corrected neighbour retrieval method, the `sample_num` parameter(i.e. the number of words sampled from source space) must be provided.r   )r    r   )
isinstancestrwarningswarnr3   r4   RuntimeErrorrG   r   minr:   r6   choicelist
differencer   r   unionr&   rW   r"   rU   r   Targsortr   r   ranger
   r   r	   )r   source_wordstopngc
sample_numr3   r4   r   additionr8   r9   mapped_source_space
sim_matrixsrtd_idxsim_matrix_idxtranslated_wordr   r   translated_target_wordjmap_space_ids                        r   	translatezTranslationMatrix.translate  s   2 lC(( 	*(>L  	3MJ   #2O 	3MJ   #2O  	F "w   /677G:s7||c,6G6G'GHHH'..tG4F4F|4T4T/U/UW_``G ;;L8I8I8O8OPST[P\P\8]8]^^LL ;;EEL{{?55 	       #11,?? f\-/B/F/HIII
  	<z"*Za"@"@"@qIIIHZ:(=AFFFNNZ
;;;N &--"<00 	; 	;IC%'"4[[ U U-a1H1N.NO&--l.El.STTTT$:OD!!r   NN)rX   r   NNN)
r'   r(   r)   r*   r   r<   rP   rW   rt   __classcell__)rS   s   @r   r.   r.      s        $ $J# # # #<A A A,= = = = =
_ _ _ V V V V V V V Vr   r.   c                   &    e Zd ZdZddZd Zd ZdS )BackMappingTranslationMatrixa  Realize the BackMapping translation matrix which maps the source model's document vector
    to the target model's document vector (old model).

    BackMapping translation matrix is used to learn a mapping for two document vector spaces which we
    specify as source document vector and target document vector. The target document vectors are trained
    on a superset corpus of source document vectors; we can incrementally increase the vector in
    the old model through the BackMapping translation matrix.

    For details on use, see the tutorial notebook [3]_.

    Examples
    --------
    .. sourcecode:: pycon

        >>> from gensim.test.utils import datapath
        >>> from gensim.test.test_translation_matrix import read_sentiment_docs
        >>> from gensim.models import Doc2Vec, BackMappingTranslationMatrix
        >>>
        >>> data = read_sentiment_docs(datapath("alldata-id-10.txt"))[:5]
        >>> src_model = Doc2Vec.load(datapath("small_tag_doc_5_iter50"))
        >>> dst_model = Doc2Vec.load(datapath("large_tag_doc_10_iter50"))
        >>>
        >>> model_trans = BackMappingTranslationMatrix(src_model, dst_model)
        >>> trans_matrix = model_trans.train(data)
        >>>
        >>> result = model_trans.infer_vector(dst_model.dv[data[3].tags])

    Nc                     || _         || _        || _        t          j        |          | _        d| _        ||                     |           dS dS )a_  

        Parameters
        ----------
        source_lang_vec : :class:`~gensim.models.doc2vec.Doc2Vec`
            Source Doc2Vec model.
        target_lang_vec : :class:`~gensim.models.doc2vec.Doc2Vec`
            Target Doc2Vec model.
        tagged_docs : list of :class:`~gensim.models.doc2vec.TaggedDocument`, optional.
            Documents that will be used for training, both the source language document vector and
            target language document vector trained on those tagged documents.
        random_state : {None, int, array_like}, optional
            Seed for random state.

        N)tagged_docsr3   r4   r   r5   r6   r7   r<   )r   r3   r4   rz   r6   s        r   r   z%BackMappingTranslationMatrix.__init__  sa      '..!2<@@"& 	$JJ{#####	$ 	$r   c                       fd|D             } fd|D             }t           j                            ||d          d          _         j        S )a)  Build the translation matrix to map from the source model's vectors to target model's vectors

        Parameters
        ----------
        tagged_docs : list of :class:`~gensim.models.doc2vec.TaggedDocument`, Documents
            that will be used for training, both the source language document vector and
            target language document vector trained on those tagged documents.

        Returns
        -------
        numpy.ndarray
            Translation matrix that maps from the source model's vectors to target model's vectors.

        c                 b    g | ]+}j         j        |j                                                 ,S r,   )r3   dvtagsflattenr@   s     r   rB   z6BackMappingTranslationMatrix.train.<locals>.<listcomp>  2    SSStd"%di088::SSSr   c                 b    g | ]+}j         j        |j                                                 ,S r,   )r4   r}   r~   r   r@   s     r   rB   z6BackMappingTranslationMatrix.train.<locals>.<listcomp>  r   r   rE   r   )r"   rH   rI   r7   )r   rz   rJ   rK   s   `   r   r<   z"BackMappingTranslationMatrix.train  s`     TSSS{SSSSSSS{SSS"$)//"b""="=a"@&&r   c                 6    t          j        || j                  S )a|  Translate the target model's document vector to the source model's document vector

        Parameters
        ----------
        target_doc_vec : numpy.ndarray
            Document vector from the target document, whose document are not in the source model.

        Returns
        -------
        numpy.ndarray
            Vector `target_doc_vec` in the source model.

        )r"   rU   r7   )r   target_doc_vecs     r   infer_vectorz)BackMappingTranslationMatrix.infer_vector  s     vnd&=>>>r   ru   )r'   r(   r)   r*   r   r<   r   r,   r   r   rx   rx   p  sP         8$ $ $ $4' ' '*? ? ? ? ?r   rx   )r*   r\   collectionsr   numpyr"   gensimr   r   SaveLoadr.   rx   r,   r   r   <module>r      s   ^ ^@  # # # # # #          9Z 9Z 9Z 9Z 9Z 9Z 9Z 9ZxE E E E E E E EPZ? Z? Z? Z? Z?5> Z? Z? Z? Z? Z?r   