
"""Online Non-Negative Matrix Factorization.
Implementation of the efficient incremental algorithm of Renbo Zhao, Vincent Y. F. Tan et al.
`[PDF] <https://arxiv.org/abs/1604.02634>`_.

This NMF implementation updates in a streaming fashion and works best with sparse corpora.

- W is a word-topic matrix
- h is a topic-document matrix
- v is an input corpus batch, word-document matrix
- A, B - matrices that accumulate information from every consecutive chunk. A = h.dot(ht), B = v.dot(ht).

The idea of the algorithm is as follows:

.. code-block:: text

    Initialize W, A and B matrices

    Input the corpus
    Split the corpus into batches

    for v in batches:
        infer h:
            do a coordinate gradient descent step to find h that minimizes the (v - Wh) l2 norm

            bound h so that it is non-negative

        update A and B:
            A = h.dot(ht)
            B = v.dot(ht)

        update W:
            do a gradient descent step to find W that minimizes 0.5*trace(WtWA) - trace(WtB)

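In dense NumPy terms, one batch update can be sketched roughly as follows. This is an
illustrative toy only, not the actual implementation: it assumes dense matrices, a synthetic
batch ``v``, a single update step per batch, and a least-squares solve for ``h`` where the
real code runs iterative coordinate descent on sparse input:

.. code-block:: python

    import numpy as np

    rng = np.random.default_rng(42)
    n_tokens, n_topics, batch_size = 1000, 10, 64

    W = np.abs(rng.standard_normal((n_tokens, n_topics)))  # word-topic matrix
    A = np.zeros((n_topics, n_topics))                      # accumulates h.dot(h.T)
    B = np.zeros((n_tokens, n_topics))                      # accumulates v.dot(h.T)

    v = np.abs(rng.standard_normal((n_tokens, batch_size)))  # one (dense) batch

    # infer h: fit v ~ W.dot(h), then bound h so that it is non-negative
    h = np.linalg.pinv(W).dot(v).clip(min=0)

    # update the accumulator matrices
    A += h.dot(h.T)
    B += v.dot(h.T)

    # the gradient of 0.5*trace(WtWA) - trace(WtB) w.r.t. W is (W.dot(A) - B);
    # take one projected gradient step and keep W non-negative
    eta = 1.0 / np.linalg.norm(A)
    W = (W - eta * (W.dot(A) - B)).clip(min=0)
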
Examples
--------

Train an NMF model using a Gensim corpus

.. sourcecode:: pycon

    >>> from gensim.models import Nmf
    >>> from gensim.test.utils import common_texts
    >>> from gensim.corpora.dictionary import Dictionary
    >>>
    >>> # Create a corpus from a list of texts
    >>> common_dictionary = Dictionary(common_texts)
    >>> common_corpus = [common_dictionary.doc2bow(text) for text in common_texts]
    >>>
    >>> # Train the model on the corpus.
    >>> nmf = Nmf(common_corpus, num_topics=10)

Save a model to disk, or reload a pre-trained model

.. sourcecode:: pycon

    >>> from gensim.test.utils import datapath
    >>>
    >>> # Save model to disk.
    >>> temp_file = datapath("model")
    >>> nmf.save(temp_file)
    >>>
    >>> # Load a potentially pretrained model from disk.
    >>> nmf = Nmf.load(temp_file)

Infer vectors for new documents

.. sourcecode:: pycon

    >>> # Create a new corpus, made of previously unseen documents.
    >>> other_texts = [
    ...     ['computer', 'time', 'graph'],
    ...     ['survey', 'response', 'eps'],
    ...     ['human', 'system', 'computer']
    ... ]
    >>> other_corpus = [common_dictionary.doc2bow(text) for text in other_texts]
    >>>
    >>> unseen_doc = other_corpus[0]
    >>> vector = nmf[unseen_doc]  # get topic probability distribution for a document

Update the model by incrementally training on the new corpus

.. sourcecode:: pycon

    >>> nmf.update(other_corpus)
    >>> vector = nmf[unseen_doc]

A lot of parameters can be tuned to optimize training for your specific case

.. sourcecode:: pycon

    >>> nmf = Nmf(common_corpus, num_topics=50, kappa=0.1, eval_every=5)  # decrease training step size

Use NMF whenever you need an extremely fast and memory-optimized topic model.

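Topics of a trained model can also be scored by coherence, e.g. to compare different
parameter settings (a usage sketch, assuming the `nmf` model and `common_corpus` from the
examples above):

.. sourcecode:: pycon

    >>> top = nmf.top_topics(common_corpus)  # list of (topic, coherence score) pairs, best first
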
    N)halfnorm)
interfaces)matutils)utilsTransformedCorpus)	basemodelCoherenceModel)solve_h   c                 |    t          t          t          |                     d          d |                             S )N.)tuplemapintsplit)versionprefixs     1lib/python3.11/site-packages/gensim/models/nmf.pyversion_tupler   s   s/    S'--,,WfW566777    )r      c                       e Zd ZdZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 d#dZd$dZd$dZd%dZd&dZd&dZ		 	 d'dZ
d(dZ	 	 d(dZd Zd Zd)dZd Zd$dZd  Zed!             Zd(d"ZdS )*NmfzOnline Non-Negative Matrix Factorization.

    `Renbo Zhao et al :"Online Nonnegative Matrix Factorization with Outliers" <https://arxiv.org/abs/1604.02634>`_

    """

    def __init__(
            self, corpus=None, num_topics=100, id2word=None,
            chunksize=2000, passes=1, kappa=1.0, minimum_probability=0.01,
            w_max_iter=200, w_stop_condition=1e-4, h_max_iter=50,
            h_stop_condition=1e-3, eval_every=10, normalize=True, random_state=None,
    ):
        """

        Parameters
        ----------
        corpus : iterable of list of (int, float) or `csc_matrix` with the shape (n_tokens, n_documents), optional
            Training corpus.
            Can be either iterable of documents, which are lists of `(word_id, word_count)`,
            or a sparse csc matrix of BOWs for each document.
            If not specified, the model is left uninitialized (presumably, to be trained later with `self.update()`).
        num_topics : int, optional
            Number of topics to extract.
        id2word: {dict of (int, str), :class:`gensim.corpora.dictionary.Dictionary`}
            Mapping from word IDs to words. It is used to determine the vocabulary size, as well as for
            debugging and topic printing.
        chunksize: int, optional
            Number of documents to be used in each training chunk.
        passes: int, optional
            Number of full passes over the training corpus.
            Leave at default `passes=1` if your input is an iterator.
        kappa : float, optional
            Gradient descent step size.
            Larger value makes the model train faster, but could lead to non-convergence if set too large.
        minimum_probability: float, optional
            If `normalize` is True, topics with smaller probabilities are filtered out.
            If `normalize` is False, topics with smaller factors are filtered out.
            If set to None, a value of 1e-8 is used to prevent 0s.
        w_max_iter: int, optional
            Maximum number of iterations to train W per each batch.
        w_stop_condition: float, optional
            If error difference gets less than that, training of ``W`` stops for the current batch.
        h_max_iter: int, optional
            Maximum number of iterations to train h per each batch.
        h_stop_condition: float, optional
            If error difference gets less than that, training of ``h`` stops for the current batch.
        eval_every: int, optional
            Number of batches after which l2 norm of (v - Wh) is computed. Decreases performance if set too low.
        normalize: bool or None, optional
            Whether to normalize the result. Allows for estimation of perplexity, coherence, etc.
        random_state: {np.random.RandomState, int}, optional
            Seed for random generator. Needed for reproducibility.

        """
        self.num_topics = num_topics
        self.id2word = id2word
        self.chunksize = chunksize
        self.passes = passes
        self._kappa = kappa
        self.minimum_probability = minimum_probability
        self._w_max_iter = w_max_iter
        self._w_stop_condition = w_stop_condition
        self._h_max_iter = h_max_iter
        self._h_stop_condition = h_stop_condition
        self.eval_every = eval_every
        self.normalize = normalize
        self.random_state = utils.get_random_state(random_state)

        self.v_max = None

        if self.id2word is None:
            self.id2word = utils.dict_from_corpus(corpus)

        self.num_tokens = len(self.id2word)

        self.A = None
        self.B = None

        self._W = None
        self.w_std = None
        self._w_error = np.inf

        self._h = None

        if corpus is not None:
            self.update(corpus)

    def get_topics(self, normalize=None):
        """Get the term-topic matrix learned during inference.

        Parameters
        ----------
        normalize: bool or None, optional
            Whether to normalize the result. Allows for estimation of perplexity, coherence, etc.

        Returns
        -------
        numpy.ndarray
            The probability for each word in each topic, shape (`num_topics`, `vocabulary_size`).

        """
        dense_topics = self._W.T
        if normalize is None:
            normalize = self.normalize
        if normalize:
            return dense_topics / dense_topics.sum(axis=1).reshape(-1, 1)

        return dense_topics

    def __getitem__(self, bow, eps=None):
        return self.get_document_topics(bow, eps)

    def show_topics(self, num_topics=10, num_words=10, log=False, formatted=True, normalize=None):
        """Get the topics sorted by sparsity.

        Parameters
        ----------
        num_topics : int, optional
            Number of topics to be returned. Unlike LSA, there is no natural ordering between the topics in NMF.
            The returned topics subset of all topics is therefore arbitrary and may change between two NMF
            training runs.
        num_words : int, optional
            Number of words to be presented for each topic. These will be the most relevant words (assigned the highest
            probability for each topic).
        log : bool, optional
            Whether the result is also logged, besides being returned.
        formatted : bool, optional
            Whether the topic representations should be formatted as strings. If False, they are returned as
            2 tuples of (word, probability).
        normalize: bool or None, optional
            Whether to normalize the result. Allows for estimation of perplexity, coherence, etc.

        Returns
        -------
        list of {str, tuple of (str, float)}
            a list of topics, each represented either as a string (when `formatted` == True) or word-probability
            pairs.

        """
        if normalize is None:
            normalize = self.normalize

        # compute the fraction of zero elements in each topic column of W
        sparsity = np.zeros(self._W.shape[1])

        for row in self._W:
            sparsity += (row == 0)

        sparsity /= self._W.shape[0]

        if num_topics < 0 or num_topics >= self.num_topics:
            num_topics = self.num_topics
            chosen_topics = range(num_topics)
        else:
            num_topics = min(num_topics, self.num_topics)

            sorted_topics = list(matutils.argsort(sparsity))
            chosen_topics = sorted_topics[:num_topics // 2] + sorted_topics[-num_topics // 2:]

        shown = []

        topics = self.get_topics(normalize=normalize)

        for i in chosen_topics:
            topic = topics[i]
            bestn = matutils.argsort(topic, num_words, reverse=True).ravel()
            topic = [(self.id2word[id], topic[id]) for id in bestn]
            if formatted:
                topic = ' + '.join('%.3f*"%s"' % (v, k) for k, v in topic)

            shown.append((i, topic))
            if log:
                logger.info("topic #%i (%.3f): %s", i, sparsity[i], topic)

        return shown

    def show_topic(self, topicid, topn=10, normalize=None):
        """Get the representation for a single topic. Words here are the actual strings, in contrast to
        :meth:`~gensim.models.nmf.Nmf.get_topic_terms` that represents words by their vocabulary ID.

        Parameters
        ----------
        topicid : int
            The ID of the topic to be returned
        topn : int, optional
            Number of the most significant words that are associated with the topic.
        normalize: bool or None, optional
            Whether to normalize the result. Allows for estimation of perplexity, coherence, etc.

        Returns
        -------
        list of (str, float)
            Word - probability pairs for the most relevant words generated by the topic.

        """
        if normalize is None:
            normalize = self.normalize

        return [
            (self.id2word[id], value)
            for id, value in self.get_topic_terms(topicid, topn, normalize=normalize)
        ]

    def get_topic_terms(self, topicid, topn=10, normalize=None):
        """Get the representation for a single topic. Words here are the integer IDs, in contrast to
        :meth:`~gensim.models.nmf.Nmf.show_topic` that represents words by the actual strings.

        Parameters
        ----------
        topicid : int
            The ID of the topic to be returned
        topn : int, optional
            Number of the most significant words that are associated with the topic.
        normalize: bool or None, optional
            Whether to normalize the result. Allows for estimation of perplexity, coherence, etc.

        Returns
        -------
        list of (int, float)
            Word ID - probability pairs for the most relevant words generated by the topic.

        """
        topic = self._W[:, topicid]

        if normalize is None:
            normalize = self.normalize
        if normalize:
            topic = topic / topic.sum()

        bestn = matutils.argsort(topic, topn, reverse=True)
        return [(idx, topic[idx]) for idx in bestn]

    def top_topics(self, corpus, texts=None, dictionary=None, window_size=None,
                   coherence='u_mass', topn=20, processes=-1):
        """Get the topics sorted by coherence.

        Parameters
        ----------
        corpus : iterable of list of (int, float) or `csc_matrix` with the shape (n_tokens, n_documents)
            Training corpus.
            Can be either iterable of documents, which are lists of `(word_id, word_count)`,
            or a sparse csc matrix of BOWs for each document.
            If not specified, the model is left uninitialized (presumably, to be trained later with `self.update()`).
        texts : list of list of str, optional
            Tokenized texts, needed for coherence models that use sliding window based (i.e. coherence=`c_something`)
            probability estimator .
        dictionary : {dict of (int, str), :class:`gensim.corpora.dictionary.Dictionary`}, optional
            Dictionary mapping of id word to create corpus.
            If `model.id2word` is present, this is not needed. If both are provided, passed `dictionary` will be used.
        window_size : int, optional
            Is the size of the window to be used for coherence measures using boolean sliding window as their
            probability estimator. For 'u_mass' this doesn't matter.
            If None - the default window sizes are used which are: 'c_v' - 110, 'c_uci' - 10, 'c_npmi' - 10.
        coherence : {'u_mass', 'c_v', 'c_uci', 'c_npmi'}, optional
            Coherence measure to be used.
            Fastest method - 'u_mass', 'c_uci' also known as `c_pmi`.
            For 'u_mass' corpus should be provided, if texts is provided, it will be converted to corpus
            using the dictionary. For 'c_v', 'c_uci' and 'c_npmi' `texts` should be provided (`corpus` isn't needed)
        topn : int, optional
            Integer corresponding to the number of top words to be extracted from each topic.
        processes : int, optional
            Number of processes to use for probability estimation phase, any value less than 1 will be interpreted as
            num_cpus - 1.

        Returns
        -------
        list of (list of (int, str), float)
            Each element in the list is a pair of a topic representation and its coherence score. Topic representations
            are distributions of words, represented as a list of pairs of word IDs and their probabilities.

        """
        cm = CoherenceModel(
            model=self, corpus=corpus, texts=texts, dictionary=dictionary,
            window_size=window_size, coherence=coherence, topn=topn, processes=processes,
        )
        coherence_scores = cm.get_coherence_per_topic()

        str_topics = []
        for topic in self.get_topics():
            bestn = matutils.argsort(topic, topn=topn, reverse=True)
            beststr = [(topic[_id], self.id2word[_id]) for _id in bestn]
            str_topics.append(beststr)

        scored_topics = zip(str_topics, coherence_scores)
        return sorted(scored_topics, key=lambda tup: tup[1], reverse=True)

    def get_term_topics(self, word_id, minimum_probability=None, normalize=None):
        """Get the most relevant topics to the given word.
                  D ]'}||         }||k    r|                    ||f           (|S )a:  Get the most relevant topics to the given word.

        Parameters
        ----------
        word_id : int
            The word for which the topic distribution will be computed.
        minimum_probability : float, optional
            If `normalize` is True, topics with smaller probabilities are filtered out.
            If `normalize` is False, topics with smaller factors are filtered out.
            If set to None, a value of 1e-8 is used to prevent 0s.
        normalize: bool or None, optional
            Whether to normalize the result. Allows for estimation of perplexity, coherence, e.t.c.

        Returns
        -------
        list of (int, float)
            The relevant topics represented as pairs of their ID and their assigned probability, sorted
            by relevance to the given word.

        N:0yE>r   )r+   max
isinstancestrr'   doc2bowr:   r1   rM   rg   r&   rm   )rA   word_idr+   r1   valuesword_topicstopic_id	word_coefs           r   get_term_topicszNmf.get_term_topics  s    *  	;"&":!"5t<< gs## 	<l**G955a8;Ggg& 	'I 	-**Q. 	-;??,,,Ka11 	5 	5H#H-I// 5x3444r   c                    | j         t          d          t          j        |          \  }}|rt	                    } | j        |fi |S t          j        |g| j                  }| 	                    || j
        t          j                  }|| j        }|r|                                }	|	r||	z  }fdt          |dddf                   D             S )a0  Get the topic distribution for the given document.

        Parameters
        ----------
        bow : list of (int, float)
            The document in BOW format.
        minimum_probability : float
            If `normalize` is True, topics with smaller probabilities are filtered out.
            If `normalize` is False, topics with smaller factors are filtered out.
            If set to None, a value of 1e-8 is used to prevent 0s.
        normalize: bool or None, optional
            Whether to normalize the result. Allows for estimation of perplexity, coherence, etc.

        Returns
        -------
        list of (int, float)
            Topic distribution for the whole document. Each element in the list is a pair of a topic's id, and
            the probability that was assigned to it.

        """
        if minimum_probability is None:
            minimum_probability = self.minimum_probability
        minimum_probability = max(minimum_probability, 1e-8)

        # if the input is a whole corpus, return a transformed corpus instead
        is_corpus, corpus = utils.is_corpus(bow)

        if is_corpus:
            kwargs = dict(minimum_probability=minimum_probability)
            return self._apply(corpus, **kwargs)

        v = matutils.corpus2csc([bow], self.num_tokens)
        h = self._solveproj(v, self._W, v_max=np.inf)

        if normalize is None:
            normalize = self.normalize
        if normalize:
            the_sum = h.sum()
            if the_sum:
                h /= the_sum

        return [
            (idx, proba)
            for idx, proba in enumerate(h[:, 0])
            if not minimum_probability or proba > minimum_probability
        ]

    def _setup(self, v):
        """Infer info from the first batch and initialize the matrices.
        t          j        | j        | j        f          | _        t          j        | j        | j        f          | _        dS )zInfer info from the first batch and initialize the matrices.

        Parameters
        ----------
        v : `csc_matrix` with the shape (n_tokens, chunksize)
            Batch of bows.

        )sizer3   N)r<   sqrtmeanr7   r&   r;   absr   rvsr3   r:   re   r8   r9   )rA   rd   s     r   _setupz
Nmf._setup  s     WQVVXX4?)JKLL
&Jlot7dFW  
 
 4?DO<==4?DO<==r   c           
         | j         j        }d}t          |j        | j        j                  D ]D\  }}|t	          j        t	          j        ||                    |          z
                      z  }Et	          j        |          S )Nr   )	r:   rL   r   r?   r<   rM   squaredotr   )rA   rd   Wtl2doc
doc_topicss         r   l2_normzNmf.l2_norm  st    WY"13	22 	@ 	@OC"&C*..*<*<$<>>???BBwr{{r   c                     | j         }| j        }t          j        }t	          t
          j        j        j                  rj	        d         }n;	 t                    }n*# t          $ r t                              d           Y nw xY w|t          | j                  }t          ||pd|z            }|dk    rt                              d           dS t	          t"          j        j                  r j         dk    rt)          d          t                              d j        ||dn||           d}t-          |          D ]}t	          t
          j        j        j                  r/ fd	t-          dj	        d          j                  D             }	nt/          j         j                  }	t3          |	          D ]\  }
}t	          t
          j        j        j                  r=|dd j                            |j	        d                   f         }|j	        d         }nD j                            |           t;          j        | j        
          }t          |          }t          j         |          r#t                              d||
|z  |z              n#t                              d||
|z  |z   |            j!         "                    |            #                    | j!         j$         j%                   _$         j$        }|r[|
dz   |z  |k    s|
dz   |z  dk    rCt                              d &                    |                      '                    d            xj(        |dz
  z  c_(         xj(        |)                    |j*                  z  c_(         xj(        |z  c_(         xj+        |dz
  z  c_+         xj+        |)                    |j*                  z  c_+         xj+        |z  c_+         ,                                 |dz  }t                              d j-                   dS )a  Train the model with new documents.

        Parameters
        ----------
        corpus : iterable of list of (int, float) or `csc_matrix` with the shape (n_tokens, n_documents)
            Training corpus.
            Can be either iterable of documents, which are lists of `(word_id, word_count)`,
            or a sparse csc matrix of BOWs for each document.
            If not specified, the model is left uninitialized (presumably, to be trained later with `self.update()`).
        chunksize: int, optional
            Number of documents to be used in each training chunk.
        passes: int, optional
            Number of full passes over the training corpus.
            Leave at default `passes=1` if your input is an iterator.
        eval_every: int, optional
            Number of batches after which l2 norm of (v - Wh) is computed. Decreases performance if set too low.

        """
        # use parameters given in the constructor, unless the caller overrides them
        if passes is None:
            passes = self.passes
        if eval_every is None:
            eval_every = self.eval_every

        lencorpus = np.inf
        if isinstance(corpus, scipy.sparse.csc_matrix):
            lencorpus = corpus.shape[1]
        else:
            try:
                lencorpus = len(corpus)
            except TypeError:
                logger.info("input corpus stream has no len()")

        if chunksize is None:
            chunksize = min(lencorpus, self.chunksize)

        evalafter = min(lencorpus, (eval_every or 0) * chunksize)

        if lencorpus == 0:
            logger.warning("Nmf.update() called with an empty corpus")
            return

        if isinstance(corpus, collections.abc.Iterator) and self.passes > 1:
            raise ValueError("Corpus is an iterator, only `passes=1` is valid.")

        logger.info(
            "running NMF training, %s topics, %i passes over the supplied corpus of %s documents, "
            "evaluating L2 norm every %i documents",
            self.num_topics, passes,
            "unknown number of" if np.isinf(lencorpus) else lencorpus, evalafter,
        )

        chunk_overall_idx = 1

        for pass_ in range(passes):
            if isinstance(corpus, scipy.sparse.csc_matrix):
                grouper = (
                    # older scipy versions choke on slices past the array bounds, so clip manually
                    corpus[:, col_idx:min(corpus.shape[1], col_idx + self.chunksize)]
                    for col_idx in range(0, corpus.shape[1], self.chunksize)
                )
            else:
                grouper = utils.grouper(corpus, self.chunksize)

            for chunk_idx, chunk in enumerate(grouper):
                if isinstance(corpus, scipy.sparse.csc_matrix):
                    v = chunk[:, self.random_state.permutation(chunk.shape[1])]

                    chunk_len = v.shape[1]
                else:
                    self.random_state.shuffle(chunk)

                    v = matutils.corpus2csc(chunk, num_terms=self.num_tokens)

                    chunk_len = len(chunk)

                if np.isinf(lencorpus):
                    logger.info("PROGRESS: pass %i, at document #%i", pass_, chunk_idx * chunksize + chunk_len)
                else:
                    logger.info(
                        "PROGRESS: pass %i, at document #%i/%i",
                        pass_, chunk_idx * chunksize + chunk_len, lencorpus,
                    )

                if self._W is None:
                    # this is the very first batch: initialize W, A and B from it
                    self._setup(v)

                self._h = self._solveproj(v, self._W, h=self._h, v_max=self.v_max)
                h = self._h

                if eval_every and ((chunk_idx + 1) * chunksize % evalafter == 0
                                   or (chunk_idx + 1) * chunksize >= lencorpus):
                    logger.info("L2 norm: %s", self.l2_norm(v))
                    self.print_topics(5)

                # running average of the accumulator matrices over all chunks seen so far
                self.A *= chunk_overall_idx - 1
                self.A += h.dot(h.T)
                self.A /= chunk_overall_idx

                self.B *= chunk_overall_idx - 1
                self.B += v.dot(h.T)
                self.B /= chunk_overall_idx

                self._solve_w()

                chunk_overall_idx += 1

                logger.info("W error: %s", self._w_error)

    def _solve_w(self):
        """Update W."""

        def error(WA):
            """An optimized version of 0.5 * trace(WtWA) - trace(WtB)."""
            return 0.5 * np.einsum('ij,ij', WA, self._W) - np.einsum('ij,ij', self._W, self.B)

        eta = self._kappa / np.linalg.norm(self.A)

        for iter_number in range(self._w_max_iter):
            logger.debug("w_error: %s", self._w_error)

            WA = self._W.dot(self.A)

            self._W -= eta * (WA - self.B)
            self._transform()

            error_ = error(WA)

            if (self._w_error < np.inf
                    and np.abs((error_ - self._w_error) / self._w_error) < self._w_stop_condition):
                self._w_error = error_
                break

            self._w_error = error_

    def _apply(self, corpus, chunksize=None, **kwargs):
        """Apply the transformation to a whole corpus and get the result as another corpus.

        Parameters
        ----------
        corpus : iterable of list of (int, float) or `csc_matrix` with the shape (n_tokens, n_documents)
            Training corpus.
            Can be either iterable of documents, which are lists of `(word_id, word_count)`,
            or a sparse csc matrix of BOWs for each document.
            If not specified, the model is left uninitialized (presumably, to be trained later with `self.update()`).
        chunksize : int, optional
            If provided, a more effective processing will be performed.

        Returns
        -------
        :class:`~gensim.interfaces.TransformedCorpus`
            Transformed corpus.

        """
        return TransformedCorpus(self, corpus, chunksize, **kwargs)

    def _transform(self):
        """Apply boundaries on W."""
        np.clip(self._W, 0, self.v_max, out=self._W)
        sumsq = np.sqrt(np.einsum('ij,ij->j', self._W, self._W))
        np.maximum(sumsq, 1, out=sumsq)
        self._W /= sumsq

    @staticmethod
    def _dense_dot_csc(dense, csc):
        if OLD_SCIPY:
            return (csc.T.dot(dense.T)).T
        else:
            return scipy.sparse.csc_matrix.dot(dense, csc)

    def _solveproj(self, v, W, h=None, v_max=None):
        """Update residuals and representation (h) matrices.

        Parameters
        ----------
        v : scipy.sparse.csc_matrix
            Subset of training corpus.
        W : ndarray
            Dictionary matrix.
        h : ndarray
            Representation matrix.
        v_max : float
            Maximum possible value in matrices.

        Returns
        -------
        numpy.ndarray
            The updated representation matrix h.

        """
        m, n = W.shape
        if v_max is not None:
            self.v_max = v_max
        elif self.v_max is None:
            self.v_max = v.max()

        batch_size = v.shape[1]
        hshape = (n, batch_size)

        if h is None or h.shape != hshape:
            h = np.zeros(hshape)

        Wt = W.T
        WtW = Wt.dot(W)

        h_error = None

        for iter_number in range(self._h_max_iter):
            logger.debug("h_error: %s", h_error)

            Wtv = self._dense_dot_csc(Wt, v)

            permutation = self.random_state.permutation(self.num_topics).astype(np.int32)

            error_ = solve_h(h, Wtv, WtW, permutation, self._kappa)
            error_ /= m

            if h_error and np.abs(h_error - error_) < self._h_stop_condition:
                break

            h_error = error_

        return h