
    c                         d Z ddlZddlmZ ddlZddlZddlZddlZddl	m
Z
 ddlmZ ddlmZ ddlmZ ddlmZ ddlmZ  G d	 d
          Z G d de          Z G d de          Z G d de          ZdS )z8
Created on Thu Oct 27 11:04:27 2022

@author: 20200016
    N)Counter)svds)
dok_matrix)
Clustering)CoherenceModel)Word2Vecc                   n   e Zd ZdZ	 	 	 	 	 	 	 	 	 d5dZd Zed	             Zed
             Zd Z	ed             Z
ed             Zed             Zed             Z	 	 	 	 	 d6dZd Zd Zed             Zd Zed             Zed             Zed             Ze	 d7d            Zed             Zed             Zed             Zed             Z	 	 	 	 d8dZed             Zed             Z	 	 	 	 d9d"Z	 	 	 	 d:d%Z	 	 	 d;d'Z 	 d<d(Z!	 	 	 d;d)Z"d* Z#d+ Z$d, Z%d- Z&d. Z'd/ Z(d0 Z)d1 Z*d2 Z+d3 Z,d4 Z-dS )=	FlsaModela  
    Class to initialize and train fuzzy topic models with methods similar
    to Gensim's LdaModel'

    Parameters
    ----------
            corpus : The input corpus.
                either: list of list of str.
                or: list of list of tuples (int, int) (bow).

            num_topics: int
                The number of topics to be trained.

            algorithm: str ['flsa', 'flsa-w', 'flsa-e']
                The algorithm to train.

            id2word: gensim.corpora.dictionary.Dictionary
                Object to map id's to words
                (only used when the corpus is passed into the object as a bow).

            word_weighting: str ['normal', 'idf', 'probidf', 'entropy']
                Global term weighting mechanism.

            cluster_method: str ['fcm', 'gk', 'fst-pso']
                Fuzzy clustering method.

            svd_factors: int
                The number of singular values to use.
       normalfcm   Nc                 |   |                      ||          | _        || _        || _        || _        || _        || _        || _        || _        |	| _        |
| _	        || _
        || _        |                                  |                     | j                  \  | _        | _        |                     | j                  \  | _        | _        |                     | j                  | _        d | _        d | _        d | _        d | _        d | _        d | _        d | _        |                                 \  | _        | _        d S N)_set_corpuscorpus
num_topics	algorithm	num_wordsword_weightingcluster_methodsvd_factors	min_countwindowvector_sizeworkers_check_variables_create_vocabulary_vocabulary_vocabulary_size_create_index_dicts_word_to_index_index_to_word_create_sum_words
_sum_words_prob_word_i_prob_document_j_prob_topic_k_prob_word_given_topic_prob_word_given_documentcoherence_scorediversity_score_get_matricespwgtptgd)selfr   r   r   r   r   r   r   id2wordr   r   r   r   s                7lib/python3.11/site-packages/gensim/models/flsamodel.py__init__zFlsaModel.__init__4   s,    &&vw77$"$",,&"&262I2I$+2V2V/$/373K3KDL\3]3]0T000==  $!&*#)-&###1133	4999    c                     |                      |          rDt          |t          j        j                  st          d          |                     ||          S |S )a  
        Method that sets the corpus to FuzzyTM's required input format.
        If a list of list of str is passed into the method for corpus, then
        it returns the same corpus. If a bow (list of list of tuples) is passed
        into the class, it transforms this into a list of list of str.

        Parameters
        ----------
            corpus : either: list of list of str (tokens). or: list of list of tuples (int, int).
                The input corpus.
            id2word: gensim.corpora.dictionary.Dictionary
                Object to map id's to words
                (only used when the corpus is passed into the object as a bow)

        Returns
        -------
            list of list of str
                The corpus in FuzzyTM's required input format.
        z4Please pass 'id2word' when using a bow for 'corpus'.)
_check_bow
isinstancecorpora
dictionary
Dictionary
ValueError_convert_bow)r0   r   r1   s      r2   r   zFlsaModel._set_corpus\   s[    0 ??6"" 	6gw'9'DEE Y !WXXX$$VW555r4   c                    t          | t                    sdS | D ]r}t          |t                    s dS |D ]U}t          |t                    s  dS t          |d         t                    rt          |d         t                    s  dS VsdS )aI  
        Method to check if the input format has the bow format.

        Parameters
        ----------
            corpus : either: list of list of str (tokens). or: list of list of tuples (int, int).
                The input corpus.

        Returns
        -------
            bool
                True if bow format
        Fr      T)r7   listtupleint)r   doctups      r2   r6   zFlsaModel._check_bowz   s    " &$'' 	5 	! 	!Cc4(( uu ! !!#u-- ! 555!#a&#.. !jQ6M6M ! 555!!
 tr4   c                    d |j                                         D             }g }| D ]]}g }|D ]A}t          j        d|d                   D ]#}|                    ||d                             $B|                    |           ^|S )a  
        Method to convert the bow format into a list of list of str.

        Parameters
        ----------
            corpus : The input corpus.
                either: list of list of str (tokens).
                or: list of list of tuples (int, int).

            id2word: gensim.corpora.dictionary.Dictionary
                Object to map id's to words

        Returns
        -------
            list of list of str
                The corpus in FuzzyTM's required input format.
        c                     i | ]\  }}||	S  rF   ).0kvs      r2   
<dictcomp>z*FlsaModel._convert_bow.<locals>.<dictcomp>   s    >>>TQAq>>>r4   Nr>   r   )token2iditems	itertoolsrepeatappend)r   r1   id2token	data_listrB   doc_listrC   _s           r2   r<   zFlsaModel._convert_bow   s    , ?>W%5%;%;%=%=>>>	 	' 	'CH 6 6")$A77 6 6AOOHSV$455556X&&&&r4   c           	         t          | j                  D ]\  }}t          |t                    st	          dt          |          d          t          |          dk    st          dt          |          d          t          |          D ]0\  }}t          |t
                    st	          d| d| d          1t          | j        t                    r| j        d	k     rt          d
          t          | j
        t                    r| j
        d	k     rt          d          | j        dv r/| j        dvr&g d}t          d                    |                    | j        dvrt          d          t          | j        t                    s| j        dk    rt          d          | j        dvrt          d          dS )z
        Check whether the input data has the right format.

        Correct format: list of list of str (tokens)
        The function raises an error if the format is incorrect.
        zcorpus variable at index z is not a listr   z&The corpus has an empty list at index z* and should contain at least one str valuezWord z of document z is not a strr>   z(Please use a positive int for num_topicsz'Please use a positive int for num_wordsflsaflsa-w)entropyidfr   probidf)Invalid word weighting methodzPlease choose between:z''entropy', 'idf', 'normal' and'probidf' )r   zfst-psogkz@Invalid 'cluster_method. Please choose: 'fcm', 'fst-pso' or 'gk'z)Please use a positive int for svd_factors)rV   rW   flsa-ez"Please select a correct "algoritm"N)	enumerater   r7   r?   	TypeErrorstrlenr;   r   rA   r   r   r   joinr   r   )r0   irB   jwordwarnings         r2   r   zFlsaModel._check_variables   sD     ,, 	N 	NFAsc4(( 2 ; #A 02 2 2 s88a< B <FF@B B B %S>> N N4!$,, N#$LA$L$LA$L$L$LMMMNN $/3// 	I4?Q3F 	IGHHH$.#.. 	H$.12D 	HFGGG>   	0 ) 2 	0  G SXXg../// '  	T
 RT T T$*C00 	JT5E5I 	JHIII> "  	C
 ABBB	C 	Cr4   c                 V    t          d | D                       }|t          |          fS )a(  
        Create the vocabulary from 'corpus'.

        Parameters
        ----------
             corpus : list of lists of str
                 The input file used to initialize the model.

        Returns
        -------
            set of str
                All the vocabulary words.
        c              3   $   K   | ]}|D ]}|V  d S r   rF   )rG   lisels      r2   	<genexpr>z/FlsaModel._create_vocabulary.<locals>.<genexpr>   s/      <<<<"<<<<<<<r4   )setrb   )r   
vocabularys     r2   r   zFlsaModel._create_vocabulary   s1     <<v<<<<<
3z??**r4   c                     t          | t                    st          d          t                      }t                      }t	          |           D ]\  }}|||<   |||<   ||fS )a  
        Create the dictionaries with mappings between words and indices.

        Parameters
        ----------
            vocabulary : set of str
                All the words in the corpus.

        Returns
        -------
            dict of {str : int}
                Dictionary that maps a vocabulary word to and index number.
            dict of {int : str}
                Dictionary that maps an index number to each vocabulary word.
        z)Please use a 'set' type for 'vocabulary'.)r7   rm   r;   dictr_   )rn   word_to_indexindex_to_wordrd   rf   s        r2   r!   zFlsaModel._create_index_dicts   su    " *c** 	JHIII ,, 	$ 	$GAt"#M$#M!m++r4   c                 p    t                      }| D ]$}|                    t          |                     %|S )av  
        Creates a Counter object that stores the count of each word in the corpus (corpus).

        Parameters
        ----------
            corpus : list of lists of str
                The input file used to initialize the model.

        Returns
        -------
            collections.Counter {str : int}
                The count of each word in the corpus.
        )r   update)r   	sum_wordsdocuments      r2   r$   zFlsaModel._create_sum_words  sA     II	 	0 	0HWX..////r4   c                     t          t          |           |ft          j                  }t	          |           D ]>\  }}t          |          }|                                D ]}||         ||||         f<   ?|S )a,  
        Creates a sparse matrix showing the frequency of each words in documents.

        (See: https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.dok_matrix.html)
        Axes:
            rows: documents (size: number of documents in corpus)
            columns: words (size: vocabulary length)

        Parameters
        ----------
            corpus : list of lists of str
                The input file used to initialize the model.
            vocabulary_size : int
                Number of unique words in the corpus.
            word_to_index: dict {str : int}
                Maps each unique vocabulary word to a unique index number.

        Returns
        -------
            scipy.sparse.dok_matrix
                sparse matrix representation of the local term weights.
        dtype)r   rb   npfloat32r_   r   keys)r   vocabulary_sizerq   sparse_local_term_weightsdocument_indexrv   document_counterrf   s           r2   !_create_sparse_local_term_weightsz+FlsaModel._create_sparse_local_term_weights-  s    8 %/[[*% % %!
 )2&(9(9 	/ 	/$NH&x00(--// / / ). *"M$$78 / )(r4   c                 J   t          |          }|dv r|t          d          |dv r"|t          d          |t          d          |dv r|t          d          |d	v r|t          d
          |dk    r|                     |||||          }	ni|dk    r|                     ||||          }	nJ|dk    r|                     |          }	n.|dk    r|                     ||||          }	nt          d          |                    |	                                          S )a  
        Apply a word_weighting method on the sparse_local_term_weights
        to create sparse_global_term_weights.
        (See: https://link.springer.com/article/10.1007/s40815-017-0327-9)

        Parameters
        ----------
            corpus : list of lists of str
                The input file used to initialize the model.
            word_weighting : str
                Indicates the method used for word_weighting. Choose from:
                  - entropy
                  - normal
                  - idf
                  - probidf
             vocabulary_size : int
                Number of unique words in the corpus.
             sparse_local_term_weights : scipy.sparse.dok_matrix
                 A sparse matrix showing the frequency of each words in documents.
             word_to_index : dict {str : int}
                Maps each unique vocabulary word to a unique index number.
             index_to_word : dict {int : str}
                 Maps each unique index number to a unique vocabulary word.
             sum_words : collections.Counter {str : int}
                 The count of each word in the corpus.

        Returns
        -------
            scipy.sparse.dok_matrix
                sparse matrix representation of the global term weights.
        )rX   r   Nz5Please feed the algorithm 'sparse_local_term_weights')rX   z)Please feed the algorithm 'index_to_word'z%Please feed the algorithm 'sum_words')rX   rY   rZ   z+Please feed the algorithm 'vocabulary_size')rY   rZ   z)Please feed the algorithm 'word_to_index'rX   rY   r   rZ   r[   )rb   r;   _calculate_entropy_calculate_idf_calculate_normal_calculate_probidfmultiplytocsc)
r0   r   r   r}   r~   rr   rq   ru   num_documentsglobal_term_weightss
             r2   "_create_sparse_global_term_weightsz,FlsaModel._create_sparse_global_term_weightsV  s   R F22 	Z( Z !XYYY[( 	J N !LMMM J !HIII:: 	P P !NOOO// 	N N !LMMMY& 	>"&"9"9)y	# # u$ 	>"&"5"5	# # x' 
	>"&"8"89R"S"Sy( 	>"&"9"9	# # <===(112EFFLLNNNr4   c                     |                      ||||          }|                    d                                          d         }t          j        fd|D                       S )a  
        Use the entropy word weighting method.

        (See: https://link.springer.com/article/10.1007/s40815-017-0327-9)

        Parameters
        ----------
            num_documents : int
                The number of documents in the corpus.
            vocabulary_size : int
                Number of unique words in the corpus.
            sparse_local_term_weights : scipy.sparse.dok_matrix
                 A sparse matrix showing the frequency of each words in documents.
            index_to_word : dict {int : str}
                 Maps each unique index number to a unique vocabulary word.
            sum_words : collections.Counter {str : int}
                 The count of each word in the corpus.

        Returns
        -------
            numpy.array : float
        r   c                 D    g | ]}d |t          j                  z  z   S r>   rz   log2)rG   summed_p_log_p_ir   s     r2   
<listcomp>z0FlsaModel._calculate_entropy.<locals>.<listcomp>  s/    nnnK[-0F0FFFnnnr4   )_create_p_log_p_ijsumtolistrz   array)r0   r   r}   r~   rr   ru   
p_log_p_ijsummed_p_log_ps    `      r2   r   zFlsaModel._calculate_entropy  ss    < ,,% 
 $**1133A6xnnnn_mnnnooor4   c                     |                      |||          }|                    d                                          d         }t          j        fd|D                       S )av  
        Use the idf word weightingg method.

        (See: https://link.springer.com/article/10.1007/s40815-017-0327-9)

        Parameters
        ----------
            num_documents : int
                The number of documents in the corpus.
            vocabulary_size : int
                Number of unique words in the corpus.
            corpus : list of lists of str
                The input file used to initialize the model.
            word_to_index: dict {str : int}
                Maps each unique vocabulary word to a unique index number.

        Returns
        -------
            numpy.array : float
        r   c                 >    g | ]}t          j        |z            S rF   r   )rG   
word_countr   s     r2   r   z,FlsaModel._calculate_idf.<locals>.<listcomp>  s(    \\\!;<<\\\r4   _create_sparse_binary_dtmr   r   rz   r   )r0   r   r}   r   rq   binary_sparse_dtmsummed_wordss    `     r2   r   zFlsaModel._calculate_idf  sr    6 !::	  ),,Q//6688;x\\\\|\\\]]]r4   c                     |                      |           }|                    d                                          d         }t          j        d |D                       S )a  
        Use the normal word weightingg method.

        (See: https://link.springer.com/article/10.1007/s40815-017-0327-9)

        Parameters
        ----------
             sparse_local_term_weights : scipy.sparse.dok_matrix
                 A sparse matrix showing the frequency of each words in documents.

        Returns
        -------
            numpy.array : float
        r   c                 <    g | ]}d t          j        |          z  S r   )mathsqrt)rG   r   s     r2   r   z/FlsaModel._calculate_normal.<locals>.<listcomp>	  s'    TTTdi
334TTTr4   )r   r   r   rz   r   )r~   squared_dtmr   s      r2   r   zFlsaModel._calculate_normal  sY    $ 0889RSS"q))002215xTT|TTTUUUr4   c                     |                      |||          }|                    d                                          d         }t          j        fd|D                       S )az  
        Use the probidf word weightingg method.

        (See: https://link.springer.com/article/10.1007/s40815-017-0327-9)

        Parameters
        ----------
            num_documents : int
                The number of documents in the corpus.
            vocabulary_size : int
                Number of unique words in the corpus.
            corpus : list of lists of str
                The input file used to initialize the model.
            word_to_index: dict {str : int}
                Maps each unique vocabulary word to a unique index number.

        Returns
        -------
            numpy.array : float
        r   c                 D    g | ]}t          j        |z
  |z            S rF   r   )rG   binary_word_countr   s     r2   r   z0FlsaModel._calculate_probidf.<locals>.<listcomp>.  sE     L L L. -2C"CGX!XYY L L Lr4   r   )r0   r   r}   r   rq   r   summed_binary_words_lists    `     r2   r   zFlsaModel._calculate_probidf  s    6 !::	  $5#8#8#;#;#B#B#D#DQ#G x L L L L2JL L L M M 	Mr4   c                    t          | |ft          j                  }t          |           D ]}|                    |                                          d         }|                                d         }||dk             }	t          |	          D ]@\  }
}|||
                  }|||         z  }|t          j        |          z  ||||
         f<   A|S )aA  
        Create probability of word i in document j, multiplied by its base-2 logarithm.

        (See: https://link.springer.com/article/10.1007/s40815-017-0327-9)

        Parameters
        ----------
            num_documents : int
                The number of documents in the corpus.
            vocabulary_size : int
                Number of unique words in the corpus.
            sparse_local_term_weights : scipy.sparse.dok_matrix
                A sparse matrix showing the frequency of each words in documents.
            index_to_word : dict {int : str}
                Maps each unique index number to a unique vocabulary word.
            sum_words : collections.Counter {str : int}
                The count of each word in the corpus.

        Returns
        -------
             scipy.sparse.dok_matrix
        rx   r   )	r   rz   r{   rangegetrowtoarraynonzeror_   r   )r   r}   r~   rr   ru   r   re   
row_counts
word_indexnon_zero_row_countsrd   countrf   prob_ijs                 r2   r   zFlsaModel._create_p_log_p_ij1  s    <  O,BJ  
 }%% 	J 	JA299!<<DDFFqIJ#++--a0J",Z1_"=%&9:: J J5$Z]3)D/1/69I9I/I
1jm+,,J r4   c                     t          | |ft          j                  }t          |          D ]E\  }}t                              |d          }t          |          D ]}||         ||||         f<   F|S )a  
        Create a binary sparse document-term-matrix (used for idf and probidf).

        (See: https://link.springer.com/article/10.1007/s40815-017-0327-9)

        Parameters
        ----------
            num_documents : int
                The number of documents in the corpus.
            vocabulary_size : int
                Number of unique words in the corpus.
            corpus : list of lists of str
                The input file used to initialize the model.
            word_to_index: dict {str : int}
                Maps each unique vocabulary word to a unique index number.

       Returns
        -------
             scipy.sparse.dok_matrix
        rx   r>   )r   rz   r{   r_   rp   fromkeysrm   )	r   r}   r   rq   r   	doc_indexrv   binary_document_counterrf   s	            r2   r   z#FlsaModel._create_sparse_binary_dtm\  s    6 'O,BJ   $-V#4#4 	W 	WIx&*mmHa&@&@#H W W9PQU9V ")"/"5#6 7 7W ! r4   c                 l    t          ||          \  }}}| dv r|S | dv r|j        S t          dd          )a  
        Perform singular decomposition for dimensionality reduction.

        (See: https://web.mit.edu/be.400/www/SVD/Singular_Value_Decomposition.htm)
        For SVD on a sparse matrix, the sparsesvd package is used
        (https://pypi.org/project/sparsesvd/)

        Parameters
        ----------
             algorithm : str
                 Indicator for which algorithm is being trained ('flsa' or 'flsa-w').
             sparse_weighted_matrix : scipy.sparse.dok_matrix
                 Sparse global term matrix.
             svd_factors : int
                 The number of singular values to include.

        Returns
        -------
            numpy.array : float
        rV   rW   zInvalid algorithm selected.z1Only "flsa" ans "flsa-w" are currently supported.)r   Tr;   )r   sparse_weighted_matrixr   svd_urS   svd_vs         r2   _create_projected_dataz FlsaModel._create_projected_data  sb    4 " q%   	L
" 	7N6LN N 	Nr4   c                 f    t          j        ||           }|                    |          \  }}}|S )a7  
        Perform clustering on the projected data.

        The pyFUME package is used for clustering:
            (https://pyfume.readthedocs.io/en/latest/Clustering.html)

        Parameters
        ----------
             data: numpy.array
                 The output from self._create_projected_data().
             number_of_clusters : int
                 The number of clusters (topics).
             method : str
                 The cluster method, choose from: 'fcm', 'gk', 'fst-pso'.
        Returns
        -------
            numpy.array : float
        )nr_clusdata)method)r   	Clusterercluster)r   number_of_clustersr   	clustererrS   partition_matrixs         r2   _create_partition_matrixz"FlsaModel._create_partition_matrix  sG    0 (&  	 "+!2!2&!2!A!AQr4   c                    t          j        d |                     d                                          D                       }t          |                     d                                          d                   }||z  S )aK  
        Get the probability of document j.

        Parameters
        ----------
            sparse_matrix : scipy.sparse.dok_matrix
                A sparse matrix representation of the global term weights.
        Returns
        -------
            numpy.array : float
                (shape: number of documents x 1)
        c                     g | ]
}|d          S )r   rF   )rG   rB   s     r2   r   z5FlsaModel._create_prob_document_j.<locals>.<listcomp>  s     Q Q QCQ Q Q Qr4   r>   r   rz   r   r   r   )sparse_matrixdocument_sumtotal_sum_ds      r2   _create_prob_document_jz!FlsaModel._create_prob_document_j  ss      x Q Q=3D3DQ3G3G3N3N3P3P Q Q QRR-++A..5577:;;k))r4   c                     t          j        |                     d                                                    }t          |                     d                                          d                   }||z  d         S )aD  
        Get the probability of word i.

        Parameters
        ----------
            sparse_matrix : scipy.sparse.dok_matrix
                A sparse matrix representation of the global term weights.

        Returns
        -------
            numpy.array : float
                (shape: vocabulary_size x 1)
        r   r   )r   word_sumtotal_sum_ws      r2   _create_prob_word_izFlsaModel._create_prob_word_i  si     8M--a007799::-++A..5577:;;;&**r4   c                 6    t          j        | j        |          S )a  
        Get the probability of topic k.

        Parameters
        ----------
            prob_topic_given_word_transpose : numpy.array : float
                The output from self._create_partition_matrix().
            prob_word_i : numpy.array : float
                The output from self._create_prob_word_i().

        Returns
        -------
            numpy.array : float
                (shape: 1 x number of topics)
        )rz   matmulr   )prob_topic_given_word_transposeprob_word_is     r2   _create_prob_topic_kzFlsaModel._create_prob_topic_k  s    ( y8:KHHHr4   c                    | dv r%|t          dd          |t          d          dS | dv r&|t          dd          |t          dd	          dS | d
v r%|t          dd          |t          d          dS t          d          )zR
        Check whether the algorithms are being fed the right attributes.
        r   NzPlease feed the methodz1'prob_topic_given_document_transpose' to run flsaz9Please feed the method 'global_term_weights', to run flsar   z/'prob_topic_given_word_transpose' to run flsa-wz,Please feed the method 'global_term_weights'z to run flsa-wr^   z.'prob_topic_given_word_transpose' to run modelz9Please feed the method 'local_term_weights', to run modelz)Your algorithm is currently not supported)r;   )r   #prob_topic_given_document_transposer   local_term_weightsr   s        r2   _check_passed_variablesz!FlsaModel._check_passed_variables  s&      	J2 V !9!TV V V" ^ !\]]]^ ^*$ 	J. T !9!RT T T" 3 !O!13 3 33 3    
	J / S !9!QS S S! ^ !\]]]^ ^ HIIIr4   c                    |                      |||||           |dv rY|                     |          | _        |                     |          | _        |dv r |                     || j                  | _        nX|dv rT|                     |          | _        |                     |          | _        |                     || j                  | _        |dv r|j        | j        z  j        }||                    d          z  }t          j
        ||                    d          z            | _        t          j        | j        j        |          | _        |j        }| j        |fS |dv r|j        | j        z  j        }	|	|	                    d          z  | _        |dv r5t          j
        ||                    d          z            j        | _        n8|dv r4t          j
        ||                    d          z            j        | _        | j        | j        z  j        t          j        | j                  z  }
t          j        |
| j                  }|| j        z  j        | j        z  }| j        |fS t          d	          )
a  
        Method that performs matrix multiplications to obtain the output matrices.

        The 'algorithm' parameter is generic and the other ones depend on the selected algorithm.
        The other parameters passed into this method depend on the used algorithm.

        Parameters
        ----------
            algorithm : str
                 Indicator for which algorithm is being trained ('flsa' or 'flsa-w').
            global_term_weights : scipy.sparse.dok_matrix
                The output from self._create_partition_matrix().
            prob_topic_given_document_transpose : numpy.array : float
                The output from self._create_partition_matrix() (flsa)
            prob_topic_given_word_transpose : numpy.array : float
                 (flsa-w)

        Returns
        -------
            numpy.array : float
                The prbability of a word given a topic.
            numpy.array : float
                The prbability of a topic given a document.
        rU   r   r   r   r   )axisr>   )rW   r^   z"algorithm" is unknown.)r   r   r&   r   r'   r   r(   r   r   rz   asarrayr*   r   r)   r   r;   )r0   r   r   r   r   r   prob_document_and_topicprob_document_given_topicprob_topic_given_documentprob_word_and_topicprob_document_given_words              r2   _create_probability_matricesz&FlsaModel._create_probability_matrices(  s   B 	$$/+	 	 	    	 !% 8 89L M MD$($@$@AT$U$UD!J& %)%>%>3%& &"    	 !% 8 89K L LD$($@$@AS$T$TD!!%!:!:/1B" "D   	J'J'LtOd'd&g#(?BYB]B]cdB]BeBe(e%-/Z8KNaNeNefgNhNh8h-i-iD**,).0)+ +D' )L(M%.0III   	J $C#DtGX#X"[*=@S@W@W]^@W@_@_*_D'J& n13<OReRiRijkRlRl<l1m1m1o.. ! ! n 24<NQcQgQghiQjQj<j1k1k1m.)-)G$J_)_(b*,(43D*E*E)F$(*	(+) )% +DdFX*X)[151F*G%.0III2333r4   c                 \   t          | t                    s| dk    rt          d          g }t          |j        d                   D ]d}t                      }|dd|f                                         |  d         }|D ]}|||f         |||         <   |                    |           e|S )a(  
        Creates a list with dictionaries of word probabilities
        per topic based on the top-n words.

        Parameters
        ----------
             topn : int
                The top-n words to include
                (needs only to be used when 'method=topn').
             prob_word_given_topic : numpy.array : float
                Matrix that gives the probability of a word given a topic.
             index_to_word : dict {int : str}
                Maps each unique index number to a unique vocabulary word.

        Returns
        -------
             list of dicts {int : float}
                Keys: all the indices of words from prob_word_given_topic
                who's weight's are amongst the top percentage.
                Values: the probability associated to a word.
        r   z+Please choose a positive integer for 'topn'r>   N)r7   rA   r;   r   shaperp   argsortrO   )topnprob_word_given_topicrr   top_dictionariestopic_indexnew_dicthighest_weight_indicesr   s           r2   _create_dictlist_topnzFlsaModel._create_dictlist_topn  s    6 $$$ 	L 	LJKKK !6!<Q!?@@ 	. 	.KvvH%:111k>%J%R%R%T%TVZUZU[U[%\"4  
6K,7z233 ##H----r4   c                    t          | t                    sd| cxk    rdk    rn nt          d          g }t          |j        d                   D ]}t                      }d}d}t          j        |dd|f                   ddd         }t          j        |dd|f                   ddd         }	|| k     r-||         |||	|                  <   |||         z  }|dz  }|| k     -|	                    |           |S )a  
        Create a list with dictionaries of word probabilities
        per topic based on the percentile.
         - Keys: all the indices of words from prob_word_given_topic
             who's weight's are amongst the top percentage.
         - Values: the probability associated to a word.

        Parameters
        ----------
             perc : float
                The top percentile words to include
                (needs only to be used when 'method=percentile').
             prob_word_given_topic : numpy.array : float
                Matrix that gives the probability of a word given a topic.
             index_to_word : dict {int : str}
                Maps each unique index number to a unique vocabulary word.

        Returns
        -------
             list of dicts {int : float}
                Keys: all the indices of words from prob_word_given_topic
                    who's weight's are amongst the top percentage.
                Values: the probability associated to a word.
        r   r>   z1Please choose a number between 0 and 1 for 'perc'N)
r7   floatr;   r   r   rp   rz   sortr   rO   )
percr   rr   top_listtopr   r   rd   weightsword_indicess
             r2   _create_dictlist_percentilez%FlsaModel._create_dictlist_percentile  sN   < $&& 	R1 	R 	R 	R 	R 	R 	R 	R 	R 	RPQQQ.4Q788 
	& 
	&CvvHEAg3AAAsF;<<TTrTBG:&;AAAsF&CDDTTrTJL$, ;B1:|A78#Q $,  OOH%%%%r4   Tr   c           
         || j         }|dk     r| j        }|| j        }t          |t          j                  st          d          t          |t                    st          d          t          |t                    r|dk    rt          d          |j	        d         |j	        d         k     rt          dd          g d	}|j	        d         t          |                                          k    r't          j        d
                    |                     t          |t                     st          d          g }|st#          |j	        d                   D ]}d}|dd|f                                         | d         ddd         }	|	D ]9}
|t'          t)          ||
|f         d                    dz   ||
         z   dz   z  }:|                    ||dd         f           |S t#          |j	        d                   D ]i}g }|dd|f                                         | d         ddd         }	|	D ]}
|                    ||
                    |                    |           j|S )a  
        Get a representation for the topics.

        Parameters
        ----------
            formatted : bool
                Whether the topic representations should be formatted as strings.
                If False, they are returned as 2 tuples of (word, probability).
            prob_word_given_topic : numpy.array : float
                Matrix that gives the probability of a word given a topic.
            num_words : int
                 Indicates how many words per topic should be shown.
            index_to_word : dict {int : str}
                Maps each unique index number to a unique vocabulary word.

        Returns
        -------
            list of tuples (int, str)
                The produced topics.
        Nr   zAPlease feed the algorithm 'prob_word_given_topic' as a np.ndarrayz3Please feed the algorithm 'index_to_word' as a dictz*Please use a positive int for 'num_words'.r>   z3'prob_word_given_topic' has more columns then rows,z) probably you need to take the transpose.)z8It seems like 'prob_word_given_topic' and 'index_to_wordz2are not aligned. The number of vocabulary words inz)'prob_word_given_topic' deviate from the z#number of words in 'index_to_word'.r\   z'Please choose a boolean for "formatted" r      z*"z" + )r)   r   r#   r7   rz   ndarrayr`   rp   rA   r   r;   rb   r|   warningswarnrc   boolr   r   ra   roundrO   )r0   	formattedr   r   rr   rg   
topic_listr   weight_wordssorted_highest_weight_indicesr   	word_lists               r2   show_topicszFlsaModel.show_topics  s   6 ! 	@$($?!q= 	'I 	0 /M/<< 	a_```-.. 	SQRRR)S)) 	JY!^ 	JHIII &q),A,G,JJ 	JRHJ J J: : : !&q)S1C1C1E1E-F-FF 	-M#((7++,,,)T** 	HFGGG
 	$%:%@%CDD D D!0Eaaan0U0]0]0_0_aj`j`k`k0lmqmqoqmq0r-"? R RJ S/DZQ\E\/]_`)a)a%b%b'+&,.;J.G&HJP&Q RLL!!;SbS0A"BCCCC$%:%@%CDD - -	0Eaaan0U0]0]0_0_aj`j`k`k0lmqmqoqmq0r-"? @ @J$$]:%>????!!),,,,r4   r   皙?c           
         |                                   || j        }g }|dvrt          |dd          |dk    r|                     ||| j                  }n|                     ||| j                  }|D ]t}dg|j        d         z  }	|D ]I}
t          |j        d                   D ],}|	|xx         ||                             |
d          z  cc<   -J|	                    |	           ut          j        |          S )a   
        Create a topic embedding for each input document,
        to be used as input to predictive models.

        Parameters
        ----------
            corpus : list of lists of str
                The input file used to initialize the model.
            prob_word_given_topic : numpy.array : float
                Matrix that gives the probability of a word given a topic.
            method : str
                Method to select words to be included in the embedding.
                (choose from 'topn', 'percentile'):
                    - topn: for each topic the top n words with the highest
                        probability are included.
                    - percentile: for each topic all words with highest
                        probabilities are assigned while the cumulative
                        probability is lower than the percentile.
            topn : int
                The top-n words to include
                (needs only to be used when 'method=topn').
            perc: float
                The benchmark percentile until which words need to be added
                (between 0 and 1).

        Returns
        -------
            numpy.array : float
                Array in which each row gives the topic embedding for
                the associated document.
        N)r   
percentilez#is not a valid option for 'method'.z% Choose either 'topn' or 'percentile'r   r   r>   )r   r)   r;   r   r#   r   r   r   getrO   rz   r   )r0   r   r   r   r   r   top_distdictlistrB   topic_weightsrf   rd   s               r2   get_topic_embeddingzFlsaModel.get_topic_embedding#  s`   N 	  	@$($?!// 	FV%JDF F FV 	11+T-@ HH 77+T-@ H  	+ 	+CC"7"=a"@@M A A4:1=>> A AA!!$$$a(@(@@$$$$AOOM****x!!!r4   c_vc                    ||| j         }|                     d          }t          j        |          fd|D             }t	          ||||t          |d                                                             | _        | j        S )a\  
        Calculate the coherence score for the generated topic.

        Parameters
        ----------
             corpus : list of lists of str
                The input file used to initialize the model.
             topics : list of lists of str
                 The words per topics,
                 equivalent to self.show_topics(formatted=True).
             coherence : str
                 The type of coherence to be calculated.
                 Choose from: 'u_mass', 'c_v', 'c_uci', 'c_npmi'.

        Returns
        -------
             float
                 The coherence score.
        NTr  c                 :    g | ]}                     |          S rF   )doc2bow)rG   textr1   s     r2   r   z1FlsaModel.get_coherence_score.<locals>.<listcomp>  s%    ???good++???r4   r   )topicstextsr   r9   	coherencer   )r   r	  r8   r:   r   rb   get_coherencer+   )r0   r   r  r  
corpus_bowr1   s        @r2   get_coherence_scorezFlsaModel.get_coherence_scorea  s    2  	6f 	6[F%%%55F$V,,???????
-VAY      moo 	 ##r4   c                     ||                      d          }t                      }d}|D ])}|                    |           |t          |          z  }*t          |          |z  | _        | j        S )a  ''
        Calculate the diversity score for the generated topic.

        Diversity = number of unique words / number of total words.
        See: https://tinyurl.com/2bs84zd8

        Parameters
        ----------
             topics : list of lists of str
                 The words per topics,
                 equivalent to self.show_topics(formatted=True).

        Returns
        -------
             float
                 The diversity score.
        NTr  r   )r	  rm   rt   rb   r,   )r0   r  unique_wordstotal_wordsr   s        r2   get_diversity_scorezFlsaModel.get_diversity_score  s    *  	6%%%55Fuu 	$ 	$C$$$3s88#KK"<00;>##r4   c                     ||| j         }|                     d          }| j        |                     |||          | _        | j        |                     |          | _        | j        | j        z  S )a  ''
        Calculate the interpretability score for the generated topics.

        Interpretability = coherence * diversity.
        (see: https://tinyurl.com/2bs84zd8)

        Parameters
        ----------
             corpus : list of lists of str
                The input file used to initialize the model.
             topics : list of lists of str
                 The words per topics, equivalent to
                 self.show_topics(formatted=True).
             coherence : str
                 The type of coherence to be calculated.
                 Choose from: 'u_mass', 'c_v', 'c_uci', 'c_npmi'.

        Returns
        -------
             float
                 The interpretability score.
        NTr  )r   r	  r+   r  r,   r!  )r0   r   r  r  s       r2   get_interpretability_scorez$FlsaModel.get_interpretability_score  s    8  	6f 	6[F%%%55F 	#'#;#;$ $D 
  	D#'#;#;F#C#CD #d&:::r4   c                     | j         S )ap  
        Returns a set of all the words in the corpus

        Example:
        After initializing an instance of the flsamodel models as 'model'

        corpus = [['this', 'is', 'the', 'first', 'file'],
             ['and', 'this', 'is', 'second', 'file']]

        model.get_vocabulary()

        >>> {'this', 'is', 'the', 'first', 'file', 'and', 'second'}
        )r   r0   s    r2   get_vocabularyzFlsaModel.get_vocabulary  s     r4   c                     | j         S )z
        Get the term-topic matrix.

        Returns
        -------
        numpy.ndarray
            The probability for each word in each topic,
            shape (num_topics, vocabulary_size).
        )r.   r%  s    r2   
get_topicszFlsaModel.get_topics  s     yr4   c                     | j         S )aD  
        Returns the number of words in the vocabulary

        Example:
            After initializing an instance of the flsamodel models as 'model'

        corpus = [['this', 'is', 'the', 'first', 'file'],
             ['and', 'this', 'is', 'second', 'file']]

        model.get_vocabulary_size()

        >>> 7
        )r    r%  s    r2   get_vocabulary_sizezFlsaModel.get_vocabulary_size  s     $$r4   c                     | j         S )z
        Obtain a dictionary that maps each vocabulary word to an index.

        Returns
        -------
        dict of {str : int}
            word to int mapping.
        )r"   r%  s    r2   get_word_to_indexzFlsaModel.get_word_to_index       ""r4   c                     | j         S )z
        Obtain a dictionary that maps index numbers to vocabulary words.

        Returns
        -------
        dict of {int : str}
            int to word mapping.
        )r#   r%  s    r2   get_index_to_wordzFlsaModel.get_index_to_word	  r-  r4   c                     | j         S )z
        Return the input file.

        Returns
        -------
            list of list of str
                The input file 'corpus'.
        )r   r%  s    r2   
get_corpuszFlsaModel.get_corpus  s     {r4   c                     | j         S )z
        Return the probabilities per word.

        Returns
        -------
            np.array of float
                The probabilities per word.
        )r&   r%  s    r2   get_prob_word_izFlsaModel.get_prob_word_i  s       r4   c                     | j         S )z
        Return the probabilities per document.

        Returns
        -------
            np.array of float
                The probabilities per document.
        )r'   r%  s    r2   get_prob_document_jzFlsaModel.get_prob_document_j*  s     $$r4   c                     | j         S )z
        Return the probabilities per topic.

        Returns
        -------
            np.array of float
                The probabilities per topic.
        )r(   r%  s    r2   get_prob_topic_kzFlsaModel.get_prob_topic_k5  s     !!r4   c                 d   t          |t                    st          d          |                    d          rt	          |d          }n<|                    d          rt	          |dz   d          }nt	          |dz   d          }t          j        | |           |                                 dS )ah  ''
        Saves the object to the drive, using the pickle library.

        Parameters
        ----------
             filepath : str
                The directory in which the file should be stored,
                either with or without the file name.

        Returns
        -------
             float
                 The interpretability score.
        (Make sure that "filepath" has type "str".picklewb/model.pickleN)r7   ra   r;   endswithopenpickledumpclose)r0   filepath
pickle_outs      r2   savezFlsaModel.save@  s    $ (C(( 	IGHHHY'' 	:h--JJs## 	:h7>>JJh2D99JD*%%%r4   c                 <   t          |t                    st          d          |                    d          s |                    d          r|dz  }n|dz  }t	          |d          }t          j        |          j        | _        |                                 dS )aj  ''
        Loads the object from the drive, using the pickle library.

        Parameters
        ----------
             filepath : str
                The directory in which the file should be stored,
                either with or without the file name.

        Returns
        -------
             float
                 The interpretability score.
        r9  r:  r<  r=  z/model.picklerbN)	r7   ra   r;   r>  r?  r@  load__dict__rB  )r0   rC  infiles      r2   rH  zFlsaModel.load]  s    $ (C(( 	IGHHH  ++ 	,  %% ,N*O+h%%F++4r4   )	r   r   r   r   NNNNN)NNNNN)r   )NNNN)TNr   N)Nr   r   r
  )NNr  r   ).__name__
__module____qualname____doc__r3   r   staticmethodr6   r<   r   r   r!   r$   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r	  r  r  r!  r#  r&  r(  r*  r,  r/  r1  r3  r5  r7  rE  rH  rF   r4   r2   r
   r
      s2        D # &4 &4 &4 &4P  <   \8   \>4C 4C 4Cl + + \+" , , \,2   \& &) &) \&)X !&*QO QO QO QOf&p &p &pP"^ "^ "^H V V \V*$M $M $ML ( ( \(T "! "! \"!H "N "N \"NH        \ < * * \*( + + \+& I I \I* !J !J \!JL 15,0# $]4 ]4 ]4 ]4~ %  %  \% N + + \+^ "&D D D DR #'<" <" <" <"@ 	'$ '$ '$ '$V $ $ $ $B 	'; '; '; ';R      
 
 
% % % 	# 	# 	#	# 	# 	#	 	 		! 	! 	!	% 	% 	%	" 	" 	"  :    r4   r
   c                   2     e Zd ZdZ	 	 	 	 d fd	Zd Z xZS )	Flsaa5  
    Class to run the FLSA algorithm (see: https://tinyurl.com/mskjaeuu).

    Parameters
        ----------
            corpus : list of lists of str
                The input file used to initialize the model.
            num_topics : int
                The number of topics that the model should train.
            num_words : int
                 Indicates how many words per topic should be shown.
            word_weighting : str
                Indicates the method used for word_weighting. Choose from:
                  - entropy
                  - normal
                  - idf
                  - probidf
            svd_factors : int
                 The number of singular values to include.
            cluster_method : str
                 The cluster algorithm to be used ('fcm', 'gk', 'fst-pso').
    
   r   r   r   c           	      X    t                                          d||||||           d S )NrV   r   r   r   r   r   r   r   superr3   r0   r   r   r   r   r   r   	__class__s          r2   r3   zFlsa.__init__  sE     	!))# 	 	 	 	 	 	r4   c           	      p   |                      | j        | j        | j                  }|                     | j        | j        | j        || j        | j        | j                  }|                     d|| j	                  }| 
                    || j        | j                  }|                     d||          S )*  
        Method to obtain the matrices after the model has been initialized.

        Returns
        -------
            numpy.array : float
                The prbability of a word given a topic.
            numpy.array : float
                The prbability of a topic given a document.
        r   r   r}   r~   rr   rq   ru   rV   r   r   r   r   r   r   )r   r   r   r   r   r    r"   r   r   r#   r%   r   r   r   r   r   r   r0   sparse_document_term_matrixsparse_global_term_weightingprojected_datar   s        r2   r-   zFlsa._get_matrices  s     '+&L&LK!' '#
 (,'N'N;. 1&A--o (O ( ($ 44#?( 5  
  88#& 9  
 000@ < 1   	r4   rR  r   r   r   rK  rL  rM  rN  r3   r-   __classcell__rX  s   @r2   rQ  rQ  {  sd         4 #      &' ' ' ' ' ' 'r4   rQ  c                   2     e Zd ZdZ	 	 	 	 d fd	Zd Z xZS )	FlsaWaU  
    Class to train the FLSA-W algorithm.

    See: https://ieeexplore.ieee.org/abstract/document/9660139

    Parameters
        ----------
            corpus : list of lists of str
                The input file used to initialize the model.
            num_topics : int
                The number of topics that the model should train.
            num_words : int
                 Indicates how many words per topic should be shown.
            word_weighting : str
                Indicates the method used for word_weighting. Choose from:
                  - entropy
                  - normal
                  - idf
                  - probidf
            svd_factors : int
                 The number of singular values to include.
            cluster_method : str
                 The cluster algorithm to be used ('fcm', 'gk', 'fst-pso').
    rR  r   r   r   c           	      X    t                                          d||||||           d S )NrW   rT  rU  rW  s          r2   r3   zFlsaW.__init__  sE     	!))# 	 	 	 	 	 	r4   c           	      p   |                      | j        | j        | j                  }|                     | j        | j        | j        || j        | j        | j                  }|                     d|| j	                  }| 
                    || j        | j                  }|                     d||          S )rZ  r[  rW   r\  r]  )r   r   r   r^  r_  s        r2   r-   zFlsaW._get_matrices  s     '+&L&LK!' '#
 (,'N'N;. 1&A--o (O ( ($ 44#?( 5  
  88#& 9  
 00,< < 1   	r4   rc  rd  rf  s   @r2   rh  rh    sd         8 #      (' ' ' ' ' ' 'r4   rh  c                   <     e Zd ZdZ	 	 	 	 	 	 d fd	Zd	 Zd
 Z xZS )FlsaEa  
    Class to train the FLSA-E algorithm. See: https://tinyurl.com/5n8utppk

    Parameters
        ----------
            corpus : list of lists of str
                The input file used to initialize the model.
            num_topics : int
                The number of topics that the model should train.
            num_words : int
                Indicates how many words per topic should be shown.
            cluster_method : str
                The cluster algorithm to be used ('fcm', 'gk', 'fst-pso').
            min_count : int
                Ignores all words with total frequency lower than this.
            window : int
                Maximum distance between the current and predicted word within a sentence.
            vector_size : int
                Dimensionality of the word vectors.
            workers : int
                Use these many worker threads to train the model
                ( = faster training with multicore machines).
    rR  r   r>      r   r   c	                 x    d| _         d| _        t                                          d||||||||	  	         d S )N.r^   )	r   r   r   r   r   r   r   r   r   )modelword_embeddingrV  r3   )
r0   r   r   r   r   r   r   r   r   rX  s
            r2   r3   zFlsaE.__init__?  sZ     
!!)# 	 
	 
	 
	 
	 
	r4   c                 V    t          |||||          | _        | j        j        j        S )a  
            Method to train a word embedding on the corpus.

            Parameters
                ----------
                    data : list of lists of str
                        The input file used to initialize the model.
                    min_count : int
                        Ignores all words with total frequency lower than this.
                    window : int
                        Maximum distance between the current and predicted word within a sentence.
                    vector_size : int
                        Dimensionality of the word vectors.
                    workers : int
                        Use these many worker threads to train the model
                        ( = faster training with multicore machines).
        )	sentencesr   r   r   r   )r   ro  wvvectors)r0   r   r   r   r   r   s         r2   get_word_embeddingzFlsaE.get_word_embeddingZ  s8    4 #  
 z}$$r4   c                 <   |                      | j        | j        | j                  }|                     | j        | j        | j        | j        | j                  | _	        | 
                    | j	        | j        | j                  }|                     d||          S )z
        Method to run after the FLSA_E class has been initialized to obtain the output matrices.

        Returns:
                  - Numpy array: prob_word_given_topic
                  - Numpy array: prob_topic_given_document
        )r   r   r   r   r   r]  r^   )r   r   r   )r   r   r    r"   ru  r   r   r   r   rp  r   r   r   r   )r0   r`  r   s      r2   r-   zFlsaE._get_matrices~  s     '+&L&LK!' '# #55n(;L 6    88$#& 9   00,<: 1   	r4   )rR  r   r>   rm  r   r   )rK  rL  rM  rN  r3   ru  r-   re  rf  s   @r2   rl  rl  &  sz         8       6"% "% "%H" " " " " " "r4   rl  )rN  r   collectionsr   r   r@  rM   numpyrz   scipy.sparse.linalgr   scipy.sparser   pyfumer   gensim.corporar8   gensim.models.coherencemodelr   gensim.modelsr   r
   rQ  rh  rl  rF   r4   r2   <module>r     s                     $ $ $ $ $ $ # # # # # #                   7 7 7 7 7 7 " " " " " "b b b b b b b bJ+Q Q Q Q Q9 Q Q QhT T T T TI T T Tnz z z z zI z z z z zr4   