
    cA                        d Z ddlmZ ddlZddlZddlZddlZddlZddlZddl	Z	ddl
Z
ddlZddlZddlZddlmZ ej        d         dk    rddlZddlmZ nddlmZ ddlmZ ej                            d          Zej                            de          Z	 ej                            ej                            ed	                    ZeZ ej         e!          Z"d
Z#dZ$d1dZ%d Z&d Z'e#dfdZ(d2dZ)d3dZ*d Z+d Z,d Z-d4dZ.e!dk    rA ej/        dej0        ej1                    ej2        dd          Z3e34                                Z5e56                    d d!d"dd#$           dZ7e56                    d%d&d"d'e7d()           e38                                Z9e9j:        0 e.e9j:        d         d*          Z;e")                    d+e;           dS e9j)        we9j)        d,k    r' e< ej=         e)d-          d./                     dS e9j)        e7k    r
 e)            n e)e9j)        0          Z> e< ej=        e>d./                     dS dS dS )5a'  
This module is an API for downloading, getting information and loading datasets/models.

See `RaRe-Technologies/gensim-data <https://github.com/RaRe-Technologies/gensim-data>`_ repo
for more information about models/datasets/how-to-add-new/etc.

Give information about available models/datasets:

.. sourcecode:: pycon

    >>> import gensim.downloader as api
    >>>
    >>> api.info()  # return dict with info about available models/datasets
    >>> api.info("text8")  # return dict with info about "text8" dataset


Model example:

.. sourcecode:: pycon

    >>> import gensim.downloader as api
    >>>
    >>> model = api.load("glove-twitter-25")  # load glove vectors
    >>> model.most_similar("cat")  # show words that similar to word 'cat'


Dataset example:

.. sourcecode:: pycon

    >>> import gensim.downloader as api
    >>> from gensim.models import Word2Vec
    >>>
    >>> dataset = api.load("text8")  # load dataset as iterable
    >>> model = Word2Vec(dataset)  # train w2v model


Also, this API available via CLI::

    python -m gensim.downloader --info <dataname> # same as api.info(dataname)
    python -m gensim.downloader --info name # same as api.info(name_only=True)
    python -m gensim.downloader --download <dataname> # same as api.load(dataname, return_path=True)

You may specify the local subdirectory for saving gensim data using the
GENSIM_DATA_DIR environment variable.  For example:

    $ export GENSIM_DATA_DIR=/tmp/gensim-data
    $ python -m gensim.downloader --download <dataname>

By default, this subdirectory is ~/gensim-data.

    )absolute_importN)partial   )urlopenz~/gensim-dataGENSIM_DATA_DIRz..zPhttps://raw.githubusercontent.com/RaRe-Technologies/gensim-data/master/list.jsonzBhttps://github.com/RaRe-Technologies/gensim-data/releases/download   c                    d}t          | |z            }t          t          j        ||z  |z                      }t	          |dz  |z  d          }d|z  d||z
  z  z   }	|dk    r}t
          j                            d|	d|dd	t	          |d
z  d          dt	          t          |          d
z  d          d
           t
          j                                         dS t
          j                            d|dz   d|d|	d|dd	t	          |d
z  d          dt	          t          |          d
z  d          d           t
          j                                         dS )a%  Reporthook for :func:`urllib.urlretrieve`, code from [1]_.

    Parameters
    ----------
    chunks_downloaded : int
        Number of chunks of data that have been downloaded.
    chunk_size : int
        Size of each chunk of data.
    total_size : int
        Total size of the dataset/model.
    part : int, optional
        Number of current part, used only if `no_parts` > 1.
    total_parts : int, optional
        Total number of parts.


    References
    ----------
    [1] https://gist.github.com/vladignatyev/06860ec2040cb497f0f3

    2   d   r   =-z[z] % i   /zMB downloadedz Part z [N)	floatintmathfloorroundsysstdoutwriteflush)
chunks_downloaded
chunk_size
total_sizeparttotal_partsbar_lensize_downloaded
filled_lenpercent_downloadedbars
             1lib/python3.11/site-packages/gensim/downloader.py	_progressr%   _   s   , G-
:;;OTZ?!:j HIIJJJ3!6* DqII


SGj$89
9Ca 
''o5q9999eJ'';7;;;;=	
 	
 	
 	

q+++sss,>,>o5q9999eJ'';7;;;;=	
 	
 	
 	
    c                     t           j                            t                    s	 t                              dt                     t          j        t                     dS # t          $ rh} | j        t          j	        k    r't          d                    t                              t          d                    t                              d} ~ ww xY wdS )a  Create the gensim-data directory in home directory, if it has not been already created.

    Raises
    ------
    Exception
        An exception is raised when read/write permissions are not available or a file named gensim-data
        already exists in the home directory.

    zCreating %sz^Not able to create folder gensim-data in {}. File gensim-data exists in the directory already.z{Can't create {}. Make sure you have the read/write permissions to the directory or you can try creating the folder manuallyN)ospathisdirBASE_DIRloggerinfomakedirsOSErrorerrnoEEXIST	Exceptionformat_PARENT_DIR)es    r$   _create_base_dirr6      s     7=="" 	KKx000K!!!!! 	 	 	w%,& 
77=vk7J7J  
  SVH%%  		 s   9A! !
C+A#CCc                     t          j                    }t          | d          5 t          fdd          D ]}|                    |           	 ddd           n# 1 swxY w Y   |                                S )zCalculate the checksum of the file, exactly same as md5-sum linux util.

    Parameters
    ----------
    fname : str
        Path to the file.

    Returns
    -------
    str
        MD5-hash of file names as `fname`.

    rbc                  .                          d          S )Ni   )read)fs   r$   <lambda>z)_calculate_md5_checksum.<locals>.<lambda>   s    !&&,, r&   r&   N)hashlibmd5openiterupdate	hexdigest)fnamehash_md5chunkr;   s      @r$   _calculate_md5_checksumrF      s     {}}H	eT		 #a....44 	# 	#EOOE""""	## # # # # # # # # # # # # # # s   ,AA"%A"zutf-8c                 Z   t           j                            t          d          }t	                       	 t          |                                           }t          |d          5 }|                    |           ddd           n# 1 swxY w Y   n3# t          t          f$ r t                              d| |           Y nw xY w	 t          j        |d|          5 }t          j        |          cddd           S # 1 swxY w Y   dS # t          $ r t!          d|z            w xY w)zLoad dataset information from the network.

    If the network access fails, fall back to a local cache.  This cache gets
    updated each time a network request _succeeds_.
    zinformation.jsonwbNzlcaught non-fatal exception while trying to update gensim-data cache from %r; using local cache at %r insteadr)encodingzPunable to read local cache %r during fallback, connect to the Internet and retry)r(   r)   joinr+   r6   r   r:   r?   r   r/   IOErrorr,   	exceptioniojsonload
ValueError)urlrJ   
cache_path
info_bytesfoutfins         r$   
_load_inforW      s    h(:;;J#S\\&&((
 *d## 	#tJJz"""	# 	# 	# 	# 	# 	# 	# 	# 	# 	# 	# 	# 	# 	# 	# W 

 

 

 	./2J	
 	
 	
 	
 	




 WZx888 	"C9S>>	" 	" 	" 	" 	" 	" 	" 	" 	" 	" 	" 	" 	" 	" 	" 	" 	" 	" 
 
 
02<=
 
 	

sS   !B &BBB-CCD D 3D  DD DD D*TFc                    t                      }| C|d         }|d         }| |v r|d         |          S | |v r|d         |          S t          d          |s|S |r=t          |d                                                   t          |d                   dS d |d                                         D             d |d                                         D             dS )a7  Provide the information related to model/dataset.

    Parameters
    ----------
    name : str, optional
        Name of model/dataset.  If not set - shows all available data.
    show_only_latest : bool, optional
        If storage contains different versions for one data/model, this flag allow to hide outdated versions.
        Affects only if `name` is None.
    name_only : bool, optional
        If True, will return only the names of available models and corpora.

    Returns
    -------
    dict
        Detailed information about one or all models/datasets.
        If name is specified, return full information about concrete dataset/model,
        otherwise, return information about all available datasets/models.

    Raises
    ------
    Exception
        If name that has been passed is incorrect.

    Examples
    --------
    .. sourcecode:: pycon

        >>> import gensim.downloader as api
        >>> api.info("text8")  # retrieve information about text8 dataset
        {u'checksum': u'68799af40b6bda07dfa47a32612e5364',
         u'description': u'Cleaned small sample from wikipedia',
         u'file_name': u'text8.gz',
         u'parts': 1,
         u'source': u'http://mattmahoney.net/dc/text8.zip'}
        >>>
        >>> api.info()  # retrieve information about all available datasets and models

    NcorporamodelsIncorrect model/corpus name)rY   rZ   c                 F    i | ]\  }}|                     d d          ||S latestTget.0namedatas      r$   
<dictcomp>zinfo.<locals>.<dictcomp>  s6    nnn<D$UYU]U]^fhlUmUmnD$nnnr&   c                 F    i | ]\  }}|                     d d          ||S r]   r_   ra   s      r$   re   zinfo.<locals>.<dictcomp>   s6    lll,4SWS[S[\dfjSkSkl4lllr&   )rW   rQ   listkeysitems)rc   show_only_latest	name_onlyinformationrY   rZ   s         r$   r-   r-      s   P ,,K <i(X&7? 	<y)$//V^ 	<x(..:;;;  gI 6 ; ; = =>>${[cOdJeJefff on;y3I3O3O3Q3Qnnnll+h2G2M2M2O2Olll  r&   c                 V   t                      }|d         }|d         }|2| |v r|d         |          d         S | |v r|d         |          d         S dS | |v r'|d         |          d                    |                   S | |v r'|d         |          d                    |                   S dS )a0  Retrieve the checksum of the model/dataset from gensim-data repository.

    Parameters
    ----------
    name : str
        Dataset/model name.
    part : int, optional
        Number of part (for multipart data only).

    Returns
    -------
    str
        Retrieved checksum of dataset/model.

    rY   rZ   Nchecksumzchecksum-{})r-   r3   )rc   r   rl   rY   rZ   s        r$   _get_checksumro   $  s      &&K)$G"F 	K7? 	;y)$/
;;V^ 	;x(.z::	; 	; 7? 	Ky)$/0D0DT0J0JKKV^ 	Kx(.}/C/CD/I/IJJ	K 	Kr&   c                     t                      }|d         }|d         }| |v r|d         |          d         S | |v r|d         |          d         S dS )zRetrieve the number of parts in which dataset/model has been split.

    Parameters
    ----------
    name: str
        Dataset/model name.

    Returns
    -------
    int
        Number of parts in which dataset/model has been split.

    rY   rZ   partsNr-   rc   rl   rY   rZ   s       r$   
_get_partsrt   C  sm     &&K)$G"Fw 49%d+G44	 48$T*7334 4r&   c           
      z   d                     t          |           }t          j                            t
          |           }|dz   }t          j                    }t          j                            |d          }t          j	        ||           t          |           }|dk    r/d                     |           }t          j                            ||          }t          d|          D ]#}	d	                     t          | |	
          }
d                     | |	          }t          j                            ||          }t          j	        |
|t          t          |	|                     t          |          t          | |	          k    r^t           j                            d           t           j                                         t(                              d|	dz   |           t-          j        |           t1          d          t3          |d          5 }t          d|          D ]}	t          j                            |d                     | |	                    }t3          |d          5 }t-          j        ||           ddd           n# 1 swxY w Y   t          j        |           	 ddd           n# 1 swxY w Y   n
d                     t          |           }
d                     |           }t          j                            ||          }t          j	        |
|t                     t          |          t          |           k    rYt           j                            d           t           j                                         t(                              d|            n#t-          j        |           t1          d          t          j                            |          rt          j        |           t-          j        ||           t          j        ||           dS )zDownload and extract the dataset/model.

    Parameters
    ----------
    name: str
        Dataset/model name which has to be downloaded.

    Raises
    ------
    Exception
        If md5sum on client and in repo are different.

    z{base}/{fname}/__init__.py)baserC   _tmpz__init__.pyr   z
{fname}.gz)rC   r   z!{base}/{fname}/{fname}.gz_0{part})rv   rC   r   z{f}.gz_0{p})r;   p)r   r   )
reporthook
zPart %s/%s downloadedz%Checksum comparison failed, try againrH   z{fname}.gz_0{part})rC   r   r8   Nz{base}/{fname}/{fname}.gzz%s downloaded)r3   DOWNLOAD_BASE_URLr(   r)   rK   r+   tempfilemkdtempurlliburlretrievert   ranger   r%   rF   ro   r   r   r   r   r,   r-   shutilrmtreer2   r?   copyfileobjremoveexistsmoverename)rc   url_load_filedata_folder_dirdata_folder_dir_tmptmp_dir	init_pathr   concatenated_folder_nameconcatenated_folder_dirr   url_datarC   dst_pathwfp	part_pathrfps                   r$   	_downloadr   Z  sA    177=NVZ7[[Mgll8T22O)F2  GWm44I
}i000T""KQ $E#/#6#6T#6#B#B "$',,w8P"Q"Q![)) 	I 	ID:AAGX`dkoAppH!((44(88Ew||GU33H("94[QQQ    'x00M$4M4MM I
  &&&
  """3TAX{KKKKg&&& GHHH)400 	%Ca-- % %GLL2F2M2MTX_c2M2d2dee	)T** 1c&sC0001 1 1 1 1 1 1 1 1 1 1 1 1 1 1	)$$$$	%	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	% /55;LTX5YY##$#//7<<//8X)DDDD"8,,d0C0CC 	EJT"""JKK....M'"""CDDD	w~~)** '
	%&&&
K,---I!?33333s7    AK8JKJK!J"KKKc                     t                      }|d         }|d         }| |v r|d         |          d         S | |v r|d         |          d         S dS )zRetrieve the filename of the dataset/model.

    Parameters
    ----------
    name: str
        Name of dataset/model.

    Returns
    -------
    str:
        Filename of the dataset/model.

    rY   rZ   	file_nameNrr   rs   s       r$   _get_filenamer     sm     &&K)$G"Fw 89%d+K88	 88$T*;778 8r&   c                    t                       t          |           }|t          d          t          j                            t          |           }t          j                            ||          }t          j                            |          st          |            |r|S t          j        
                    dt                     t          |           }|                                S )a  Download (if needed) dataset/model and load it to memory (unless `return_path` is set).

    Parameters
    ----------
    name: str
        Name of the model/dataset.
    return_path: bool, optional
        If True, return full path to file, otherwise, return loaded model / iterable dataset.

    Returns
    -------
    Model
        Requested model, if `name` is model and `return_path` == False.
    Dataset (iterable)
        Requested dataset, if `name` is dataset and `return_path` == False.
    str
        Path to file with dataset / model, only when `return_path` == True.

    Raises
    ------
    Exception
        Raised if `name` is incorrect.

    Examples
    --------
    Model example:

    .. sourcecode:: pycon

        >>> import gensim.downloader as api
        >>>
        >>> model = api.load("glove-twitter-25")  # load glove vectors
        >>> model.most_similar("cat")  # show words that similar to word 'cat'

    Dataset example:

    .. sourcecode:: pycon

        >>> import gensim.downloader as api
        >>>
        >>> wiki = api.load("wiki-en")  # load extracted Wikipedia dump, around 6 Gb
        >>> for article in wiki:  # iterate over all wiki script
        >>>     pass

    Download only example:

    .. sourcecode:: pycon

        >>> import gensim.downloader as api
        >>>
        >>> print(api.load("wiki-en", return_path=True))  # output: /home/user/gensim-data/wiki-en/wiki-en.gz

    Nr[   r   )r6   r   rQ   r(   r)   rK   r+   r   r   r   insert
__import__	load_data)rc   return_pathr   
folder_dirr)   modules         r$   rP   rP     s    l d##I 86777h--J7<<
I..D7>>*%% $ "8$$$D!!!!!r&   __main__z4%(asctime)s : %(name)s : %(levelname)s : %(message)s)r3   streamlevelzGensim console APIzCpython -m gensim.api.downloader  [-h] [-d data_name | -i data_name])descriptionusagez-dz
--download	data_namezFTo download a corpus/model : python -m gensim.downloader -d <dataname>)metavarnargshelpz-iz--info?zSTo get information about a corpus/model : python -m gensim.downloader -i <dataname>)r   r   constr   )r   z+Data has been installed and data path is %src   )rk      )indent)rc   )r   r   )NTF)N)F)?__doc__
__future__r   argparser(   rN   rO   loggingr   r0   r=   r   r   r|   	functoolsr   version_infor~   urllib2r   urllib.requestrequestr)   
expanduser_DEFAULT_BASE_DIRenvironr`   r+   abspathrK   r4   base_dir	getLogger__name__r,   DATA_LIST_URLr{   r%   r6   rF   rW   r-   ro   rt   r   r   rP   basicConfigr   INFOArgumentParserparseradd_mutually_exclusive_groupgroupadd_argumentfull_information
parse_argsargsdownload	data_pathprintdumpsoutput r&   r$   <module>r      s  3 3j ' & & & & &  				 				   



           A! 'MMM######&&&&&& G&&77 :>>+->?? goobgll8T::;; 		8	$	$bX * * * *Z  8     * !7 $
 $
 $
 $
N= = = =@K K K K>4 4 4.?4 ?4 ?4D8 8 8.D" D" D" D"N z 0GEcj`g`l    %X$(S  F
 //11E	lKqU    
 	h3>Nb    
 D} 0Dq)t<<<	A9MMMMM	 09 	0E*$*TTD111!<<<===== $	-= =XTTVVVDDdiDXDXDXFE*$*VA.../////=0 020 0r&   