
    c                        d Z ddlZddlZddlZddlmZmZmZm	Z	 ddl
mZ dZedk    rej                            ej        d                   Z ej        e          Z ej        d           ej                            ej        	           e                    d
d                    ej                              eej                  dk     r4 e e            d          e            z              ej        d           ej        dd         \  ZZ ej        !                    ej        "                    e                     s e#d           eej                  dk    r e$ej        d                   Z%neZ%dev Z&devZ'e&r ee%e'          Z(de(_)         e	ee(          Z* ej+        e dz   e*dd           e(,                    dde           e(-                    e dz              e*.                    e dz              de(_)        no e	e          Z*e*j(        ,                    dde            ej+        e dz   e*dd           e*j(        -                    e dz               ej/        e dz             Z([* ee dz             Z0 ee0e(d          Z1e1.                    e dz               ej+        e d z   e1e0         d!           e                    d"e           dS dS )#a  
USAGE: %(program)s WIKI_XML_DUMP OUTPUT_PREFIX [VOCABULARY_SIZE]

Convert articles from a Wikipedia dump to (sparse) vectors. The input is a
bz2-compressed dump of Wikipedia articles, in XML format.

This actually creates several files:

* `OUTPUT_PREFIX_wordids.txt.bz2`: mapping between words and their integer ids
* `OUTPUT_PREFIX_bow.mm`: bag-of-words (word counts) representation in Matrix Market format
* `OUTPUT_PREFIX_bow.mm.index`: index for `OUTPUT_PREFIX_bow.mm`
* `OUTPUT_PREFIX_bow.mm.metadata.cpickle`: titles of documents
* `OUTPUT_PREFIX_tfidf.mm`: TF-IDF representation in Matrix Market format
* `OUTPUT_PREFIX_tfidf.mm.index`: index for `OUTPUT_PREFIX_tfidf.mm`
* `OUTPUT_PREFIX.tfidf_model`: TF-IDF model

The output Matrix Market files can then be compressed (e.g., by bzip2) to save
disk space; gensim's corpus iterators can work with compressed input, too.

`VOCABULARY_SIZE` controls how many of the most frequent words to keep (after
removing tokens that appear in more than 10%% of all documents). Defaults to
100,000.

Example:
  python -m gensim.scripts.make_wikicorpus ~/gensim/results/enwiki-latest-pages-articles.xml.bz2 ~/gensim/results/wiki
    N)
DictionaryHashDictionaryMmCorpus
WikiCorpus)
TfidfModeli __main__z)%(asctime)s : %(levelname)s : %(message)s)format)levelz
running %s    __doc__   zOError: The output directory does not exist. Create the directory and try again.onlinenodebug)id_rangedebugT)
dictionaryz_bow.mmi'  )progress_cntmetadata   g?)no_belowno_abovekeep_nz_wordids.txt.bz2z_corpus.pkl.bz2F)id2word	normalizez.tfidf_modelz	_tfidf.mm)r   zfinished running %s)2r   loggingos.pathossysgensim.corporar   r   r   r   gensim.modelsr   DEFAULT_DICT_SIZE__name__pathbasenameargvprogram	getLoggerloggerbasicConfigrootsetLevelINFOinfojoinlenprintglobalslocalsexitinpoutpisdirdirname
SystemExitint
keep_wordsr   r   r   allow_updatewiki	serializefilter_extremessave_as_textsaveload_from_textmmtfidf     8lib/python3.11/site-packages/gensim/scripts/make_wiki.py<module>rH      s   8   



 K K K K K K K K K K K K $ $ $ $ $ $   z :0gsx{++GWw''FGJKKKKL---
KKchhsx00111 s38}}q ggii	"VVXX-...1IC7==..// ljjkkk
s38}}q 'S!%%

&
 FW$E J#^ZuEEE
"&
z#*5554)+TPTUUUU""BEV"WWW'9 9:::		$**+++"'
z#''cJ['\\\4)+TPTUUUU$$T,>%>??? /Z.t6H/HII
 
$"	#	#B Jr:>>>E	JJtn$%%% Htk)595IIII
KK%w/////u:0 :0rF   