
    c=                        d Z ddlZddlZddlZddlZddlZddlZddlmZ ddl	m
Z
 ddlmZmZmZmZmZmZ ddlZ ej        e          Zd-dZ	 	 d-d	Zd
 Zd.dZ G d de          Zedk    r^ ej        dej                    ej        ej        e dd                   Z  e!d ej"                    dz
            Z#e $                    dddd           e $                    ddd           e $                    ddde%e#            e $                    d!d"d#e%d            e $                    d$d%d&d'(           e &                                Z'e(                    d)d*)                    ej*                              ee'j+        e'j,        e'j-        e'j.        e'j/        +           e(                    d,ej*        d                    dS dS )/a  This script using for extracting plain text out of a raw Wikipedia dump. Input is an xml.bz2 file provided
by MediaWiki that looks like <LANG>wiki-<YYYYMMDD>-pages-articles.xml.bz2 or <LANG>wiki-latest-pages-articles.xml.bz2
(e.g. 14 GB of https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2).

It streams through all the XML articles using multiple cores (#cores - 1, by default),
decompressing on the fly and extracting plain text from the articles and their sections.

For each extracted article, it prints its title, section names and plain text section contents, in json-line format.
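
For illustration, each output line is a standalone JSON object. A heavily shortened, made-up example
(the "interlinks" field is present only when interlink extraction is requested)::

    {"title": "Anarchism",
     "section_titles": ["Introduction", "History"],
     "section_texts": ["Anarchism is a political philosophy ...", "..."],
     "interlinks": [["Political philosophy", "political philosophy"]]}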

How to use
----------
#. Process a Wikipedia dump with this script ::

    python -m gensim.scripts.segment_wiki -i -f enwiki-latest-pages-articles.xml.bz2 -o enwiki-latest.json.gz

#. Read the output in a simple way:

.. sourcecode:: pycon

    >>> from gensim import utils
    >>> import json
    >>>
    >>> # iterate over the plain text data we just created
    >>> with utils.open('enwiki-latest.json.gz', 'rb') as f:
    >>>     for line in f:
    >>>         # decode each JSON line into a Python dictionary object
    >>>         article = json.loads(line)
    >>>
    >>>         # each article has a "title", a mapping of interlinks and a list of "section_titles" and
    >>>         # "section_texts".
    >>>         print("Article title: %s" % article['title'])
    >>>         print("Interlinks: %s" % article['interlinks'])
    >>>         for section_title, section_text in zip(article['section_titles'], article['section_texts']):
    >>>             print("Section title: %s" % section_title)
    >>>             print("Section text: %s" % section_text)
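
#. Or drive the same extraction from Python instead of the shell. A minimal sketch using this module's
   `segment_all_articles` helper (the dump path below is illustrative):

.. sourcecode:: pycon

    >>> from gensim.scripts.segment_wiki import segment_all_articles
    >>>
    >>> # stream (title, [(section_title, section_text), ...]) tuples straight from the compressed dump
    >>> for title, sections in segment_all_articles('enwiki-latest-pages-articles.xml.bz2'):
    >>>     print(title, len(sections))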


Notes
-----
Processing the entire English Wikipedia dump takes 1.7 hours (about 3 million articles per hour,
or 10 MB of XML per second) on an 8-core Intel i7-7700 @ 3.60 GHz.


Command line arguments
----------------------

.. program-output:: python -m gensim.scripts.segment_wiki --help
   :ellipsis: 0, -10

"""

import argparse
import json
import logging
import multiprocessing
import re
import sys

from functools import partial
from xml.etree import ElementTree

from gensim.corpora.wikicorpus import (
    IGNORED_NAMESPACES, WikiCorpus, filter_wiki, find_interlinks, get_namespace, utils,
)
import gensim.utils

logger = logging.getLogger(__name__)


def segment_all_articles(file_path, min_article_character=200, workers=None, include_interlinks=False):
    """Extract article titles and sections from a MediaWiki bz2 database dump.

    Parameters
    ----------
    file_path : str
        Path to MediaWiki dump, typical filename is <LANG>wiki-<YYYYMMDD>-pages-articles.xml.bz2
        or <LANG>wiki-latest-pages-articles.xml.bz2.

    min_article_character : int, optional
        Minimum number of characters an article must contain to be kept
        (section titles and surrounding whitespace are not counted).

    workers: int or None
        Number of parallel workers, max(1, multiprocessing.cpu_count() - 1) if None.

    include_interlinks: bool
        Whether or not interlinks should be included in the output.

    Yields
    ------
    (str, list of (str, str), (Optionally) list of (str, str))
        Structure contains (title, [(section_heading, section_content), ...],
        (Optionally) [(interlink_article, interlink_text), ...]).

    """
    with gensim.utils.open(file_path, 'rb') as xml_fileobj:
        wiki_sections_corpus = _WikiSectionsCorpus(
            xml_fileobj, min_article_character=min_article_character, processes=workers,
            include_interlinks=include_interlinks)
        wiki_sections_corpus.metadata = True
        wiki_sections_text = wiki_sections_corpus.get_texts_with_sections()
        for article in wiki_sections_text:
            yield article


def segment_and_write_all_articles(file_path, output_file, min_article_character=200, workers=None,
                                   include_interlinks=False):
    """Write article titles and sections to `output_file` (or stdout, if `output_file` is None).

    The output format is one article per line, in json-line format with 4 fields::

        'title' - title of article,
        'section_titles' - list of titles of sections,
        'section_texts' - list of content from sections,
        (Optional) 'interlinks' - list of interlinks in the article.

    Parameters
    ----------
    file_path : str
        Path to MediaWiki dump, typical filename is <LANG>wiki-<YYYYMMDD>-pages-articles.xml.bz2
        or <LANG>wiki-latest-pages-articles.xml.bz2.

    output_file : str or None
        Path to output file in json-lines format, or None for printing to stdout.

    min_article_character : int, optional
        Minimum number of characters an article must contain to be kept
        (section titles and surrounding whitespace are not counted).

    workers: int or None
        Number of parallel workers, max(1, multiprocessing.cpu_count() - 1) if None.

    include_interlinks: bool
        Whether or not interlinks should be included in the output.

    """
    if output_file is None:
        # write bytes to stdout: use the underlying binary buffer when available (Python 3)
        outfile = getattr(sys.stdout, 'buffer', sys.stdout)
    else:
        outfile = gensim.utils.open(output_file, 'wb')

    try:
        article_stream = segment_all_articles(
            file_path, min_article_character, workers=workers, include_interlinks=include_interlinks)
        for idx, article in enumerate(article_stream):
            article_title, article_sections = article[0], article[1]
            if include_interlinks:
                interlinks = article[2]

            output_data = {
                "title": article_title,
                "section_titles": [],
                "section_texts": [],
            }
            if include_interlinks:
                output_data["interlinks"] = interlinks

            for section_heading, section_content in article_sections:
                output_data["section_titles"].append(section_heading)
                output_data["section_texts"].append(section_content)

            if (idx + 1) % 100000 == 0:
                logger.info("processed #%d articles (at %r now)", idx + 1, article_title)
            outfile.write((json.dumps(output_data) + "\n").encode('utf-8'))
    finally:
        if output_file is not None:
            outfile.close()


def extract_page_xmls(f):
    """Extract pages from a MediaWiki database dump.

    Parameters
    ----------
    f : file
        File descriptor of MediaWiki dump.

    Yields
    ------
    str
        XML strings for page tags.

    """
    elems = (elem for _, elem in ElementTree.iterparse(f, events=("end",)))

    elem = next(elems)
    namespace = get_namespace(elem.tag)
    ns_mapping = {"ns": namespace}
    page_tag = "{%(ns)s}page" % ns_mapping

    for elem in elems:
        if elem.tag == page_tag:
            yield ElementTree.tostring(elem)
            # prune the parsed element to keep memory usage flat while streaming the dump
            elem.clear()


def segment(page_xml, include_interlinks=False):
    """Parse the content inside a page tag.

    Parameters
    ----------
    page_xml : str
        Content from page tag.

    include_interlinks : bool
        Whether or not interlinks should be parsed.

    Returns
    -------
    (str, list of (str, str), (Optionally) list of (str, str))
        Structure contains (title, [(section_heading, section_content), ...],
        (Optionally) [(interlink_article, interlink_text), ...]).

    """
    elem = ElementTree.fromstring(page_xml)
    filter_namespaces = ('0',)
    namespace = get_namespace(elem.tag)
    ns_mapping = {"ns": namespace}
    text_path = "./{%(ns)s}revision/{%(ns)s}text" % ns_mapping
    title_path = "./{%(ns)s}title" % ns_mapping
    ns_path = "./{%(ns)s}ns" % ns_mapping
    lead_section_heading = "Introduction"
    top_level_heading_regex = r"\n==[^=].*[^=]==\n"
    top_level_heading_regex_capture = r"\n==([^=].*[^=])==\n"

    title = elem.find(title_path).text
    text = elem.find(text_path).text
    ns = elem.find(ns_path).text
    if ns not in filter_namespaces:
        # only the main (article) namespace is kept; other pages yield no sections
        text = None

    if text is not None:
        if include_interlinks:
            interlinks = find_interlinks(text)
        section_contents = re.split(top_level_heading_regex, text)
        section_headings = [lead_section_heading] + re.findall(top_level_heading_regex_capture, text)
        section_headings = [heading.strip() for heading in section_headings]
        assert len(section_contents) == len(section_headings)
    else:
        interlinks = []
        section_contents = []
        section_headings = []

    section_contents = [filter_wiki(section_content) for section_content in section_contents]
    sections = list(zip(section_headings, section_contents))

    if include_interlinks:
        return title, sections, interlinks
    else:
        return title, sections


class _WikiSectionsCorpus(WikiCorpus):
    """Treat a Wikipedia articles dump (<LANG>wiki-<YYYYMMDD>-pages-articles.xml.bz2
    or <LANG>wiki-latest-pages-articles.xml.bz2) as a (read-only) corpus.

    The documents are extracted on-the-fly, so that the whole (massive) dump can stay compressed on disk.

    """
    def __init__(self, fileobj, min_article_character=200, processes=None,
                 lemmatize=None, filter_namespaces=('0',), include_interlinks=False):
        """
        Parameters
        ----------
        fileobj : file
            File descriptor of MediaWiki dump.
        min_article_character : int, optional
            Minimum number of characters an article must contain to be kept
            (section titles and surrounding whitespace are not counted).
        processes : int, optional
            Number of processes, max(1, multiprocessing.cpu_count() - 1) if None.
        filter_namespaces : tuple of str, optional
            Enumeration of namespaces that will be ignored.
        include_interlinks: bool
            Whether or not interlinks should be included in the output.

        """
        if lemmatize is not None:
            raise NotImplementedError(
                'The lemmatize parameter is no longer supported since Gensim 4.0.0. '
                'If you need to lemmatize, use e.g. https://github.com/clips/pattern '
                'to preprocess your corpus before submitting it to Gensim.'
            )
        self.fileobj = fileobj
        self.filter_namespaces = filter_namespaces
        self.metadata = False
        if processes is None:
            processes = max(1, multiprocessing.cpu_count() - 1)
        self.processes = processes
        self.min_article_character = min_article_character
        self.include_interlinks = include_interlinks

    def get_texts_with_sections(self):
        """Iterate over the dump, returning titles and text versions of all sections of articles.

        Notes
        -----
        Only articles of sufficient length are returned (short articles, redirects,
        etc. are ignored).

        Note that this iterates over the **texts**; if you want vectors, just use
        the standard corpus interface instead of this function:

        .. sourcecode:: pycon

            >>> for vec in wiki_corpus:
            >>>     print(vec)

        Yields
        ------
        (str, list of (str, str), (Optionally) list of (str, str))
            Structure contains (title, [(section_heading, section_content), ...],
            (Optionally) [(interlink_article, interlink_text), ...]).

        """
        skipped_namespace, skipped_length, skipped_redirect = 0, 0, 0
        total_articles, total_sections = 0, 0
        page_xmls = extract_page_xmls(self.fileobj)
        pool = multiprocessing.Pool(self.processes)
        # process the dump in smaller chunks of pages, so that the worker pool
        # never has to hold the entire (decompressed) input in RAM at once
        for group in utils.chunkize(page_xmls, chunksize=10 * self.processes, maxsize=1):
            for article in pool.imap(partial(segment, include_interlinks=self.include_interlinks), group):
                article_title, sections = article[0], article[1]

                # drop non-article pages ("Wikipedia:", "Template:", ... namespaces)
                if any(article_title.startswith(ignore + ':') for ignore in IGNORED_NAMESPACES):
                    skipped_namespace += 1
                    continue
                # drop redirect pages
                if not sections or sections[0][1].lstrip().lower().startswith("#redirect"):
                    skipped_redirect += 1
                    continue
                # drop stubs (incomplete, very short articles)
                if sum(len(body.strip()) for (_, body) in sections) < self.min_article_character:
                    skipped_length += 1
                    continue
                total_articles += 1
                total_sections += len(sections)

                if self.include_interlinks:
                    interlinks = article[2]
                    yield (article_title, sections, interlinks)
                else:
                    yield (article_title, sections)

        logger.info(
            "finished processing %i articles with %i sections (skipped %i redirects, %i stubs, %i ignored namespaces)",
            total_articles, total_sections, skipped_redirect, skipped_length, skipped_namespace)
        pool.terminate()

        self.length = total_articles  # cache corpus length


if __name__ == "__main__":
    logging.basicConfig(format='%(asctime)s - %(module)s - %(levelname)s - %(message)s', level=logging.INFO)
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawTextHelpFormatter,
        description=__doc__,
    )
    default_workers = max(1, multiprocessing.cpu_count() - 1)
    parser.add_argument('-f', '--file', help='Path to MediaWiki database dump (read-only).', required=True)
    parser.add_argument(
        '-o', '--output',
        help='Path to output file (stdout if not specified). If ends in .gz or .bz2, '
             'the output file will be automatically compressed (recommended!).')
    parser.add_argument(
        '-w', '--workers',
        help='Number of parallel workers for multi-core systems. Default: %(default)s.',
        type=int, default=default_workers)
    parser.add_argument(
        '-m', '--min-article-character',
        help="Ignore articles with fewer characters than this (article stubs). Default: %(default)s.",
        type=int, default=200)
    parser.add_argument(
        '-i', '--include-interlinks',
        help='Include a mapping for interlinks to other articles in the dump. The mappings format is: '
             '"interlinks": [("article_title_1", "interlink_text_1"), ("article_title_2", "interlink_text_2"), ...]',
        action='store_true')
    args = parser.parse_args()

    logger.info("running %s", " ".join(sys.argv))
    segment_and_write_all_articles(
        args.file, args.output,
        min_article_character=args.min_article_character,
        workers=args.workers,
        include_interlinks=args.include_interlinks,
    )

    logger.info("finished running %s", sys.argv[0])