
    NgmL                       d dl mZ d dlZd dlZd dlZd dlmZmZmZm	Z	 d dl
mZmZmZmZ d dlmZ d dlmZmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZm Z m!Z! d dl"m#Z# d dl$m%Z% d dl&m'Z'm(Z(m)Z)m*Z*m+Z+m,Z, d dl-m.Z.m/Z/ d dl0m1Z1m2Z2 d dl3m4Z4m5Z5m6Z6m7Z7 erd dl8m9Z9m:Z: d dl;m<Z< d dl=m>Z>  ej~                  e@      ZA ed      ZB G d d      ZC G d d      ZD G d deD      ZEy)    )annotationsN)TYPE_CHECKINGAnyTypeVarcast)DeferredDeferredListinlineCallbacksmaybeDeferred)verifyClass)Spidersignals)AddonManagerExecutionEngine)ExtensionManager)ISpiderLoader)LogFormatter)BaseSettingsSettingsoverridden_settings)SignalManager)StatsCollector)LogCounterHandlerconfigure_loggingget_scrapy_root_handlerinstall_scrapy_root_handlerlog_reactor_infolog_scrapy_info)build_from_crawlerload_object)install_shutdown_handlerssignal_names)install_reactoris_asyncio_reactor_installed#verify_installed_asyncio_event_loopverify_installed_reactor)	GeneratorIterable)SpiderLoader)RequestFingerprinter_Tc                      e Zd Z	 	 d	 	 	 	 	 ddZddZddZedd       ZddZddZ	edd       Z
e	 	 	 	 	 	 dd	       Zdd
ZddZddZddZddZy)CrawlerNc                   t        |t              rt        d      t        |t              s|t	        |      }|| _        |j                         | _        | j
                  j                  | j                         | j                          t        |       | _        t        |       | _        || _        d| _        d| _        d | _        d | _        d | _        d | _        d | _        d | _        y )Nz5The spidercls argument must be a class, not an objectF)
isinstancer   
ValueErrordictr   	spiderclscopysettingsupdate_settings_update_root_log_handlerr   addonsr   r   _init_reactorcrawling_started
extensionsstatslogformatterrequest_fingerprinterspiderengine)selfr3   r5   init_reactors       .lib/python3.12/site-packages/scrapy/crawler.py__init__zCrawler.__init__7   s     i(TUUh%)9)H'0"*--/&&t}}5%%'$0$6&3D&9#/##37,0
15BF"%).2    c                D    t               t        | j                         y y N)r   r   r5   rB   s    rD   r7   z Crawler._update_root_log_handlerV   s    "$0'6 1rF   c                \   | j                   j                  ry | j                  j                  | j                           t	        | j                   d         |       | _        t        | | j                   j                  d            t        j                  j                         fd| _        | j                  j                  | j                  t        j                         t	        | j                   d         }|j                  |       | _        t#        t	        | j                   d         |       | _        | j                   d   }| j                   d   }| j&                  r|rt)        ||       nd	d
lm} t/                |r,t1        |       t3               r|rt5        |       t/                t7        j                  |       | _        | j                   j;                          t=        t?        | j                               }t@        jC                  ddtE        jF                  |      i       y )NSTATS_CLASS	LOG_LEVEL)levelc                 B    t         j                  j                         S rH   )loggingrootremoveHandler)handlers   rD   <lambda>z)Crawler._apply_settings.<locals>.<lambda>f   s    (B(B7(KrF   LOG_FORMATTERREQUEST_FINGERPRINTER_CLASSTWISTED_REACTORASYNCIO_EVENT_LOOPr   reactorz!Overridden settings:
%(settings)sr5   )$r5   frozenr8   load_settingsr!   r=   r   getrO   rP   
addHandler_Crawler__remove_handlerr   connectengine_stoppedfrom_crawlerr>   r    r?   r9   r$   twisted.internetrY   r   r'   r%   r&   r   r<   freezer2   r   loggerinfopprintpformat)rB   lf_clsreactor_class
event_looprY   drR   s         @rD   _apply_settingszCrawler._apply_settings[   s   ==!!$--0>[}!=>tD
#D0A0A+0NO( !LT22G4J4JK%01O%P"//5%7&CDE&
"
 "]]+<=--(<=
 z:4$]3+-*3J?*77=$T]]340:v~~a?P2Q	
rF   c              /  p  K   | j                   rt        d      | j                  rt        d      dx| _         | _        	  | j                  |i || _        | j                          | j                          | j                         | _        t        | j                  j                               }| j                  j                  | j                  |       t        | j                  j                         y # t        $ r1 d| _         | j                  | j                  j                           w xY ww)NzCrawling already taking placez?Cannot run Crawler.crawl() more than once on the same instance.TF)r:   RuntimeErrorr;   _create_spiderr@   rl   r7   _create_enginerA   iterstart_requestsopen_spiderr   start	Exceptionclose)rB   argskwargsrr   s       rD   crawlzCrawler.crawl   s    ==>??==Q  )-,	-$--t>v>DK  "))+--/DK!$++"<"<">?N++))$++~FF 1 122 	!DM{{&kk''))		s   =D6 B8C9 8D69:D33D6c                B     | j                   j                  | g|i |S rH   )r3   ra   )rB   rw   rx   s      rD   ro   zCrawler._create_spider   s#    *t~~**4A$A&AArF   c                "     t          fd      S )Nc                $    j                         S rH   )stop)_rB   s    rD   rS   z(Crawler._create_engine.<locals>.<lambda>   s    tyy{rF   r   rI   s   `rD   rp   zCrawler._create_engine   s    t%:;;rF   c              #     K   | j                   r7d| _         | j                  sJ t        | j                  j                         yyw)zoStarts a graceful stop of the crawler and returns a deferred that is
        fired when the crawler is stopped.FN)r:   rA   r   r}   rI   s    rD   r}   zCrawler.stop   s<      ==!DM;;; 0 011 s   AAc                2    |D ]  }t        ||       s|c S  y rH   )r0   )component_class
components	components      rD   _get_componentzCrawler._get_component   s#     $I)_5   $ rF   c                N    | j                  || j                  j                        S )zReturn the run-time instance of an :ref:`add-on <topics-addons>` of
        the specified class or a subclass, or ``None`` if none is found.

        .. versionadded:: 2.12
        )r   r8   rB   clss     rD   	get_addonzCrawler.get_addon   s!     ""3(:(:;;rF   c                    | j                   st        d      | j                  || j                   j                  j                  j
                        S )ax  Return the run-time instance of a :ref:`downloader middleware
        <topics-downloader-middleware>` of the specified class or a subclass,
        or ``None`` if none is found.

        .. versionadded:: 2.12

        This method can only be called after the crawl engine has been created,
        e.g. at signals :signal:`engine_started` or :signal:`spider_opened`.
        z_Crawler.get_downloader_middleware() can only be called after the crawl engine has been created.)rA   rn   r   
downloader
middlewaremiddlewaresr   s     rD   get_downloader_middlewarez!Crawler.get_downloader_middleware   sH     {{5  ""3(>(>(I(I(U(UVVrF   c                |    | j                   st        d      | j                  || j                   j                        S )ao  Return the run-time instance of an :ref:`extension
        <topics-extensions>` of the specified class or a subclass,
        or ``None`` if none is found.

        .. versionadded:: 2.12

        This method can only be called after the extension manager has been
        created, e.g. at signals :signal:`engine_started` or
        :signal:`spider_opened`.
        zXCrawler.get_extension() can only be called after the extension manager has been created.)r<   rn   r   r   r   s     rD   get_extensionzCrawler.get_extension   s<     6  ""3(C(CDDrF   c                    | j                   st        d      | j                  || j                   j                  j                  j
                        S )ah  Return the run-time instance of a :ref:`item pipeline
        <topics-item-pipeline>` of the specified class or a subclass, or
        ``None`` if none is found.

        .. versionadded:: 2.12

        This method can only be called after the crawl engine has been created,
        e.g. at signals :signal:`engine_started` or :signal:`spider_opened`.
        zWCrawler.get_item_pipeline() can only be called after the crawl engine has been created.)rA   rn   r   scraperitemprocr   r   s     rD   get_item_pipelinezCrawler.get_item_pipeline   H     {{1  ""3(;(;(D(D(P(PQQrF   c                    | j                   st        d      | j                  || j                   j                  j                  j
                        S )ap  Return the run-time instance of a :ref:`spider middleware
        <topics-spider-middleware>` of the specified class or a subclass, or
        ``None`` if none is found.

        .. versionadded:: 2.12

        This method can only be called after the crawl engine has been created,
        e.g. at signals :signal:`engine_started` or :signal:`spider_opened`.
        z[Crawler.get_spider_middleware() can only be called after the crawl engine has been created.)rA   rn   r   r   spidermwr   r   s     rD   get_spider_middlewarezCrawler.get_spider_middleware   r   rF   NF)r3   ztype[Spider]r5    dict[str, Any] | Settings | NonerC   bool)returnNone)rw   r   rx   r   r   #Generator[Deferred[Any], Any, None])rw   r   rx   r   r   r   )r   r   r   r   )r   type[_T]r   zIterable[Any]r   	_T | None)r   r   r   r   )__name__
__module____qualname__rE   r7   rl   r
   ry   ro   rp   r}   staticmethodr   r   r   r   r   r    rF   rD   r.   r.   6   s     6:"	33 33 	3>7
-
^  .B< 2 2 !/<	 <W"E$R"RrF   r.   c                      e Zd ZdZ ed d      Zedd       ZdddZ	 	 	 	 	 	 	 	 ddZ	dd	Z
	 	 	 	 dd
ZddZddZedd       Zy)CrawlerRunnera  
    This is a convenient helper class that keeps track of, manages and runs
    crawlers inside an already setup :mod:`~twisted.internet.reactor`.

    The CrawlerRunner object must be instantiated with a
    :class:`~scrapy.settings.Settings` object.

    This class shouldn't be needed (since Scrapy is responsible of using it
    accordingly) unless writing scripts that manually handle the crawling
    process. See :ref:`run-from-script` for an example.
    c                    | j                   S rH   )	_crawlersrI   s    rD   rS   zCrawlerRunner.<lambda>  s    T^^rF   zeSet of :class:`crawlers <scrapy.crawler.Crawler>` started by :meth:`crawl` and managed by this class.)docc                    | j                  d      }t        |      }t        t        |       t	        d|j                  | j                                     S )z'Get SpiderLoader instance from settingsSPIDER_LOADER_CLASSr*   )r\   r!   r   r   r   from_settings
frozencopy)r5   cls_path
loader_clss      rD   _get_spider_loaderz CrawlerRunner._get_spider_loader  sH     << 56 *
M:.NJ$<$<X=P=P=R$STTrF   Nc                    t        |t              s|t        |      }|| _        | j	                  |      | _        t               | _        t               | _        d| _	        y r   )
r0   r2   r   r5   r   spider_loadersetr   _activebootstrap_failed)rB   r5   s     rD   rE   zCrawlerRunner.__init__#  sO    h%)9)H"*+/+B+B8+L'*u,/E %rF   c                    t        |t              rt        d      | j                  |      } | j                  |g|i |S )ae  
        Run a crawler with the provided arguments.

        It will call the given Crawler's :meth:`~Crawler.crawl` method, while
        keeping track of it so it can be stopped later.

        If ``crawler_or_spidercls`` isn't a :class:`~scrapy.crawler.Crawler`
        instance, this method will try to create one using this parameter as
        the spider class given to it.

        Returns a deferred that is fired when the crawling is finished.

        :param crawler_or_spidercls: already created crawler, or a spider class
            or spider's name inside the project to create it
        :type crawler_or_spidercls: :class:`~scrapy.crawler.Crawler` instance,
            :class:`~scrapy.spiders.Spider` subclass or string

        :param args: arguments to initialize the spider

        :param kwargs: keyword arguments to initialize the spider
        lThe crawler_or_spidercls argument cannot be a spider object, it must be a spider class (or a Crawler object))r0   r   r1   create_crawler_crawl)rB   crawler_or_spiderclsrw   rx   crawlers        rD   ry   zCrawlerRunner.crawl,  sP    6 *F3B  %%&:;t{{74T4V44rF   c                      j                   j                          j                  |i | j                  j                         d fd}j	                  |      S )Nc                    j                   j                         j                  j                         xj                  t	        dd        z  c_        | S )Nr@   )crawlersdiscardr   r   getattr)resultr   rk   rB   s    rD   _donez#CrawlerRunner._crawl.<locals>._doneT  sI    MM!!'*LL  #!!(D)I%II!MrF   )r   r,   r   r,   )r   addry   r   addBoth)rB   r   rw   rx   r   rk   s   ``   @rD   r   zCrawlerRunner._crawlO  sR    '"GMM4*6*	 yyrF   c                ~    t        |t              rt        d      t        |t              r|S | j	                  |      S )a  
        Return a :class:`~scrapy.crawler.Crawler` object.

        * If ``crawler_or_spidercls`` is a Crawler, it is returned as-is.
        * If ``crawler_or_spidercls`` is a Spider subclass, a new Crawler
          is constructed for it.
        * If ``crawler_or_spidercls`` is a string, this function finds
          a spider with this name in a Scrapy project (using spider loader),
          then creates a Crawler instance for it.
        r   )r0   r   r1   r.   _create_crawler)rB   r   s     rD   r   zCrawlerRunner.create_crawler\  sG     *F3B  *G4''##$899rF   c                    t        |t              r| j                  j                  |      }t	        || j
                        S rH   )r0   strr   loadr.   r5   )rB   r3   s     rD   r   zCrawlerRunner._create_crawlerr  s3    i%**//	:Iy$--00rF   c                    t        t        | j                        D cg c]  }|j                          c}      S c c}w )z
        Stops simultaneously all the crawling jobs taking place.

        Returns a deferred that is fired when they all have ended.
        )r	   listr   r}   )rB   cs     rD   r}   zCrawlerRunner.stopw  s2     tDMM/BC/B!QVVX/BCDDCs   ;c              #  n   K   | j                   r%t        | j                          | j                   r$yyw)z
        join()

        Returns a deferred that is fired when all managed :attr:`crawlers` have
        completed their executions.
        N)r   r	   rI   s    rD   joinzCrawlerRunner.join  s'      llt||,, lls   055)r5   r   r   r*   rH   )r5   r   )r   type[Spider] | str | Crawlerrw   r   rx   r   r   Deferred[None])r   r.   rw   r   rx   r   r   r   )r   r   r   r.   )r3   zstr | type[Spider]r   r.   r   zDeferred[Any]r   )r   r   r   __doc__propertyr   r   r   rE   ry   r   r   r   r}   r
   r   r   rF   rD   r   r     s    
 #3H U U&!5:!5 !5 	!5
 
!5F :$@:	:,1
E - -rF   r   c                  p     e Zd ZdZ	 	 d		 	 	 d
 fdZddZddZddZ	 d	 	 	 	 	 ddZddZ	dddZ
 xZS )CrawlerProcessa  
    A class to run multiple scrapy crawlers in a process simultaneously.

    This class extends :class:`~scrapy.crawler.CrawlerRunner` by adding support
    for starting a :mod:`~twisted.internet.reactor` and handling shutdown
    signals, like the keyboard interrupt command Ctrl-C. It also configures
    top-level logging.

    This utility should be a better fit than
    :class:`~scrapy.crawler.CrawlerRunner` if you aren't running another
    :mod:`~twisted.internet.reactor` within your application.

    The CrawlerProcess object must be instantiated with a
    :class:`~scrapy.settings.Settings` object.

    :param install_root_handler: whether to install root logging handler
        (default: True)

    This class shouldn't be needed (since Scrapy is responsible of using it
    accordingly) unless writing scripts that manually handle the crawling
    process. See :ref:`run-from-script` for an example.
    c                    t         |   |       t        | j                  |       t	        | j                         d| _        y r   )superrE   r   r5   r   _initialized_reactor)rB   r5   install_root_handler	__class__s      rD   rE   zCrawlerProcess.__init__  s5    
 	"$--)=>&*/!rF   c                    ddl m} t        | j                         t        |   }t
        j                  dd|i       |j                  | j                         y )Nr   rX   zDReceived %(signame)s, shutting down gracefully. Send again to force signame)	rb   rY   r"   _signal_killr#   rd   re   callFromThread_graceful_stop_reactorrB   signumr~   rY   r   s        rD   _signal_shutdownzCrawlerProcess._signal_shutdown  sJ    ,!$"3"34v&R 	
 	t::;rF   c                    ddl m} t        t        j                         t
        |   }t        j                  dd|i       |j                  | j                         y )Nr   rX   z4Received %(signame)s twice, forcing unclean shutdownr   )
rb   rY   r"   signalSIG_IGNr#   rd   re   r   _stop_reactorr   s        rD   r   zCrawlerProcess._signal_kill  sH    ,!&..1v&BYPWDX	
 	t112rF   c                    t        |t              r| j                  j                  |      }| j                   }d| _        t        || j                  |      S )NT)rC   )r0   r   r   r   r   r.   r5   )rB   r3   rC   s      rD   r   zCrawlerProcess._create_crawler  sK    i%**//	:I444$(!y$--lKKrF   c                   ddl m} |r8| j                         }|j                  ry|j	                  | j
                         t        | j                  d         }t        || |      }|j                          |j                         }|j                  | j                  j                  d             |j                  dd| j                         |r"|j                  d	d
t        | j                          |j#                  |       y)ao  
        This method starts a :mod:`~twisted.internet.reactor`, adjusts its pool
        size to :setting:`REACTOR_THREADPOOL_MAXSIZE`, and installs a DNS cache
        based on :setting:`DNSCACHE_ENABLED` and :setting:`DNSCACHE_SIZE`.

        If ``stop_after_crawl`` is True, the reactor will be stopped after all
        crawlers have finished, using :meth:`join`.

        :param bool stop_after_crawl: stop or not the reactor when all
            crawlers have finished

        :param bool install_signal_handlers: whether to install the OS signal
            handlers from Twisted and Scrapy (default: True)
        r   rX   NDNS_RESOLVERREACTOR_THREADPOOL_MAXSIZE)
maxthreadsbeforeshutdownafterstartup)installSignalHandlers)rb   rY   r   calledr   r   r!   r5   r    install_on_reactorgetThreadPooladjustPoolsizegetintaddSystemEventTriggerr}   r"   r   run)rB   stop_after_crawlinstall_signal_handlersrY   rk   resolver_classresolvertps           rD   rt   zCrawlerProcess.start  s    " 	-		AxxIId(()$T]]>%BC &ndGL##%""$
T]]%9%9:V%WX%%h
DIIF"))$=t?T?T 	*ABrF   c                \    | j                         }|j                  | j                         |S rH   )r}   r   r   )rB   rk   s     rD   r   z%CrawlerProcess._graceful_stop_reactor  s$    IIK			$$$%rF   c                P    ddl m} 	 |j                          y # t        $ r Y y w xY w)Nr   rX   )rb   rY   r}   rn   )rB   r~   rY   s      rD   r   zCrawlerProcess._stop_reactor  s%    ,	LLN 		s    	%%)NT)r5   r   r   r   )r   intr~   r   r   r   )r3   ztype[Spider] | strr   r.   )TT)r   r   r   r   r   r   r   rH   )r~   r   r   r   )r   r   r   r   rE   r   r   r   rt   r   r   __classcell__)r   s   @rD   r   r     so    2 6:%)020 #0	<3L NR&C $&CFJ&C	&CP
 rF   r   )F
__future__r   rO   rf   r   typingr   r   r   r   twisted.internet.deferr   r	   r
   r   zope.interface.verifyr   scrapyr   r   scrapy.addonsr   scrapy.core.enginer   scrapy.extensionr   scrapy.interfacesr   scrapy.logformatterr   scrapy.settingsr   r   r   scrapy.signalmanagerr   scrapy.statscollectorsr   scrapy.utils.logr   r   r   r   r   r   scrapy.utils.miscr    r!   scrapy.utils.ossignalr"   r#   scrapy.utils.reactorr$   r%   r&   r'   collections.abcr(   r)   scrapy.spiderloaderr*   scrapy.utils.requestr+   	getLoggerr   rd   r,   r.   r   r   r   rF   rD   <module>r     s    "    4 4  . " & . - + , G G . 1  > I  309 
		8	$T]OR ORd@- @-Fr] rrF   