"""
Depth Spider Middleware
See documentation in docs/topics/spider-middleware.rst
"""
from __future__ import annotations
import logging
from typing import TYPE_CHECKING, Any
from scrapy.spidermiddlewares.base import BaseSpiderMiddleware
if TYPE_CHECKING:
    from collections.abc import AsyncIterable, Iterable

    # typing.Self requires Python 3.11
    from typing_extensions import Self

    from scrapy import Spider
    from scrapy.crawler import Crawler
    from scrapy.http import Request, Response
    from scrapy.statscollectors import StatsCollector

logger = logging.getLogger(__name__)


class DepthMiddleware(BaseSpiderMiddleware):
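    """Spider middleware that tracks the depth of every request relative to
    the response it was extracted from.

    The depth is kept in ``request.meta["depth"]``: responses without a depth
    get 0, and each request yielded from a response gets that response's depth
    plus one. Behaviour is controlled by the ``DEPTH_LIMIT``,
    ``DEPTH_PRIORITY`` and ``DEPTH_STATS_VERBOSE`` settings.
    """
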
crawler: Crawler

    def __init__(  # pylint: disable=super-init-not-called
self,
maxdepth: int,
stats: StatsCollector,
verbose_stats: bool = False,
prio: int = 1,
):
self.maxdepth = maxdepth
self.stats = stats
self.verbose_stats = verbose_stats
self.prio = prio

    @classmethod
def from_crawler(cls, crawler: Crawler) -> Self:
settings = crawler.settings
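        # DEPTH_LIMIT: maximum allowed depth; 0 disables the limit.
        # DEPTH_STATS_VERBOSE: collect request_depth_count/<n> stats per depth.
        # DEPTH_PRIORITY: scale factor for depth-based priority adjustment.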
maxdepth = settings.getint("DEPTH_LIMIT")
verbose = settings.getbool("DEPTH_STATS_VERBOSE")
prio = settings.getint("DEPTH_PRIORITY")
assert crawler.stats
o = cls(maxdepth, crawler.stats, verbose, prio)
o.crawler = crawler
return o
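
    # BaseSpiderMiddleware's process_spider_output/process_spider_output_async
    # iterate over the spider results and call get_processed_request() for
    # each Request; the overrides below only ensure the originating response
    # has its own depth initialised first.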
def process_spider_output(
self, response: Response, result: Iterable[Any], spider: Spider
) -> Iterable[Any]:
self._init_depth(response, spider)
yield from super().process_spider_output(response, result, spider)

    async def process_spider_output_async(
self, response: Response, result: AsyncIterable[Any], spider: Spider
) -> AsyncIterable[Any]:
self._init_depth(response, spider)
async for o in super().process_spider_output_async(response, result, spider):
yield o

    def _init_depth(self, response: Response, spider: Spider) -> None:
# base case (depth=0)
if "depth" not in response.meta:
response.meta["depth"] = 0
if self.verbose_stats:
self.stats.inc_value("request_depth_count/0", spider=spider)

    def get_processed_request(
self, request: Request, response: Response
) -> Request | None:
depth = response.meta["depth"] + 1
request.meta["depth"] = depth
if self.prio:
request.priority -= depth * self.prio
if self.maxdepth and depth > self.maxdepth:
logger.debug(
"Ignoring link (depth > %(maxdepth)d): %(requrl)s ",
{"maxdepth": self.maxdepth, "requrl": request.url},
extra={"spider": self.crawler.spider},
)
return None
if self.verbose_stats:
self.stats.inc_value(
f"request_depth_count/{depth}", spider=self.crawler.spider
)
self.stats.max_value("request_depth_max", depth, spider=self.crawler.spider)
return request
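

# Illustrative usage sketch (an assumption for this file, not part of the
# middleware itself): DepthMiddleware is enabled by default through
# SPIDER_MIDDLEWARES_BASE, so it is normally driven purely by settings. The
# spider name, URL and values below are hypothetical placeholders.
#
#   # settings.py
#   DEPTH_LIMIT = 3             # drop requests more than 3 links from the start requests
#   DEPTH_PRIORITY = 1          # deprioritise deeper requests (breadth-first tendency)
#   DEPTH_STATS_VERBOSE = True  # adds request_depth_count/<n> entries to the crawl stats
#
#   # spider.py
#   import scrapy
#
#   class ExampleSpider(scrapy.Spider):
#       name = "example"
#       start_urls = ["https://example.com"]
#
#       def parse(self, response):
#           # response.meta["depth"] is maintained by DepthMiddleware
#           for href in response.css("a::attr(href)").getall():
#               yield response.follow(href, callback=self.parse)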