Source code for scrapy.downloadermiddlewares.robotstxt

"""
This is a middleware to respect robots.txt policies. To activate it you must
enable this middleware and enable the ROBOTSTXT_OBEY setting.

"""

from __future__ import annotations

import logging
from typing import TYPE_CHECKING, TypeVar

from twisted.internet.defer import Deferred, maybeDeferred

from scrapy.exceptions import IgnoreRequest, NotConfigured
from scrapy.http import Request, Response
from scrapy.http.request import NO_CALLBACK
from scrapy.utils.httpobj import urlparse_cached
from scrapy.utils.log import failure_to_exc_info
from scrapy.utils.misc import load_object

if TYPE_CHECKING:
    from twisted.python.failure import Failure

    # typing.Self requires Python 3.11
    from typing_extensions import Self

    from scrapy import Spider
    from scrapy.crawler import Crawler
    from scrapy.robotstxt import RobotParser


logger = logging.getLogger(__name__)

_T = TypeVar("_T")


class RobotsTxtMiddleware:
    DOWNLOAD_PRIORITY: int = 1000

    def __init__(self, crawler: Crawler):
        if not crawler.settings.getbool("ROBOTSTXT_OBEY"):
            raise NotConfigured
        self._default_useragent: str = crawler.settings.get("USER_AGENT", "Scrapy")
        self._robotstxt_useragent: str | None = crawler.settings.get(
            "ROBOTSTXT_USER_AGENT", None
        )
        self.crawler: Crawler = crawler
        self._parsers: dict[str, RobotParser | Deferred[RobotParser | None] | None] = {}
        self._parserimpl: RobotParser = load_object(
            crawler.settings.get("ROBOTSTXT_PARSER")
        )

        # check if parser dependencies are met, this should throw an error otherwise.
        self._parserimpl.from_crawler(self.crawler, b"")

    @classmethod
    def from_crawler(cls, crawler: Crawler) -> Self:
        return cls(crawler)

    def process_request(
        self, request: Request, spider: Spider
    ) -> Deferred[None] | None:
        # Requests that opt out, or that use non-HTTP(S) schemes, skip the check.
        if request.meta.get("dont_obey_robotstxt"):
            return None
        if request.url.startswith("data:") or request.url.startswith("file:"):
            return None
        d: Deferred[RobotParser | None] = maybeDeferred(
            self.robot_parser, request, spider  # type: ignore[call-overload]
        )
        d2: Deferred[None] = d.addCallback(self.process_request_2, request, spider)
        return d2

    def process_request_2(
        self, rp: RobotParser | None, request: Request, spider: Spider
    ) -> None:
        if rp is None:
            return

        # Fall back to the request's own User-Agent header when no dedicated
        # ROBOTSTXT_USER_AGENT is configured.
        useragent: str | bytes | None = self._robotstxt_useragent
        if not useragent:
            useragent = request.headers.get(b"User-Agent", self._default_useragent)
            assert useragent is not None
        if not rp.allowed(request.url, useragent):
            logger.debug(
                "Forbidden by robots.txt: %(request)s",
                {"request": request},
                extra={"spider": spider},
            )
            assert self.crawler.stats
            self.crawler.stats.inc_value("robotstxt/forbidden")
            raise IgnoreRequest("Forbidden by robots.txt")

    def robot_parser(
        self, request: Request, spider: Spider
    ) -> RobotParser | Deferred[RobotParser | None] | None:
        url = urlparse_cached(request)
        netloc = url.netloc

        if netloc not in self._parsers:
            # Store a Deferred placeholder while robots.txt for this netloc is
            # being downloaded; concurrent requests chain onto it below.
            self._parsers[netloc] = Deferred()
            robotsurl = f"{url.scheme}://{url.netloc}/robots.txt"
            robotsreq = Request(
                robotsurl,
                priority=self.DOWNLOAD_PRIORITY,
                meta={"dont_obey_robotstxt": True},
                callback=NO_CALLBACK,
            )
            assert self.crawler.engine
            assert self.crawler.stats
            dfd = self.crawler.engine.download(robotsreq)
            dfd.addCallback(self._parse_robots, netloc, spider)
            dfd.addErrback(self._logerror, robotsreq, spider)
            dfd.addErrback(self._robots_error, netloc)
            self.crawler.stats.inc_value("robotstxt/request_count")

        parser = self._parsers[netloc]
        if isinstance(parser, Deferred):
            d: Deferred[RobotParser | None] = Deferred()

            def cb(result: RobotParser | None) -> RobotParser | None:
                d.callback(result)
                return result

            parser.addCallback(cb)
            return d
        return parser

    def _logerror(self, failure: Failure, request: Request, spider: Spider) -> Failure:
        if failure.type is not IgnoreRequest:
            logger.error(
                "Error downloading %(request)s: %(f_exception)s",
                {"request": request, "f_exception": failure.value},
                exc_info=failure_to_exc_info(failure),
                extra={"spider": spider},
            )
        return failure

    def _parse_robots(self, response: Response, netloc: str, spider: Spider) -> None:
        assert self.crawler.stats
        self.crawler.stats.inc_value("robotstxt/response_count")
        self.crawler.stats.inc_value(
            f"robotstxt/response_status_count/{response.status}"
        )
        rp = self._parserimpl.from_crawler(self.crawler, response.body)
        rp_dfd = self._parsers[netloc]
        assert isinstance(rp_dfd, Deferred)
        self._parsers[netloc] = rp
        rp_dfd.callback(rp)

    def _robots_error(self, failure: Failure, netloc: str) -> None:
        if failure.type is not IgnoreRequest:
            key = f"robotstxt/exception_count/{failure.type}"
            assert self.crawler.stats
            self.crawler.stats.inc_value(key)
        # A failed robots.txt download is treated as "no policy": the parser is
        # set to None and waiting requests are allowed through.
        rp_dfd = self._parsers[netloc]
        assert isinstance(rp_dfd, Deferred)
        self._parsers[netloc] = None
        rp_dfd.callback(None)