Source code for scrapy.spidermiddlewares.referer

"""
RefererMiddleware: populates Request referer field, based on the Response which
originated it.
"""
import warnings
from urllib.parse import urlparse

from w3lib.url import safe_url_string

from scrapy.http import Request, Response
from scrapy.exceptions import NotConfigured
from scrapy import signals
from scrapy.utils.python import to_unicode
from scrapy.utils.misc import load_object
from scrapy.utils.url import strip_url


LOCAL_SCHEMES = ('about', 'blob', 'data', 'filesystem',)

POLICY_NO_REFERRER = "no-referrer"
POLICY_NO_REFERRER_WHEN_DOWNGRADE = "no-referrer-when-downgrade"
POLICY_SAME_ORIGIN = "same-origin"
POLICY_ORIGIN = "origin"
POLICY_STRICT_ORIGIN = "strict-origin"
POLICY_ORIGIN_WHEN_CROSS_ORIGIN = "origin-when-cross-origin"
POLICY_STRICT_ORIGIN_WHEN_CROSS_ORIGIN = "strict-origin-when-cross-origin"
POLICY_UNSAFE_URL = "unsafe-url"
POLICY_SCRAPY_DEFAULT = "scrapy-default"


class ReferrerPolicy(object):

    NOREFERRER_SCHEMES = LOCAL_SCHEMES

    def referrer(self, response_url, request_url):
        raise NotImplementedError()

    def stripped_referrer(self, url):
        if urlparse(url).scheme not in self.NOREFERRER_SCHEMES:
            return self.strip_url(url)

    def origin_referrer(self, url):
        if urlparse(url).scheme not in self.NOREFERRER_SCHEMES:
            return self.origin(url)

    def strip_url(self, url, origin_only=False):
        """
        https://www.w3.org/TR/referrer-policy/#strip-url

        If url is null, return no referrer.
        If url's scheme is a local scheme, then return no referrer.
        Set url's username to the empty string.
        Set url's password to null.
        Set url's fragment to null.
        If the origin-only flag is true, then:
            Set url's path to null.
            Set url's query to null.
        Return url.
        """
        if not url:
            return None
        return strip_url(url,
                         strip_credentials=True,
                         strip_fragment=True,
                         strip_default_port=True,
                         origin_only=origin_only)

    def origin(self, url):
        """Return serialized origin (scheme, host, path) for a request or response URL."""
        return self.strip_url(url, origin_only=True)

    def potentially_trustworthy(self, url):
        # Note: this does not follow https://w3c.github.io/webappsec-secure-contexts/#is-url-trustworthy
        parsed_url = urlparse(url)
        if parsed_url.scheme in ('data',):
            return False
        return self.tls_protected(url)

    def tls_protected(self, url):
        return urlparse(url).scheme in ('https', 'ftps')


[docs]class NoReferrerPolicy(ReferrerPolicy): """ https://www.w3.org/TR/referrer-policy/#referrer-policy-no-referrer The simplest policy is "no-referrer", which specifies that no referrer information is to be sent along with requests made from a particular request client to any origin. The header will be omitted entirely. """ name = POLICY_NO_REFERRER def referrer(self, response_url, request_url): return None
[docs]class NoReferrerWhenDowngradePolicy(ReferrerPolicy): """ https://www.w3.org/TR/referrer-policy/#referrer-policy-no-referrer-when-downgrade The "no-referrer-when-downgrade" policy sends a full URL along with requests from a TLS-protected environment settings object to a potentially trustworthy URL, and requests from clients which are not TLS-protected to any origin. Requests from TLS-protected clients to non-potentially trustworthy URLs, on the other hand, will contain no referrer information. A Referer HTTP header will not be sent. This is a user agent's default behavior, if no policy is otherwise specified. """ name = POLICY_NO_REFERRER_WHEN_DOWNGRADE def referrer(self, response_url, request_url): if not self.tls_protected(response_url) or self.tls_protected(request_url): return self.stripped_referrer(response_url)
[docs]class SameOriginPolicy(ReferrerPolicy): """ https://www.w3.org/TR/referrer-policy/#referrer-policy-same-origin The "same-origin" policy specifies that a full URL, stripped for use as a referrer, is sent as referrer information when making same-origin requests from a particular request client. Cross-origin requests, on the other hand, will contain no referrer information. A Referer HTTP header will not be sent. """ name = POLICY_SAME_ORIGIN def referrer(self, response_url, request_url): if self.origin(response_url) == self.origin(request_url): return self.stripped_referrer(response_url)
[docs]class OriginPolicy(ReferrerPolicy): """ https://www.w3.org/TR/referrer-policy/#referrer-policy-origin The "origin" policy specifies that only the ASCII serialization of the origin of the request client is sent as referrer information when making both same-origin requests and cross-origin requests from a particular request client. """ name = POLICY_ORIGIN def referrer(self, response_url, request_url): return self.origin_referrer(response_url)
[docs]class StrictOriginPolicy(ReferrerPolicy): """ https://www.w3.org/TR/referrer-policy/#referrer-policy-strict-origin The "strict-origin" policy sends the ASCII serialization of the origin of the request client when making requests: - from a TLS-protected environment settings object to a potentially trustworthy URL, and - from non-TLS-protected environment settings objects to any origin. Requests from TLS-protected request clients to non- potentially trustworthy URLs, on the other hand, will contain no referrer information. A Referer HTTP header will not be sent. """ name = POLICY_STRICT_ORIGIN def referrer(self, response_url, request_url): if ((self.tls_protected(response_url) and self.potentially_trustworthy(request_url)) or not self.tls_protected(response_url)): return self.origin_referrer(response_url)
[docs]class OriginWhenCrossOriginPolicy(ReferrerPolicy): """ https://www.w3.org/TR/referrer-policy/#referrer-policy-origin-when-cross-origin The "origin-when-cross-origin" policy specifies that a full URL, stripped for use as a referrer, is sent as referrer information when making same-origin requests from a particular request client, and only the ASCII serialization of the origin of the request client is sent as referrer information when making cross-origin requests from a particular request client. """ name = POLICY_ORIGIN_WHEN_CROSS_ORIGIN def referrer(self, response_url, request_url): origin = self.origin(response_url) if origin == self.origin(request_url): return self.stripped_referrer(response_url) else: return origin
[docs]class StrictOriginWhenCrossOriginPolicy(ReferrerPolicy): """ https://www.w3.org/TR/referrer-policy/#referrer-policy-strict-origin-when-cross-origin The "strict-origin-when-cross-origin" policy specifies that a full URL, stripped for use as a referrer, is sent as referrer information when making same-origin requests from a particular request client, and only the ASCII serialization of the origin of the request client when making cross-origin requests: - from a TLS-protected environment settings object to a potentially trustworthy URL, and - from non-TLS-protected environment settings objects to any origin. Requests from TLS-protected clients to non- potentially trustworthy URLs, on the other hand, will contain no referrer information. A Referer HTTP header will not be sent. """ name = POLICY_STRICT_ORIGIN_WHEN_CROSS_ORIGIN def referrer(self, response_url, request_url): origin = self.origin(response_url) if origin == self.origin(request_url): return self.stripped_referrer(response_url) elif ((self.tls_protected(response_url) and self.potentially_trustworthy(request_url)) or not self.tls_protected(response_url)): return self.origin_referrer(response_url)
[docs]class UnsafeUrlPolicy(ReferrerPolicy): """ https://www.w3.org/TR/referrer-policy/#referrer-policy-unsafe-url The "unsafe-url" policy specifies that a full URL, stripped for use as a referrer, is sent along with both cross-origin requests and same-origin requests made from a particular request client. Note: The policy's name doesn't lie; it is unsafe. This policy will leak origins and paths from TLS-protected resources to insecure origins. Carefully consider the impact of setting such a policy for potentially sensitive documents. """ name = POLICY_UNSAFE_URL def referrer(self, response_url, request_url): return self.stripped_referrer(response_url)
[docs]class DefaultReferrerPolicy(NoReferrerWhenDowngradePolicy): """ A variant of "no-referrer-when-downgrade", with the addition that "Referer" is not sent if the parent request was using ``file://`` or ``s3://`` scheme. """ NOREFERRER_SCHEMES = LOCAL_SCHEMES + ('file', 's3') name = POLICY_SCRAPY_DEFAULT
_policy_classes = {p.name: p for p in ( NoReferrerPolicy, NoReferrerWhenDowngradePolicy, SameOriginPolicy, OriginPolicy, StrictOriginPolicy, OriginWhenCrossOriginPolicy, StrictOriginWhenCrossOriginPolicy, UnsafeUrlPolicy, DefaultReferrerPolicy, )} # Reference: https://www.w3.org/TR/referrer-policy/#referrer-policy-empty-string _policy_classes[''] = NoReferrerWhenDowngradePolicy def _load_policy_class(policy, warning_only=False): """ Expect a string for the path to the policy class, otherwise try to interpret the string as a standard value from https://www.w3.org/TR/referrer-policy/#referrer-policies """ try: return load_object(policy) except ValueError: try: return _policy_classes[policy.lower()] except KeyError: msg = "Could not load referrer policy %r" % policy if not warning_only: raise RuntimeError(msg) else: warnings.warn(msg, RuntimeWarning) return None
[docs]class RefererMiddleware(object): def __init__(self, settings=None): self.default_policy = DefaultReferrerPolicy if settings is not None: self.default_policy = _load_policy_class( settings.get('REFERRER_POLICY')) @classmethod def from_crawler(cls, crawler): if not crawler.settings.getbool('REFERER_ENABLED'): raise NotConfigured mw = cls(crawler.settings) # Note: this hook is a bit of a hack to intercept redirections crawler.signals.connect(mw.request_scheduled, signal=signals.request_scheduled) return mw def policy(self, resp_or_url, request): """ Determine Referrer-Policy to use from a parent Response (or URL), and a Request to be sent. - if a valid policy is set in Request meta, it is used. - if the policy is set in meta but is wrong (e.g. a typo error), the policy from settings is used - if the policy is not set in Request meta, but there is a Referrer-policy header in the parent response, it is used if valid - otherwise, the policy from settings is used. """ policy_name = request.meta.get('referrer_policy') if policy_name is None: if isinstance(resp_or_url, Response): policy_header = resp_or_url.headers.get('Referrer-Policy') if policy_header is not None: policy_name = to_unicode(policy_header.decode('latin1')) if policy_name is None: return self.default_policy() cls = _load_policy_class(policy_name, warning_only=True) return cls() if cls else self.default_policy() def process_spider_output(self, response, result, spider): def _set_referer(r): if isinstance(r, Request): referrer = self.policy(response, r).referrer(response.url, r.url) if referrer is not None: r.headers.setdefault('Referer', referrer) return r return (_set_referer(r) for r in result or ()) def request_scheduled(self, request, spider): # check redirected request to patch "Referer" header if necessary redirected_urls = request.meta.get('redirect_urls', []) if redirected_urls: request_referrer = request.headers.get('Referer') # we don't patch the referrer value if there is none if request_referrer is not None: # the request's referrer header value acts as a surrogate # for the parent response URL # # Note: if the 3xx response contained a Referrer-Policy header, # the information is not available using this hook parent_url = safe_url_string(request_referrer) policy_referrer = self.policy(parent_url, request).referrer( parent_url, request.url) if policy_referrer != request_referrer: if policy_referrer is None: request.headers.pop('Referer') else: request.headers['Referer'] = policy_referrer