# Source code for itemloaders

"""
Item Loader

See documentation in docs/topics/loaders.rst
"""

from __future__ import annotations

from contextlib import suppress
from typing import (
    TYPE_CHECKING,
    Any,
    Callable,
    Dict,
    Iterable,
    List,
    MutableMapping,
    Optional,
    Pattern,
    Union,
)

from itemadapter import ItemAdapter
from parsel import Selector
from parsel.utils import extract_regex, flatten

from itemloaders.common import wrap_loader_context
from itemloaders.processors import Identity
from itemloaders.utils import arg_to_iter

if TYPE_CHECKING:
    # typing.Self requires Python 3.11
    from typing_extensions import Self


def unbound_method(method: Callable[..., Any]) -> Callable[..., Any]:
    """
    Unwrap a bound plain function so it can serve as a processor.

    Lets single-argument functions be declared as input or output
    processors on a loader class without an unused leading ``self``.

    If ``method`` exposes a ``__qualname__`` without a dot and a
    ``__func__`` attribute, the underlying ``__func__`` is returned.
    In every other case ``method`` is returned unchanged.
    """
    try:
        qualname = method.__qualname__
        if "." not in qualname:
            # A dot-free qualified name means the callable was defined at
            # module level, i.e. it is not a real method of some class.
            return method.__func__  # type: ignore[attr-defined, no-any-return]
    except AttributeError:
        # Missing ``__qualname__`` or ``__func__``: nothing to unwrap.
        pass
    return method


class ItemLoader:
    """
    Return a new Item Loader for populating the given item. If no item is
    given, one is instantiated automatically using the class in
    :attr:`default_item_class`.

    When instantiated with a ``selector`` parameter, the :class:`ItemLoader` class
    provides convenient mechanisms for extracting data from web pages
    using parsel_ selectors.

    :param item: The item instance to populate using subsequent calls to
        :meth:`~ItemLoader.add_xpath`, :meth:`~ItemLoader.add_css`,
        :meth:`~ItemLoader.add_jmes` or :meth:`~ItemLoader.add_value`.
    :type item: :class:`dict` object

    :param selector: The selector to extract data from, when using the
        :meth:`add_xpath` (resp. :meth:`add_css`, :meth:`add_jmes`) or :meth:`replace_xpath`
        (resp. :meth:`replace_css`, :meth:`replace_jmes`) method.
    :type selector: :class:`~parsel.selector.Selector` object

    The item, selector and the remaining keyword arguments are
    assigned to the Loader context (accessible through the :attr:`context` attribute).

    .. attribute:: item

        The item object being parsed by this Item Loader.
        This is mostly used as a property so when attempting to override this
        value, you may want to check out :attr:`default_item_class` first.

    .. attribute:: context

        The currently active :ref:`Context <loaders-context>` of this Item Loader.
        Refer to :ref:`loaders-context` for more information about the Loader Context.

    .. attribute:: default_item_class

        An Item class (or factory), used to instantiate items when not given in
        the ``__init__`` method.

        .. warning:: Currently, this factory/class needs to be
            callable/instantiated without any arguments.
            If you are using ``dataclasses``, please consider the following
            alternative::

                from dataclasses import dataclass, field
                from typing import Optional

                @dataclass
                class Product:
                    name: Optional[str] = field(default=None)
                    price: Optional[float] = field(default=None)

    .. attribute:: default_input_processor

        The default input processor to use for those fields which don't specify
        one.

    .. attribute:: default_output_processor

        The default output processor to use for those fields which don't specify
        one.

    .. attribute:: selector

        The :class:`~parsel.selector.Selector` object to extract data from.
        It's the selector given in the ``__init__`` method.
        This attribute is meant to be read-only.

    .. _parsel: https://parsel.readthedocs.io/en/latest/
    """

    # Class (or factory) called with no arguments in ``__init__`` to create
    # the item when the caller does not pass one.
    default_item_class: type = dict
    # Fallback input processor, used by ``get_input_processor`` when neither a
    # ``<field>_in`` attribute nor the item's field metadata provides one.
    default_input_processor: Callable[..., Any] = Identity()
    # Fallback output processor, used by ``get_output_processor`` when neither
    # a ``<field>_out`` attribute nor the item's field metadata provides one.
    default_output_processor: Callable[..., Any] = Identity()

    def __init__(
        self,
        item: Any = None,
        selector: Optional[Selector] = None,
        parent: Optional[ItemLoader] = None,
        **context: Any,
    ):
        """
        Set up the loader state and seed it with the item's existing values.

        :param item: the item to populate; when ``None`` a new one is created
            by calling :attr:`default_item_class` without arguments.
        :param selector: selector used by the XPath/CSS/JMESPath helpers.
        :type selector: :class:`~parsel.selector.Selector` object
        :param parent: the loader this one is nested in, if any. A nested
            loader reads and writes the parent's collected values and item.
        :param context: extra keyword arguments stored in the loader context
            together with ``selector`` and ``item``.
        """
        self.selector: Optional[Selector] = selector
        context.update(selector=selector)
        if item is None:
            item = self.default_item_class()
        self._local_item = item
        context["item"] = item
        self.context: MutableMapping[str, Any] = context
        self.parent: Optional[ItemLoader] = parent
        self._local_values: Dict[str, List[Any]] = {}
        # Seed the collected values from the initial item only for a root
        # loader. A nested loader shares the parent's value store (see
        # ``_values``), so seeding again would append the item's values a
        # second time and duplicate them.
        if parent is None:
            for field_name, value in ItemAdapter(item).items():
                self._local_values.setdefault(field_name, [])
                self._local_values[field_name] += arg_to_iter(value)

    @property
    def _values(self) -> Dict[str, List[Any]]:
        """
        Mapping of field name to the list of values collected so far.

        A nested loader delegates to its parent, so the whole chain of
        loaders works on one shared mapping.
        """
        owner = self.parent
        return self._local_values if owner is None else owner._values

    @property
    def item(self) -> Any:
        """
        The item being populated.

        A nested loader delegates to its parent, so the whole chain of
        loaders populates the same item.
        """
        owner = self.parent
        return self._local_item if owner is None else owner.item

[docs] def nested_xpath(self, xpath: str, **context: Any) -> Self: """ Create a nested loader with an xpath selector. The supplied selector is applied relative to selector associated with this :class:`ItemLoader`. The nested loader shares the item with the parent :class:`ItemLoader` so calls to :meth:`add_xpath`, :meth:`add_value`, :meth:`replace_value`, etc. will behave as expected. """ self._check_selector_method() assert self.selector is not None selector = self.selector.xpath(xpath) context.update(selector=selector) subloader = self.__class__(item=self.item, parent=self, **context) return subloader
[docs] def nested_css(self, css: str, **context: Any) -> Self: """ Create a nested loader with a css selector. The supplied selector is applied relative to selector associated with this :class:`ItemLoader`. The nested loader shares the item with the parent :class:`ItemLoader` so calls to :meth:`add_xpath`, :meth:`add_value`, :meth:`replace_value`, etc. will behave as expected. """ self._check_selector_method() assert self.selector is not None selector = self.selector.css(css) context.update(selector=selector) subloader = self.__class__(item=self.item, parent=self, **context) return subloader
[docs] def add_value( self, field_name: Optional[str], value: Any, *processors: Callable[..., Any], re: Union[str, Pattern[str], None] = None, **kw: Any, ) -> Self: """ Process and then add the given ``value`` for the given field. The value is first passed through :meth:`get_value` by giving the ``processors`` and ``kwargs``, and then passed through the :ref:`field input processor <processors>` and its result appended to the data collected for that field. If the field already contains collected data, the new data is added. The given ``field_name`` can be ``None``, in which case values for multiple fields may be added. And the processed value should be a dict with field_name mapped to values. :returns: The current ItemLoader instance for method chaining. :rtype: ItemLoader Examples:: loader.add_value('name', 'Color TV') loader.add_value('colours', ['white', 'blue']) loader.add_value('length', '100') loader.add_value('name', 'name: foo', TakeFirst(), re='name: (.+)') loader.add_value(None, {'name': 'foo', 'sex': 'male'}) """ value = self.get_value(value, *processors, re=re, **kw) if value is None: return self if not field_name: for k, v in value.items(): self._add_value(k, v) else: self._add_value(field_name, value) return self
[docs] def replace_value( self, field_name: Optional[str], value: Any, *processors: Callable[..., Any], re: Union[str, Pattern[str], None] = None, **kw: Any, ) -> Self: """ Similar to :meth:`add_value` but replaces the collected data with the new value instead of adding it. :returns: The current ItemLoader instance for method chaining. :rtype: ItemLoader """ value = self.get_value(value, *processors, re=re, **kw) if value is None: return self if not field_name: for k, v in value.items(): self._replace_value(k, v) else: self._replace_value(field_name, value) return self
def _add_value(self, field_name: str, value: Any) -> None: value = arg_to_iter(value) processed_value = self._process_input_value(field_name, value) if processed_value: self._values.setdefault(field_name, []) self._values[field_name] += arg_to_iter(processed_value) def _replace_value(self, field_name: str, value: Any) -> None: self._values.pop(field_name, None) self._add_value(field_name, value)
[docs] def get_value( self, value: Any, *processors: Callable[..., Any], re: Union[str, Pattern[str], None] = None, **kw: Any, ) -> Any: """ Process the given ``value`` by the given ``processors`` and keyword arguments. Available keyword arguments: :param re: a regular expression to use for extracting data from the given value using :func:`~parsel.utils.extract_regex` method, applied before processors :type re: str or typing.Pattern[str] Examples: >>> from itemloaders import ItemLoader >>> from itemloaders.processors import TakeFirst >>> loader = ItemLoader() >>> loader.get_value('name: foo', TakeFirst(), str.upper, re='name: (.+)') 'FOO' """ if re: value = arg_to_iter(value) value = flatten(extract_regex(re, x) for x in value) for proc in processors: if value is None: break _proc = proc proc = wrap_loader_context(proc, self.context) try: value = proc(value) except Exception as e: raise ValueError( "Error with processor %s value=%r error='%s: %s'" % (_proc.__class__.__name__, value, type(e).__name__, str(e)) ) from e return value
[docs] def load_item(self) -> Any: """ Populate the item with the data collected so far, and return it. The data collected is first passed through the :ref:`output processors <processors>` to get the final value to assign to each item field. """ adapter = ItemAdapter(self.item) for field_name in tuple(self._values): value = self.get_output_value(field_name) if value is not None: adapter[field_name] = value return adapter.item
[docs] def get_output_value(self, field_name: str) -> Any: """ Return the collected values parsed using the output processor, for the given field. This method doesn't populate or modify the item at all. """ proc = self.get_output_processor(field_name) proc = wrap_loader_context(proc, self.context) value = self._values.get(field_name, []) try: return proc(value) except Exception as e: raise ValueError( "Error with output processor: field=%r value=%r error='%s: %s'" % (field_name, value, type(e).__name__, str(e)) ) from e
[docs] def get_collected_values(self, field_name: str) -> List[Any]: """Return the collected values for the given field.""" return self._values.get(field_name, [])
def get_input_processor(self, field_name: str) -> Callable[..., Any]: proc = getattr(self, "%s_in" % field_name, None) if not proc: proc = self._get_item_field_attr( field_name, "input_processor", self.default_input_processor ) return unbound_method(proc) def get_output_processor(self, field_name: str) -> Callable[..., Any]: proc = getattr(self, "%s_out" % field_name, None) if not proc: proc = self._get_item_field_attr( field_name, "output_processor", self.default_output_processor ) return unbound_method(proc) def _get_item_field_attr( self, field_name: str, key: Any, default: Any = None ) -> Any: field_meta = ItemAdapter(self.item).get_field_meta(field_name) return field_meta.get(key, default) def _process_input_value(self, field_name: str, value: Any) -> Any: proc = self.get_input_processor(field_name) _proc = proc proc = wrap_loader_context(proc, self.context) try: return proc(value) except Exception as e: raise ValueError( "Error with input processor %s: field=%r value=%r " "error='%s: %s'" % ( _proc.__class__.__name__, field_name, value, type(e).__name__, str(e), ) ) from e def _check_selector_method(self) -> None: if self.selector is None: raise RuntimeError( "To use XPath or CSS selectors, %s " "must be instantiated with a selector" % self.__class__.__name__ )
[docs] def add_xpath( self, field_name: Optional[str], xpath: Union[str, Iterable[str]], *processors: Callable[..., Any], re: Union[str, Pattern[str], None] = None, **kw: Any, ) -> Self: """ Similar to :meth:`ItemLoader.add_value` but receives an XPath instead of a value, which is used to extract a list of strings from the selector associated with this :class:`ItemLoader`. See :meth:`get_xpath` for ``kwargs``. :param xpath: the XPath to extract data from :type xpath: str :returns: The current ItemLoader instance for method chaining. :rtype: ItemLoader Examples:: # HTML snippet: <p class="product-name">Color TV</p> loader.add_xpath('name', '//p[@class="product-name"]') # HTML snippet: <p id="price">the price is $1200</p> loader.add_xpath('price', '//p[@id="price"]', re='the price is (.*)') """ values = self._get_xpathvalues(xpath, **kw) return self.add_value(field_name, values, *processors, re=re, **kw)
[docs] def replace_xpath( self, field_name: Optional[str], xpath: Union[str, Iterable[str]], *processors: Callable[..., Any], re: Union[str, Pattern[str], None] = None, **kw: Any, ) -> Self: """ Similar to :meth:`add_xpath` but replaces collected data instead of adding it. :returns: The current ItemLoader instance for method chaining. :rtype: ItemLoader """ values = self._get_xpathvalues(xpath, **kw) return self.replace_value(field_name, values, *processors, re=re, **kw)
[docs] def get_xpath( self, xpath: Union[str, Iterable[str]], *processors: Callable[..., Any], re: Union[str, Pattern[str], None] = None, **kw: Any, ) -> Any: """ Similar to :meth:`ItemLoader.get_value` but receives an XPath instead of a value, which is used to extract a list of unicode strings from the selector associated with this :class:`ItemLoader`. :param xpath: the XPath to extract data from :type xpath: str :param re: a regular expression to use for extracting data from the selected XPath region :type re: str or typing.Pattern[str] Examples:: # HTML snippet: <p class="product-name">Color TV</p> loader.get_xpath('//p[@class="product-name"]') # HTML snippet: <p id="price">the price is $1200</p> loader.get_xpath('//p[@id="price"]', TakeFirst(), re='the price is (.*)') """ values = self._get_xpathvalues(xpath, **kw) return self.get_value(values, *processors, re=re, **kw)
def _get_xpathvalues( self, xpaths: Union[str, Iterable[str]], **kw: Any ) -> List[Any]: self._check_selector_method() assert self.selector is not None xpaths = arg_to_iter(xpaths) return flatten(self.selector.xpath(xpath, **kw).getall() for xpath in xpaths)
[docs] def add_css( self, field_name: Optional[str], css: Union[str, Iterable[str]], *processors: Callable[..., Any], re: Union[str, Pattern[str], None] = None, **kw: Any, ) -> Self: """ Similar to :meth:`ItemLoader.add_value` but receives a CSS selector instead of a value, which is used to extract a list of unicode strings from the selector associated with this :class:`ItemLoader`. See :meth:`get_css` for ``kwargs``. :param css: the CSS selector to extract data from :type css: str :returns: The current ItemLoader instance for method chaining. :rtype: ItemLoader Examples:: # HTML snippet: <p class="product-name">Color TV</p> loader.add_css('name', 'p.product-name') # HTML snippet: <p id="price">the price is $1200</p> loader.add_css('price', 'p#price', re='the price is (.*)') """ values = self._get_cssvalues(css) return self.add_value(field_name, values, *processors, re=re, **kw)
[docs] def replace_css( self, field_name: Optional[str], css: Union[str, Iterable[str]], *processors: Callable[..., Any], re: Union[str, Pattern[str], None] = None, **kw: Any, ) -> Self: """ Similar to :meth:`add_css` but replaces collected data instead of adding it. :returns: The current ItemLoader instance for method chaining. :rtype: ItemLoader """ values = self._get_cssvalues(css) return self.replace_value(field_name, values, *processors, re=re, **kw)
[docs] def get_css( self, css: Union[str, Iterable[str]], *processors: Callable[..., Any], re: Union[str, Pattern[str], None] = None, **kw: Any, ) -> Any: """ Similar to :meth:`ItemLoader.get_value` but receives a CSS selector instead of a value, which is used to extract a list of unicode strings from the selector associated with this :class:`ItemLoader`. :param css: the CSS selector to extract data from :type css: str :param re: a regular expression to use for extracting data from the selected CSS region :type re: str or typing.Pattern[str] Examples:: # HTML snippet: <p class="product-name">Color TV</p> loader.get_css('p.product-name') # HTML snippet: <p id="price">the price is $1200</p> loader.get_css('p#price', TakeFirst(), re='the price is (.*)') """ values = self._get_cssvalues(css) return self.get_value(values, *processors, re=re, **kw)
def _get_cssvalues(self, csss: Union[str, Iterable[str]]) -> List[Any]: self._check_selector_method() assert self.selector is not None csss = arg_to_iter(csss) return flatten(self.selector.css(css).getall() for css in csss)
[docs] def add_jmes( self, field_name: Optional[str], jmes: str, *processors: Callable[..., Any], re: Union[str, Pattern[str], None] = None, **kw: Any, ) -> Self: """ Similar to :meth:`ItemLoader.add_value` but receives a JMESPath selector instead of a value, which is used to extract a list of unicode strings from the selector associated with this :class:`ItemLoader`. See :meth:`get_jmes` for ``kwargs``. :param jmes: the JMESPath selector to extract data from :type jmes: str :returns: The current ItemLoader instance for method chaining. :rtype: ItemLoader Examples:: # HTML snippet: {"name": "Color TV"} loader.add_jmes('name') # HTML snippet: {"price": the price is $1200"} loader.add_jmes('price', TakeFirst(), re='the price is (.*)') """ values = self._get_jmesvalues(jmes) return self.add_value(field_name, values, *processors, re=re, **kw)
[docs] def replace_jmes( self, field_name: Optional[str], jmes: Union[str, Iterable[str]], *processors: Callable[..., Any], re: Union[str, Pattern[str], None] = None, **kw: Any, ) -> Self: """ Similar to :meth:`add_jmes` but replaces collected data instead of adding it. :returns: The current ItemLoader instance for method chaining. :rtype: ItemLoader """ values = self._get_jmesvalues(jmes) return self.replace_value(field_name, values, *processors, re=re, **kw)
[docs] def get_jmes( self, jmes: Union[str, Iterable[str]], *processors: Callable[..., Any], re: Union[str, Pattern[str], None] = None, **kw: Any, ) -> Any: """ Similar to :meth:`ItemLoader.get_value` but receives a JMESPath selector instead of a value, which is used to extract a list of unicode strings from the selector associated with this :class:`ItemLoader`. :param jmes: the JMESPath selector to extract data from :type jmes: str :param re: a regular expression to use for extracting data from the selected JMESPath :type re: str or typing.Pattern Examples:: # HTML snippet: {"name": "Color TV"} loader.get_jmes('name') # HTML snippet: {"price": the price is $1200"} loader.get_jmes('price', TakeFirst(), re='the price is (.*)') """ values = self._get_jmesvalues(jmes) return self.get_value(values, *processors, re=re, **kw)
def _get_jmesvalues(self, jmess: Union[str, Iterable[str]]) -> List[Any]: self._check_selector_method() assert self.selector is not None jmess = arg_to_iter(jmess) if not hasattr(self.selector, "jmespath"): raise AttributeError( "Please install parsel >= 1.8.1 to get jmespath support" ) return flatten(self.selector.jmespath(jmes).getall() for jmes in jmess)