The following implements a relatively generic and modular data loading utility. It loads images from a local storage given a lookup table. The data loader is parametrized via dependency injection to exhibit different behaviours. One behaviour is to derive variants for a loaded image.
I have a use case where we are still in an experimental phase. During development so far we have needed the option to swap out components of our algorithm, and we will need this option going forward as experimentation continues. This requirement resulted in a design that broadly employs separation of concerns. The design arose incrementally, through several refactorings, to support the modularity required during experimentation — meaning I did not front-load abstraction in expectation of needing it.
Another dev said he does not understand the code at all and would prefer a simpler design. As I said, the design arose due to necessities. Is this design really that opaque? To me, this code looks like run-of-the-mill OOP code with a few small closures thrown in, applying separation of concern principles / SOLID principles.
I would be interested in an independent evaluation of how complex this design really is. I am concerned with the high-level design, not the details; hence, I omitted some irrelevant implementation details.
(Don't mind the import order. I had to manually modify these when posting, to obscure some internal stuff.)
file 1
(imported by file 2) (obviously, these have descriptive names in reality)
"""
This module contains logic for loading reference data. It provides a reference data loader class whose behavior can be
controlled by dependency injection. Default implementations of dependencies are provided in this module as well.
"""
from __future__ import annotations

import abc
import importlib
import importlib.resources  # `import importlib` alone does not expose the `resources` submodule
from contextlib import contextmanager
from functools import lru_cache
from operator import itemgetter
from pathlib import Path
from typing import Any, BinaryIO, Callable, Generator, Generic, Mapping, Optional, TypeVar

import cv2
import numpy as np
from frozendict import frozendict
from funcy import lflatten

from .models import Item
from .exceptions import MissingReferenceData, DataLoadFailure
from .derivation import derive_variants
from .types import MatLike
T = TypeVar("T")
def make_id_based_reference_data_loader(
    data_map: list[dict],
    data_root: Path | str,
    data_loader: Optional[DataLoader] = None,
) -> ReferenceDataLoader:
    """Builds a ReferenceDataLoader that selects reference records by item id.

    Args:
        data_map: Reference descriptor records to select from.
        data_root: Root location of the reference data files.
        data_loader: Loader for the referenced files; defaults to a plain
            image-to-numpy loader when not provided.
    """
    selector = ReferenceSelector(data_records=data_map, select_by="id")
    indexer = PathIndexer(root=data_root, path_extractor=index_by_product)
    loader = data_loader or ImageToNumpyLoader()
    return ReferenceDataLoader(
        reference_selector=selector,
        path_indexer=indexer,
        data_loader=loader,
    )
class ReferenceDataLoader(Generic[T]):
    """Loads reference data for items, with behavior supplied via dependency injection."""

    def __init__(self, reference_selector: ReferenceSelector, path_indexer: PathIndexer, data_loader: DataLoader):
        """Loads reference data for items.

        Args:
            reference_selector: Selects the reference descriptor for a given item. A reference
                descriptor is a metadata record that specifies the reference data to load for a given item.
            path_indexer: Indexes the path to the reference data from the reference descriptor.
            data_loader: Loads the data from the indexed path.
        """
        self._reference_descriptor_selector = reference_selector
        self._path_indexer = path_indexer
        self._data_loader = data_loader
        # Cache of loaded data keyed by the (hashable) reference descriptor.
        # A per-instance dict replaces @lru_cache on the method: lru_cache on an
        # instance method keys on `self` and keeps every instance alive for the
        # lifetime of the process (flake8-bugbear B019), and was unbounded here.
        self._descriptor_cache: dict[frozendict, list[T]] = {}

    def select_and_load_reference_data(self, item: Item) -> list[T]:
        """Selects and loads the reference data for a given item.

        Raises:
            MissingReferenceData: If no reference data is found for the given item.
        """
        reference_descriptor = self._reference_descriptor_selector.select_descriptor(item)
        return self._load_reference_data(reference_descriptor)

    def _load_reference_data(self, reference_descriptor: frozendict) -> list[T]:
        """Loads (or returns cached) reference data for the given descriptor.

        Raises:
            MissingReferenceData: If the descriptor yields no paths to load from.
        """
        try:
            return self._descriptor_cache[reference_descriptor]
        except KeyError:
            pass  # not cached yet; load below
        paths_to_reference_data = self._path_indexer.index_path(reference_descriptor)
        if not paths_to_reference_data:
            raise MissingReferenceData(f"No reference data found for reference descriptor: {reference_descriptor}")
        reference_data = self._data_loader.load(*paths_to_reference_data)
        # Only successful loads are cached, matching lru_cache's behavior of
        # never caching raised exceptions.
        self._descriptor_cache[reference_descriptor] = reference_data
        return reference_data
class ReferenceSelector:
def __init__(self, data_records: list[dict], select_by: str):
    """Selects a reference descriptor for a given item.

    Args:
        data_records: A list of reference descriptors (metadata records).
        select_by: The key in the reference descriptor that is used to select the reference descriptor for a given
            item. Presumably also an attribute name on ``Item`` (see ``select_descriptor``) — confirm.
    """
    # Set before building the map: the record-to-map conversion may depend on it.
    self._select_by = select_by
    # Precomputed lookup table: selection value -> descriptor record.
    self._data_map = self._data_records_to_data_map(data_records)
def select_descriptor(self, item: Item) -> frozendict:
    """Selects the reference descriptor for a given item.

    Args:
        item: The item for which to select the reference descriptor.

    Returns:
        The reference descriptor for the given item.

    Raises:
        MissingReferenceData: If no reference descriptor is found for the given item.
    """
    selection_value = getattr(item, self._select_by)
    try:
        return self._data_map[selection_value]
    except KeyError as err:
        # Chain explicitly ("direct cause"), matching the error-wrapping style
        # used elsewhere in this module (see DataLoader._load_one).
        raise MissingReferenceData(
            f"No reference descriptor found for selection value '{selection_value}' and item '{item}'."
        ) from err
def _data_records_to_data_map(self, data_records: list[dict]) -> dict[Any, frozendict]:
    """Optimizes the data records for lookup by the selection key.

    Groups the records by the selection key and maps each selection key to the first record
    in its group (each group is assumed to have exactly one element).

    Raises:
        AmbiguousReferenceDataSelector: If any selection key in the reference data records is not unique.
    """
    <omitted because irrelevant>
class PathIndexer:
    """Resolves reference-descriptor records to file paths under a fixed root."""

    def __init__(self, root: str | Path, path_extractor: Callable[[Mapping], list[Path]]):
        """Indexes the path to the reference data from a reference descriptor record.

        Args:
            root: Root path that every extracted path is resolved against.
            path_extractor: Extracts root-relative file paths from a reference descriptor.
        """
        self._root = Path(root)
        self._path_extractor = path_extractor

    def index_path(self, reference_descriptor: Mapping) -> list[Path]:
        """Returns the full paths to the files referenced by the descriptor."""
        relative_paths = self._path_extractor(reference_descriptor)
        return [self._root.joinpath(relative) for relative in relative_paths]
def index_by_product(reference_descriptor: Mapping) -> list[Path]:
    """Extracts the descriptor's file paths, namespaced under its product name."""
    product_dir = Path(reference_descriptor["product"])
    return [product_dir / filename for filename in reference_descriptor["files"]]
class DataLoader(abc.ABC, Generic[T]):
    """Abstract base for loading data objects from file paths.

    Subclasses implement ``_load``, which turns an open binary stream into a
    single loaded object (or a list of objects).
    """

    def load(self, path: Path, *paths: Path) -> list[T]:
        """Loads data from one or more paths, returning a single flat list."""
        if paths:
            nested = self._load_multiple([path, *paths])
        else:
            nested = [self._load_one(path)]
        # Each path may produce either one object or a list of objects, so
        # flatten the per-path results into one list.
        return lflatten(nested)

    def _load_one(self, path: Path) -> T:
        """Loads the data behind a single path, wrapping any failure."""
        normalized = normalize_path(path)
        with _resolve_path(normalized) as fp:
            try:
                return self._load(fp)
            except Exception as err:
                raise DataLoadFailure(f"Failed to load reference data from path: {normalized}") from err

    def _load_multiple(self, paths: list[Path]) -> list[T]:
        """Loads each path in turn."""
        return [self._load_one(p) for p in paths]

    @abc.abstractmethod
    def _load(self, fp: BinaryIO) -> T:
        """Deserializes one object (or list of objects) from an open binary stream."""
@contextmanager
def _resolve_path(path: Path) -> Generator[BinaryIO, None, None]:
root = str(path.parent).replace("/", ".")
leaf = path.name
with importlib.resources.open_binary(root, leaf) as fp:
yield fp
def normalize_path(path: Path) -> Path:
    """Rewrites dotted package segments of the parent path into directory
    separators, e.g. ``pkg.data/brand/img.png`` -> ``pkg/data/brand/img.png``.
    """
    # "/" -> "." followed by "." -> "/" turns every dot AND every slash in the
    # parent into a slash, leaving the file name untouched.
    directory = str(path.parent).replace("/", ".").replace(".", "/")
    return Path(directory) / path.name
class ImageToNumpyLoader(DataLoader):
    """Loads a single grayscale image from a binary stream as a numpy array."""

    def _load(self, binary_stream: BinaryIO) -> MatLike:
        return _read_image_bytes(binary_stream)
class DerivativeImageToNumpyLoader(DataLoader):
    """Loads a grayscale image and returns its derived variants instead of the raw image."""

    def _load(self, binary_stream: BinaryIO) -> list[MatLike]:
        base_image = _read_image_bytes(binary_stream)
        return list(derive_variants(base_image))
def _read_image_bytes(binary_stream: BinaryIO) -> MatLike:
    """Decodes the stream's entire contents as a grayscale image."""
    raw = np.frombuffer(binary_stream.read(), np.uint8)
    return cv2.imdecode(raw, cv2.IMREAD_GRAYSCALE)
file 2
"""
This module contains functions for selecting the reference image for the algorithm.
"""
import json
import logging
from functools import partial
from importlib.resources import open_binary
from typing import Callable, Any, Optional
from funcy import lmapcat
from .models import Item
from .<file 1> import (
make_id_based_reference_data_loader,
DataLoader,
DerivativeImageToNumpyLoader,
ImageToNumpyLoader,
)
from .derivation import derive_variants
from .types import MatLike
logger = logging.getLogger(__name__)
def _build_reference_image_loader(data_loader: Optional[DataLoader] = None) -> Callable[[Item], list[MatLike]]:
    """Creates a function that loads the reference images for an item.

    Reads the reference data records shipped with this package and wires them
    into an id-based reference data loader.
    """
    data_root = f"{__package__}.reference_data"
    with open_binary(data_root, "reference_data_records.json") as fp:
        reference_data_map = json.load(fp)
    reference_data_loader = make_id_based_reference_data_loader(
        data_map=reference_data_map, data_root=data_root, data_loader=data_loader
    )

    def _load_reference_images(item: Item) -> list[MatLike]:
        """Loads the reference images for the given item.

        Raises:
            MissingReferenceData: If no reference data is found for the given item.
        """
        return reference_data_loader.select_and_load_reference_data(item)

    return _load_reference_images
def build_target_insensitive_deriving_reference_image_loader() -> Callable[[Item, Optional[Any]], list[MatLike]]:
    """Builds a data loader that applies derivation to reference images internally.

    Since the data loader interface does not support passing target images, derivation
    happens inside the data loader without considering the target images. The advantage
    is that the data loader can cache the derived images for a specific item ID.
    """
    load_for_item = _build_reference_image_loader(data_loader=DerivativeImageToNumpyLoader())

    def load_reference_images(item: Item, _: Optional[Any] = None) -> list[MatLike]:
        # The second argument exists only for signature compatibility; it is ignored.
        return load_for_item(item)

    return load_reference_images
def build_target_sensitive_deriving_reference_image_loader() -> Callable[[Item, MatLike], list[MatLike]]:
    """Builds a data loader that applies derivation to reference images externally.

    Since the data loader interface does not support passing target images, derivation
    happens outside the data loader while considering the target image. The disadvantage
    is that the data loader cannot cache the derived images for a specific item ID: the
    derivation happens outside its caching scope, and the derived images differ for every
    target image anyway, so the item ID would not be a valid cache key here.
    """
    load_for_item = _build_reference_image_loader(data_loader=ImageToNumpyLoader())

    def load_reference_images_and_derive_variants(item: Item, target_image: MatLike) -> list[MatLike]:
        derive_for_target = partial(derive_variants, target_image=target_image)
        # Derive variants of every loaded reference image and concatenate the results.
        return lmapcat(derive_for_target, load_for_item(item))

    return load_reference_images_and_derive_variants
# Default reference-image loader, built eagerly at module import time. Note that this
# performs I/O on import (it reads the packaged reference_data_records.json) and, per
# the builder's contract, caches derived variants per item ID inside the data loader.
load_reference_images = build_target_insensitive_deriving_reference_image_loader()