Source code for radiant_mlhub.models.dataset

"""Extensions of the `PySTAC <https://pystac.readthedocs.io/en/latest/>`_ classes that provide convenience methods for interacting
with the `Radiant MLHub API <https://docs.mlhub.earth/#radiant-mlhub-api>`_."""

from __future__ import annotations

import concurrent.futures
from datetime import datetime
from enum import Enum
from pathlib import Path
from typing import Any, Dict, Iterable, Iterator, List, Optional, Tuple, Union

from .. import client
from ..client import CatalogDownloader, CatalogDownloaderConfig
from ..if_exists import DownloadIfExistsOpts
from ..session import get_session
from . import Collection

TagOrTagList = Union[str, Iterable[str]]
TextOrTextList = Union[str, Iterable[str]]

JsonDict = Dict[str, Any]
GeoJSON = JsonDict


class Dataset:
    """Class that brings together multiple Radiant MLHub "collections" that are all considered part
    of a single "dataset". For instance, the ``bigearthnet_v1`` dataset is composed of both a source
    imagery collection (``bigearthnet_v1_source``) and a labels collection (``bigearthnet_v1_labels``).

    Attributes
    ----------
    id : str
        The dataset ID.
    title : str or None
        The title of the dataset (or ``None`` if the dataset has no title).
    registry_url : str or None
        The URL to the registry page for this dataset, or ``None`` if no registry page exists.
    doi : str or None
        The DOI identifier for this dataset, or ``None`` if there is no DOI for this dataset.
    citation : str or None
        The citation information for this dataset, or ``None`` if there is no citation information.
    """

    def __init__(
        self,
        id: str,
        collections: List[Dict[str, Any]],
        title: Optional[str] = None,
        registry: Optional[str] = None,
        doi: Optional[str] = None,
        citation: Optional[str] = None,
        *,
        api_key: Optional[str] = None,
        profile: Optional[str] = None,
        # Absorbs additional keyword arguments to protect against changes to the dataset object
        # returned by the API: https://github.com/radiantearth/radiant-mlhub/issues/41
        **_: Any
    ):
        self.id = id
        self.title = title
        self.collection_descriptions = collections
        self.registry_url = registry
        self.doi = doi
        self.citation = citation

        self.session_kwargs: Dict[str, Any] = {}
        if api_key:
            self.session_kwargs['api_key'] = api_key
        if profile:
            self.session_kwargs['profile'] = profile

        self._collections: Optional['_CollectionList'] = None

    @property
    def collections(self) -> _CollectionList:
        """List of collections associated with this dataset. The returned list has 2 additional
        attributes (``source_imagery`` and ``labels``) that represent the lists of collections
        corresponding to each type.

        .. note::

            This is a cached property, so updating ``self.collection_descriptions`` after calling
            ``self.collections`` the first time will have no effect on the results. See
            :func:`functools.cached_property` for details on clearing the cached value.

        Examples
        --------
        >>> from radiant_mlhub import Dataset
        >>> dataset = Dataset.fetch('bigearthnet_v1')
        >>> len(dataset.collections)
        2
        >>> len(dataset.collections.source_imagery)
        1
        >>> len(dataset.collections.labels)
        1

        To loop through all collections:

        >>> for collection in dataset.collections:
        ...     pass  # Do something here

        To loop through only the source imagery collections:

        >>> for collection in dataset.collections.source_imagery:
        ...     pass  # Do something here

        To loop through only the label collections:

        >>> for collection in dataset.collections.labels:
        ...     pass  # Do something here
        """
        if self._collections is None:
            # Internal helper to return a Collection along with its CollectionTypes
            def _fetch_collection(_collection_description: Dict[str, Any]) -> _CollectionWithType:
                return _CollectionWithType(
                    Collection.fetch(_collection_description['id'], **self.session_kwargs),
                    [CollectionType(type_) for type_ in _collection_description['types']]
                )

            # Fetch all collections and create Collection instances
            if len(self.collection_descriptions) == 1:
                # If there is only 1 collection, fetch it in the same thread
                only_description = self.collection_descriptions[0]
                collections = [_fetch_collection(only_description)]
            else:
                # If there are multiple collections, fetch them concurrently
                with concurrent.futures.ThreadPoolExecutor() as exc:
                    collections = list(exc.map(_fetch_collection, self.collection_descriptions))

            self._collections = _CollectionList(collections)

        return self._collections

    @classmethod
    def list(
        cls,
        *,
        tags: Optional[TagOrTagList] = None,
        text: Optional[TextOrTextList] = None,
        api_key: Optional[str] = None,
        profile: Optional[str] = None
    ) -> List['Dataset']:
        """Returns a list of :class:`Dataset` instances, one for each dataset hosted by MLHub.

        See the :ref:`Authentication` documentation for details on how authentication is handled
        for this request.

        Parameters
        ----------
        tags : str or Iterable[str], optional
            A list of tags to filter datasets by. If not ``None``, only datasets containing all
            provided tags will be returned.
        text : str or Iterable[str], optional
            A list of text phrases to filter datasets by. If not ``None``, only datasets containing
            all phrases will be returned.
        api_key : str
            An API key to use for this request. This will override an API key set in a profile or
            in an environment variable.
        profile : str
            A profile to use when making this request.

        Returns
        -------
        datasets : List[Dataset]
        """
        return [
            cls(**d, api_key=api_key, profile=profile)
            for d in client.list_datasets(tags=tags, text=text, api_key=api_key, profile=profile)
        ]

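    # A minimal usage sketch for ``list``, assuming authentication is already configured
    # (via an API key argument, a profile, or an environment variable). The tag and text
    # values below are hypothetical filter terms, not a fixed vocabulary from the API:
    #
    #   >>> datasets = Dataset.list(tags=['segmentation'], text=['land cover'])
    #   >>> [d.id for d in datasets]  # doctest: +SKIP
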
    @classmethod
    def fetch_by_doi(cls, dataset_doi: str, *, api_key: Optional[str] = None, profile: Optional[str] = None) -> "Dataset":
        """Creates a :class:`Dataset` instance by fetching the dataset with the given DOI from the
        Radiant MLHub API.

        Parameters
        ----------
        dataset_doi : str
            The DOI of the dataset to fetch (e.g. ``10.6084/m9.figshare.12047478.v2``).
        api_key : str
            An API key to use for this request. This will override an API key set in a profile or
            in an environment variable.
        profile : str
            A profile to use when making this request.

        Returns
        -------
        dataset : Dataset
        """
        return cls(
            **client.get_dataset_by_doi(dataset_doi, api_key=api_key, profile=profile),
            api_key=api_key,
            profile=profile,
        )

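    # A minimal usage sketch for ``fetch_by_doi``, reusing the example DOI from the
    # docstring above:
    #
    #   >>> dataset = Dataset.fetch_by_doi('10.6084/m9.figshare.12047478.v2')
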
    @classmethod
    def fetch_by_id(cls, dataset_id: str, *, api_key: Optional[str] = None, profile: Optional[str] = None) -> 'Dataset':
        """Creates a :class:`Dataset` instance by fetching the dataset with the given ID from the
        Radiant MLHub API.

        Parameters
        ----------
        dataset_id : str
            The ID of the dataset to fetch (e.g. ``bigearthnet_v1``).
        api_key : str
            An API key to use for this request. This will override an API key set in a profile or
            in an environment variable.
        profile : str
            A profile to use when making this request.

        Returns
        -------
        dataset : Dataset
        """
        # Pass api_key/profile through to the constructor so the returned Dataset carries
        # the same session credentials as the other fetch methods.
        return cls(
            **client.get_dataset_by_id(dataset_id, api_key=api_key, profile=profile),
            api_key=api_key,
            profile=profile,
        )

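    # A minimal usage sketch for ``fetch_by_id``, reusing the example ID from the
    # docstring above:
    #
    #   >>> dataset = Dataset.fetch_by_id('bigearthnet_v1')
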
    @classmethod
    def fetch(cls, dataset_id_or_doi: str, *, api_key: Optional[str] = None, profile: Optional[str] = None) -> 'Dataset':
        """Creates a :class:`Dataset` instance by first trying to fetch the dataset based on ID,
        then falling back to fetching by DOI.

        Parameters
        ----------
        dataset_id_or_doi : str
            The ID or DOI of the dataset to fetch (e.g. ``bigearthnet_v1``).
        api_key : str
            An API key to use for this request. This will override an API key set in a profile or
            in an environment variable.
        profile : str
            A profile to use when making this request.

        Returns
        -------
        dataset : Dataset
        """
        return cls(
            **client.get_dataset(dataset_id_or_doi, api_key=api_key, profile=profile),
            api_key=api_key,
            profile=profile,
        )

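    # A minimal usage sketch for ``fetch``: it accepts either form of identifier, trying
    # an ID lookup first and falling back to DOI. Both values below reuse the docstring
    # examples and are not guaranteed to refer to the same dataset:
    #
    #   >>> Dataset.fetch('bigearthnet_v1')                    # resolved as an ID
    #   >>> Dataset.fetch('10.6084/m9.figshare.12047478.v2')   # resolved as a DOI
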
    def __str__(self) -> str:
        """Return the "informal" or nicely printable string representation of the dataset."""
        return f'{self.id}: {self.title}'

    @property
    def stac_catalog_size(self) -> Optional[int]:
        """Size of the ``{dataset_id}.tar.gz`` STAC archive, in bytes."""
        info = client.get_catalog_info(self.id, **self.session_kwargs)
        return info.get('stac_catalog_size', None)

    @property
    def estimated_dataset_size(self) -> Optional[int]:
        """Estimated size of the entire dataset, in bytes."""
        info = client.get_catalog_info(self.id, **self.session_kwargs)
        return info.get('estimated_dataset_size', None)

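    # A sketch of checking sizes before downloading. Note that each property issues its
    # own ``get_catalog_info`` request, so reading both makes two API calls; the
    # formatting below is illustrative only:
    #
    #   >>> dataset = Dataset.fetch('bigearthnet_v1')
    #   >>> if dataset.estimated_dataset_size is not None:
    #   ...     print(f'~{dataset.estimated_dataset_size / 1e9:.1f} GB of assets')
    #   >>> if dataset.stac_catalog_size is not None:
    #   ...     print(f'{dataset.stac_catalog_size} byte STAC archive')
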
    def download(
        self,
        output_dir: Union[Path, str] = Path.cwd(),
        *,
        catalog_only: bool = False,
        if_exists: DownloadIfExistsOpts = DownloadIfExistsOpts.resume,
        api_key: Optional[str] = None,
        profile: Optional[str] = None,
        bbox: Optional[List[float]] = None,
        intersects: Optional[GeoJSON] = None,
        datetime: Optional[Union[datetime, Tuple[datetime, datetime]]] = None,
        collection_filter: Optional[Dict[str, List[str]]] = None,
    ) -> None:
        """Downloads the dataset's STAC catalog and all linked assets. The download may be
        customized and controlled by providing ``bbox``, ``intersects``, ``datetime``, and
        ``collection_filter`` options.

        Parameters
        ----------
        output_dir : str or pathlib.Path
            The directory into which the archives will be written. Defaults to the current
            working directory.
        catalog_only : bool
            If ``True``, the STAC catalog will be downloaded and unarchived, but no assets
            will be downloaded. Defaults to ``False``.
        if_exists : str, optional
            Allowed values: ``skip``, ``overwrite``, or ``resume`` (default).
        bbox : Optional[List[float]]
            List representing a bounding box of coordinates, for the spatial intersection
            filter. Must be in CRS EPSG:4326.
        intersects : Optional[GeoJSON]
            GeoJSON object for the spatial intersects filter. Must be a parsed GeoJSON dict
            with a ``geometry`` property.
        datetime : Optional[datetime, Tuple[datetime, datetime]]
            Single datetime or datetime range for the temporal filter.
        collection_filter : Optional[Dict[str, list]]
            Mapping of collection_id to the asset keys to include (exclusively). Examples:

            * download will only include this collection:
              ``dict(ref_landcovernet_sa_v1_source_sentinel_2=[])``
            * download will only include this collection and only these asset keys:
              ``dict(ref_landcovernet_sa_v1_source_sentinel_2=["B02", "B03", "B04"])``
        api_key : Optional[str]
            An API key to use for this request. This will override an API key set in a
            profile or in an environment variable.
        profile : Optional[str]
            Authentication profile to use when making this request.

        Raises
        ------
        IOError
            If ``output_dir`` exists and is not a directory, or if unrecoverable download
            errors occurred.
        ValueError
            If the provided filters are incompatible (for example, ``bbox`` and ``intersects``).
        RuntimeError
            If the filters result in zero assets to download.

        Error Reporting
        ---------------
        Any unrecoverable download errors will be logged to
        ``{output_dir}/{dataset_id}/err_report.csv``.
        """
        assert output_dir
        output_path = Path(output_dir)
        if output_path.exists() and not output_path.is_dir():
            raise IOError('output_dir is not a directory.')
        output_path.mkdir(exist_ok=True, parents=True)
        config = CatalogDownloaderConfig(
            catalog_only=catalog_only,
            api_key=api_key,
            bbox=bbox,
            dataset_id=self.id,
            collection_filter=collection_filter,
            if_exists=if_exists,
            intersects=intersects,
            output_dir=output_path,
            profile=profile,
            session=get_session(api_key=api_key, profile=profile),
            temporal_query=datetime,
        )
        dl = CatalogDownloader(config=config)
        dl()

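# A sketch of a filtered download, assuming EPSG:4326 ``[west, south, east, north]``
# ordering for ``bbox`` (the STAC convention). The coordinates, date range, and
# collection/asset keys are hypothetical values for illustration:
#
#   >>> from datetime import datetime
#   >>> dataset = Dataset.fetch('bigearthnet_v1')
#   >>> dataset.download(
#   ...     'data',
#   ...     bbox=[-122.6, 37.2, -121.8, 38.0],
#   ...     datetime=(datetime(2018, 1, 1), datetime(2018, 12, 31)),
#   ...     collection_filter=dict(bigearthnet_v1_source=['B02', 'B03', 'B04']),
#   ... )
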
class CollectionType(Enum):
    """Valid values for the type of a collection associated with a Radiant MLHub dataset."""

    SOURCE = 'source_imagery'
    LABELS = 'labels'

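# A small sketch of how ``CollectionType`` behaves: it maps the ``types`` strings found
# in a dataset's collection descriptions onto enum members, and unknown strings raise a
# ``ValueError`` (standard ``Enum`` lookup behavior):
#
#   >>> CollectionType('labels') is CollectionType.LABELS
#   True
#   >>> CollectionType('thumbnails')
#   Traceback (most recent call last):
#       ...
#   ValueError: 'thumbnails' is not a valid CollectionType
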
class _CollectionWithType:
    def __init__(self, collection: Collection, types: List[CollectionType]):
        self.types = [CollectionType(type_) for type_ in types]
        self.collection = collection


class _CollectionList:
    """Used internally by :class:`Dataset` to create a list of collections that can also be
    accessed by type using the ``source_imagery`` and ``labels`` attributes."""

    _source_imagery: Optional[List[Collection]]
    _labels: Optional[List[Collection]]
    _collections: List[_CollectionWithType]

    def __init__(self, collections_with_type: List[_CollectionWithType]):
        self._collections = collections_with_type
        self._source_imagery = None
        self._labels = None

    def __iter__(self) -> Iterator[Collection]:
        for item in self._collections:
            yield item.collection

    def __len__(self) -> int:
        return len(self._collections)

    def __getitem__(self, item: int) -> Collection:
        return self._collections[item].collection

    def __repr__(self) -> str:
        return repr(list(self))

    @property
    def source_imagery(self) -> List[Collection]:
        if self._source_imagery is None:
            self._source_imagery = [
                c.collection for c in self._collections
                if any(type_ is CollectionType.SOURCE for type_ in c.types)
            ]
        return self._source_imagery

    @property
    def labels(self) -> List[Collection]:
        if self._labels is None:
            self._labels = [
                c.collection for c in self._collections
                if any(type_ is CollectionType.LABELS for type_ in c.types)
            ]
        return self._labels