Source code for radiant_mlhub.models.collection

"""Extensions of the `PySTAC <https://pystac.readthedocs.io/en/latest/>`_ classes that provide convenience methods for interacting
with the `Radiant MLHub API <https://docs.mlhub.earth/#radiant-mlhub-api>`_."""

from __future__ import annotations

import os
from copy import deepcopy
from pathlib import Path
from typing import Any, Dict, Iterable, Iterator, List, Optional, Union, cast

import pystac.catalog
import pystac.collection
import pystac.item
import pystac.link
import pystac.provider
import pystac.summaries

from .. import client
from ..exceptions import EntityDoesNotExist
from ..if_exists import DownloadIfExistsOpts

# Type alias: either a single string or an iterable of strings.
# NOTE(review): presumably used for "tag" arguments elsewhere in the package; it is not
# referenced by the code visible in this module -- confirm against callers.
TagOrTagList = Union[str, Iterable[str]]
# Type alias with the same definition as ``TagOrTagList``.
# NOTE(review): presumably intended for free-text arguments -- confirm against callers.
TextOrTextList = Union[str, Iterable[str]]


class Collection(pystac.collection.Collection):
    """Class inheriting from :class:`pystac.Collection` that adds some convenience methods for listing and fetching
    from the Radiant MLHub API.

    Instances remember the ``api_key`` and/or ``profile`` they were created with (in ``session_kwargs``) so that
    later requests made through the instance can reuse them.
    """

    # Cached archive size in bytes. ``-1`` means "not looked up yet"; ``None`` means the archive does not exist.
    _archive_size: Optional[int]

    def __init__(
        self,
        id: str,
        description: str,
        extent: pystac.collection.Extent,
        title: Optional[str] = None,
        stac_extensions: Optional[List[str]] = None,
        href: Optional[str] = None,
        extra_fields: Optional[Dict[str, Any]] = None,
        catalog_type: Optional[pystac.catalog.CatalogType] = None,
        license: str = "proprietary",
        keywords: Optional[List[str]] = None,
        providers: Optional[List[pystac.provider.Provider]] = None,
        summaries: Optional[pystac.summaries.Summaries] = None,
        *,
        api_key: Optional[str] = None,
        profile: Optional[str] = None,
    ):
        """Initialize the collection and record the session credentials.

        All positional/keyword arguments other than ``api_key`` and ``profile`` are forwarded unchanged to
        :class:`pystac.Collection`.

        Parameters
        ----------
        api_key : str, optional
            An API key stored on the instance and reused by later requests.
        profile : str, optional
            A profile name stored on the instance and reused by later requests.
        """
        super().__init__(
            id,
            description,
            extent,
            title=title,
            stac_extensions=stac_extensions,
            href=href,
            extra_fields=extra_fields,
            catalog_type=catalog_type,
            license=license,
            keywords=keywords,
            providers=providers,
            summaries=summaries,
        )

        # Only keys that were explicitly provided are stored, so the dict can be splatted into client calls.
        self.session_kwargs: Dict[str, Any] = {}
        if api_key is not None:
            self.session_kwargs['api_key'] = api_key
        if profile is not None:
            self.session_kwargs['profile'] = profile

        # Use -1 here instead of None because None represents the case where the archive does not
        # exist (HEAD returns a 404).
        self._archive_size = -1

    @classmethod
    def list(cls, *, api_key: Optional[str] = None, profile: Optional[str] = None) -> List['Collection']:
        """Returns a list of :class:`Collection` instances for all collections hosted by MLHub.

        See the :ref:`Authentication` documentation for details on how authentication is handled for this request.

        Parameters
        ----------
        api_key : str
            An API key to use for this request. This will override an API key set in a profile or using an
            environment variable
        profile: str
            A profile to use when making this request.

        Returns
        -------
        collections : List[Collection]
        """
        # Forward the credentials to from_dict so each returned instance can reuse them, matching fetch().
        return [
            cls.from_dict(collection, api_key=api_key, profile=profile)
            for collection in client.list_collections(api_key=api_key, profile=profile)
        ]

    @classmethod
    def from_dict(
        cls,
        d: Dict[str, Any],
        href: Optional[str] = None,
        root: Optional[pystac.catalog.Catalog] = None,
        migrate: bool = False,
        preserve_dict: bool = True,
        *,
        api_key: Optional[str] = None,
        profile: Optional[str] = None
    ) -> "Collection":
        """Patches the :meth:`pystac.Collection.from_dict` method so that it returns the calling class instead of
        always returning a :class:`pystac.Collection` instance.

        The input dictionary is always deep-copied, so the caller's object is never modified. The ``root``,
        ``migrate`` and ``preserve_dict`` arguments are accepted but are not used by this implementation.

        Parameters
        ----------
        d : dict
            Dictionary representation of the collection. Must contain the ``id``, ``description``, ``license``,
            ``extent``, ``links`` and ``stac_version`` keys.
        href : str, optional
            HREF to assign to the collection. When given, any ``self`` link found in ``d`` is skipped.
        api_key : str, optional
            An API key to store on the returned instance.
        profile : str, optional
            A profile to store on the returned instance.

        Returns
        -------
        collection : Collection

        Raises
        ------
        KeyError
            If one of the required keys is missing from ``d``.
        """
        catalog_type = pystac.catalog.CatalogType.determine_type(d)

        d = deepcopy(d)
        id_ = d.pop('id')
        description = d.pop('description')
        license_ = d.pop('license')
        extent = pystac.collection.Extent.from_dict(d.pop('extent'))

        # These keys are read with ``get`` (not ``pop``), so they also remain in ``extra_fields``.
        title = d.get('title')
        stac_extensions = d.get('stac_extensions')
        keywords = d.get('keywords')

        providers = d.get('providers')
        if providers is not None:
            providers = [pystac.provider.Provider.from_dict(provider) for provider in providers]

        # The constructor expects a Summaries object, not the raw dictionary found in the STAC document.
        summaries = d.get('summaries')
        if summaries is not None and not isinstance(summaries, pystac.summaries.Summaries):
            summaries = pystac.summaries.Summaries(summaries)

        links = d.pop('links')
        d.pop('stac_version')

        collection = cls(
            id=id_,
            description=description,
            extent=extent,
            title=title,
            stac_extensions=stac_extensions,
            extra_fields=d,
            license=license_,
            keywords=keywords,
            providers=providers,
            summaries=summaries,
            href=href,
            catalog_type=catalog_type,
            api_key=api_key,
            profile=profile
        )

        for link in links:
            if link['rel'] == 'root':
                # Remove the link that's generated in Catalog's constructor.
                collection.remove_links('root')

            # A "self" link from the document is only kept when no explicit href was supplied.
            if link['rel'] != 'self' or href is None:
                collection.add_link(pystac.link.Link.from_dict(link))

        return collection

    @classmethod
    def fetch(cls, collection_id: str, *, api_key: Optional[str] = None, profile: Optional[str] = None) -> 'Collection':
        """Creates a :class:`Collection` instance by fetching the collection with the given ID from the Radiant
        MLHub API.

        Parameters
        ----------
        collection_id : str
            The ID of the collection to fetch (e.g. ``bigearthnet_v1_source``).
        api_key : str
            An API key to use for this request. This will override an API key set in a profile or using an
            environment variable
        profile: str
            A profile to use when making this request.

        Returns
        -------
        collection : Collection
        """
        response = client.get_collection(collection_id, api_key=api_key, profile=profile)
        return cls.from_dict(response, api_key=api_key, profile=profile)

    def __str__(self) -> str:
        """Return the "informal" or nicely printable string representation of an object."""
        return f'{self.id}: {self.description}'

    def get_items(self, *, api_key: Optional[str] = None, profile: Optional[str] = None) -> Iterator[pystac.item.Item]:
        """
        .. note::

            The ``get_items`` method is not implemented for Radiant MLHub :class:`Collection` instances for
            performance reasons. Please use the :meth:`Dataset.download` method to download Dataset assets.

        Raises
        ------
        NotImplementedError
        """
        raise NotImplementedError('For performance reasons, the get_items method has not been implemented for Collection instances. Please '
                                  'use the Dataset.download method to download Dataset assets.')

    def fetch_item(self, item_id: str, *, api_key: Optional[str] = None, profile: Optional[str] = None) -> pystac.item.Item:
        """Fetches a single item belonging to this collection from the Radiant MLHub API.

        Parameters
        ----------
        item_id : str
            The ID of the item to fetch.
        api_key : str, optional
            An API key to use for this request. Falls back to the API key stored on this instance (if any).
        profile : str, optional
            A profile to use when making this request. Falls back to the profile stored on this instance (if any).

        Returns
        -------
        item : pystac.item.Item
        """
        api_key = api_key or self.session_kwargs.get("api_key")
        profile = profile or self.session_kwargs.get("profile")
        response = client.get_collection_item(self.id, item_id, api_key=api_key, profile=profile)
        return pystac.item.Item.from_dict(response)

    def download(
        self,
        output_dir: Union[str, Path],
        *,
        if_exists: DownloadIfExistsOpts = DownloadIfExistsOpts.resume,
        api_key: Optional[str] = None,
        profile: Optional[str] = None
    ) -> Path:
        """Downloads the archive for this collection to an output directory.

        The ``if_exists`` argument determines how to handle an existing archive file in the output directory. See
        the documentation for the :func:`~radiant_mlhub.client.download_archive` function for details. The default
        behavior is to resume downloading if the existing file is incomplete and skip the download if it is
        complete.

        .. note::

            Some collections may be very large and take a significant amount of time to download, depending on your
            connection speed.

        Parameters
        ----------
        output_dir : str or Path
            Path to a local directory to which the file will be downloaded. File name will be generated
            automatically based on the download URL.
        if_exists : DownloadIfExistsOpts, optional
            How to handle an existing archive at the same location. If ``"skip"``, the download will be skipped. If
            ``"overwrite"``, the existing file will be overwritten and the entire file will be re-downloaded. If
            ``"resume"`` (the default), the existing file size will be compared to the size of the download (using
            the ``Content-Length`` header). If the existing file is smaller, then only the remaining portion will be
            downloaded. Otherwise, the download will be skipped.
        api_key : str
            An API key to use for this request. This will override an API key set in a profile or using an
            environment variable. Falls back to the API key stored on this instance (if any).
        profile: str
            A profile to use when making this request. Falls back to the profile stored on this instance (if any).

        Returns
        -------
        output_path : pathlib.Path
            The path to the downloaded archive file.

        Notes
        -----
        Any exception raised by :func:`radiant_mlhub.client.download_collection_archive` propagates to the caller.
        """
        # Explicit arguments win; otherwise reuse the credentials stored on the instance instead of
        # overwriting them with None (same fallback rule as fetch_item).
        session_kwargs = {
            **self.session_kwargs,
            "api_key": api_key or self.session_kwargs.get("api_key"),
            "profile": profile or self.session_kwargs.get("profile"),
        }
        return client.download_collection_archive(
            self.id,
            output_dir=os.fspath(output_dir),
            if_exists=if_exists,
            **session_kwargs
        )

    @property
    def registry_url(self) -> Optional[str]:
        """The URL of the registry page for this Collection. The URL is based on the DOI identifier for the
        collection. If the Collection does not have a ``"sci:doi"`` property then ``registry_url`` will be
        ``None``."""
        # Some Collections don't publish the "scientific" extension in their "stac_extensions"
        # attribute so we access this via "extra_fields" rather than through self.ext["scientific"].
        doi = self.extra_fields.get("sci:doi")
        if doi is None:
            return None
        return f'https://mlhub.earth/{doi}'

    @property
    def archive_size(self) -> Optional[int]:
        """The size of the tarball archive for this collection in bytes (or ``None`` if the archive does not
        exist). The value is looked up on first access and cached on the instance afterwards."""
        # Use -1 here instead of None because None represents the case where the archive does not
        # exist (HEAD returns a 404).
        if self._archive_size == -1:
            try:
                self._archive_size = client.get_collection_archive_info(self.id, **self.session_kwargs).get('size')
            except EntityDoesNotExist:
                self._archive_size = None

        return self._archive_size