diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 6f4075a..e1d0793 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -12,12 +12,12 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [3.9, "3.10", 3.11, 3.12, 3.13] + python-version: ["3.10", 3.11, 3.12, 3.13, 3.14] steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v5 + uses: actions/setup-python@v6 with: python-version: ${{ matrix.python-version }} - name: Install dependencies @@ -26,4 +26,4 @@ jobs: pip install .[test] - name: Test with pytest run: | - pytest + pytest -m 'not with_catalog and not with_internet' diff --git a/CHANGELOG.md b/CHANGELOG.md index 236efef..66c0d97 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,8 +1,19 @@ # Changes -## [1.14.3] +## unreleased - Fixed issue where `makecldf` could not be run on a dataset in a git repos with no commits. +- Drop py3.8 compat. +- Removed dependency on requests and attrs. + +Note: Functionality requiring `pyglottolog` or `pyconcepticon` will only work once versions of +these packages are released which are compatible with `clldutils` 4.x. + +### Backwards incompatible changes + +- removed `util.get_url` function. +- `metadata.Metadata` is no longer an `attrs`-decorated class, so inheriting classes (to implement + custom scaffold metadata) must be changed to `dataclasses`. - Pin dependencies for packages which are about to get incompatible new major versions. - Last version of the 1.x series. 
diff --git a/RELEASING.md b/RELEASING.md index bf664a7..b30aece 100644 --- a/RELEASING.md +++ b/RELEASING.md @@ -9,6 +9,10 @@ ```shell flake8 src ``` +- Make sure pylint passes with a score of 10: + ```shell + pylint src + ``` - Make sure the docs render: ```shell diff --git a/setup.cfg b/setup.cfg index 9ee8f73..87247d1 100644 --- a/setup.cfg +++ b/setup.cfg @@ -18,12 +18,12 @@ classifiers = Natural Language :: English Operating System :: OS Independent Programming Language :: Python :: 3 - Programming Language :: Python :: 3.8 Programming Language :: Python :: 3.9 Programming Language :: Python :: 3.10 Programming Language :: Python :: 3.11 Programming Language :: Python :: 3.12 Programming Language :: Python :: 3.13 + Programming Language :: Python :: 3.14 Programming Language :: Python :: Implementation :: CPython Programming Language :: Python :: Implementation :: PyPy License :: OSI Approved :: Apache Software License @@ -32,19 +32,16 @@ classifiers = packages = find: package_dir = = src -python_requires = >=3.8 +python_requires = >=3.9 install_requires = # Pin until 2.0.1 is released, see https://github.com/python-hyper/rfc3986/issues/107 rfc3986<2 - csvw<4 - clldutils<4 + csvw>=4.0 + clldutils>=4.0 cldfcatalog>=1.5.1 - pycldf<2 + pycldf>=2.0 termcolor - requests - appdirs pytest - zenodoclient>=0.3 simplepybtex tqdm @@ -99,6 +96,9 @@ max-line-length = 100 exclude = .tox [tool:pytest] +markers = + with_catalog: mark a test requiring a catalog (Glottolog or Concepticon). + with_internet: test requiring an internet connection. 
minversion = 5 testpaths = tests addopts = --cov @@ -116,10 +116,10 @@ show_missing = true skip_covered = true [tox:tox] -envlist = py38, py39, py310, py311, py312, py313 +envlist = py39, py310, py311, py312, py313, py314 isolated_build = true skip_missing_interpreter = true [testenv] deps = .[test] -commands = pytest {posargs} +commands = pytest -m 'not with_catalog' {posargs} diff --git a/src/cldfbench/__init__.py b/src/cldfbench/__init__.py index 469bec0..4723f24 100644 --- a/src/cldfbench/__init__.py +++ b/src/cldfbench/__init__.py @@ -1,4 +1,10 @@ -# +""" +The cldfbench package. + +.. seealso:: + + https://aclanthology.org/anthology-files/anthology-files/pdf/lrec/2020.lrec-1.864.pdf +""" from cldfbench.dataset import * # noqa: F401, F403 from cldfbench.cldf import * # noqa: F401, F403 from cldfbench.metadata import * # noqa: F401, F403 diff --git a/src/cldfbench/__main__.py b/src/cldfbench/__main__.py index 930183a..256696f 100644 --- a/src/cldfbench/__main__.py +++ b/src/cldfbench/__main__.py @@ -8,7 +8,9 @@ """ import csv import sys +import argparse import contextlib +from typing import Optional from clldutils.clilib import ( register_subcommands, get_parser_and_subparsers, ParserError, add_csv_field_size_limit, @@ -16,16 +18,49 @@ ) from clldutils.loglib import Logging from cldfcatalog import Config -import termcolor -import argparse import cldfbench from cldfbench.catalogs import BUILTIN_CATALOGS from cldfbench.cli_util import IGNORE_MISSING +from cldfbench.util import colored import cldfbench.commands -def main(args=None, catch_all=False, parsed_args=None, log=None): +def print_red(text, **kw): # pylint: disable=C0116 + print(colored('red', text, **kw)) + + +def _add_catalog( + cls: type, + cfg: Config, + args: argparse.Namespace, + stack: contextlib.ExitStack, +) -> tuple[Optional[Exception], bool]: + """Catalogs are context managers, so they have to be added to the exit stack.""" + name = cls.cli_name() + if not hasattr(args, name): + return None, False 
+ path = getattr(args, name) + from_cfg = False + if path != IGNORE_MISSING: + if (not path) and (not args.no_config): + try: + path = cfg.get_clone(name) + from_cfg = True + except KeyError as e: # pragma: no cover + return e, False + try: + version = getattr(args, name + '_version', None) + setattr(args, name, stack.enter_context(cls(path, version))) + assert getattr(args, name).api + except ValueError as e: + return e, from_cfg + else: + setattr(args, name, None) # pragma: no cover + return None, False + + +def main(args=None, catch_all=False, parsed_args=None, log=None): # pylint: disable=C0116,R0911 parser, subparsers = get_parser_and_subparsers(cldfbench.__name__) # We add a "hidden" option to turn-off config file reading in tests: @@ -42,6 +77,10 @@ def main(args=None, catch_all=False, parsed_args=None, log=None): parser.print_help() return 1 + def cmd_help(err): + print_red(err + '\n', attrs={'bold'}) + return main([args._command, '-h']) # pylint: disable=W0212 + with contextlib.ExitStack() as stack: if not log: # pragma: no cover stack.enter_context(Logging(args.log, level=args.log_level)) @@ -54,47 +93,25 @@ def main(args=None, catch_all=False, parsed_args=None, log=None): for cls in BUILTIN_CATALOGS: # Now we loop over known catalogs, see whether they are used by the command, # and if so, "enter" the catalog. 
- name, from_cfg = cls.cli_name(), False - if hasattr(args, name): - # If no path was passed on the command line, we look up the config: - path = getattr(args, name) - if path != IGNORE_MISSING: - if (not path) and (not args.no_config): - try: - path = cfg.get_clone(name) - from_cfg = True - except KeyError as e: # pragma: no cover - print(termcolor.colored(str(e) + '\n', 'red')) - return main([args._command, '-h']) - try: - setattr( - args, - name, - stack.enter_context( - cls(path, getattr(args, name + '_version', None))), - ) - assert getattr(args, name).api - except ValueError as e: - print(termcolor.colored( - '\nError initializing catalog {0}'.format(name), 'red')) - if from_cfg: - print( - termcolor.colored('from config {0}'.format(cfg.fname()), 'red')) - print(termcolor.colored(str(e) + '\n', 'red')) - return main([args._command, '-h']) - else: - setattr(args, name, None) # pragma: no cover + e, from_cfg = _add_catalog(cls, cfg, args, stack) + if isinstance(e, KeyError): # pragma: no cover + return cmd_help(str(e)) + if isinstance(e, ValueError): + print_red(f'\nError initializing catalog {cls.cli_name()}') + if from_cfg: + print_red(f'from config {cfg.fname()}') + return cmd_help(str(e)) + assert e is None try: return args.main(args) or 0 except KeyboardInterrupt: # pragma: no cover return 0 except ParserError as e: - print(termcolor.colored('ERROR: {}\n'.format(e), 'red', attrs={'bold'})) - return main([args._command, '-h']) + return cmd_help(f'ERROR: {e}') except Exception as e: if catch_all: # pragma: no cover - print(termcolor.colored('ERROR: {}\n'.format(e), 'red', attrs={'bold'})) + print_red(f'ERROR: {e}\n', attrs={'bold'}) return 1 raise diff --git a/src/cldfbench/_compat.py b/src/cldfbench/_compat.py new file mode 100644 index 0000000..f246f28 --- /dev/null +++ b/src/cldfbench/_compat.py @@ -0,0 +1,24 @@ +""" +Backwards compatibility with supported python versions. 
+""" +import sys +import datetime +import functools + + +if (sys.version_info.major, sys.version_info.minor) >= (3, 10): # pragma: no cover + def entry_points_select(eps, group): + """ + Starting with Python 3.10, `importlib.metadata.entry_points` returns `EntryPoints`.""" + return eps.select(group=group) +else: + def entry_points_select(eps, group): # pragma: no cover + """In Python 3.9, `importlib.metadata.entry_points` returns a `dict`.""" + return eps.get(group, []) + + +if (sys.version_info.major, sys.version_info.minor) >= (3, 11): # pragma: no cover + # datetime.UTC was added in py3.11. + utcnow = functools.partial(datetime.datetime.now, datetime.UTC) +else: # pragma: no cover + utcnow = datetime.datetime.utcnow diff --git a/src/cldfbench/catalogs.py b/src/cldfbench/catalogs.py index 8f766df..dfae6f6 100644 --- a/src/cldfbench/catalogs.py +++ b/src/cldfbench/catalogs.py @@ -7,10 +7,10 @@ - support to access the Python API for each catalog from the `Catalog` object, - automatic registration of catalogs as provenance information when writing CLDF. 
""" -import typing +from typing import Union, Optional +import functools from cldfcatalog import Catalog -from clldutils.misc import lazyproperty try: # pragma: no cover import pyglottolog @@ -18,39 +18,39 @@ from pyglottolog.config import Macroarea class CachingGlottologAPI(pyglottolog.Glottolog): + """Wraps Glottolog to avoid expensive lookups.""" def __init__(self, p): super().__init__(p) self.__languoids = None - def languoids(self, **kw): + def languoids(self, *args, **kw): # pylint: disable=C0116 if not kw: if not self.__languoids: self.__languoids = list(super().languoids()) return self.__languoids - return super().languoids(**kw) + return super().languoids(*args, **kw) - @lazyproperty - def cached_languoids(self) -> typing.Dict[str, Languoid]: + @functools.cached_property + def cached_languoids(self) -> dict[str, Languoid]: # pylint: disable=C0116 return {lang.id: lang for lang in self.languoids()} - @lazyproperty - def languoid_details(self) -> typing.Dict[str, typing.Tuple]: + @functools.cached_property + def languoid_details(self) -> dict[str, tuple[str, list, str]]: # pylint: disable=C0116 return {lid: (l.iso, l.macroareas, l.name) for lid, l in self.cached_languoids.items()} - @lazyproperty - def glottocode_by_name(self) -> typing.Dict[str, str]: + @functools.cached_property + def glottocode_by_name(self) -> dict[str, str]: # pylint: disable=C0116 return {l[2]: lid for lid, l in self.languoid_details.items()} - @lazyproperty - def glottocode_by_iso(self) -> typing.Dict[str, str]: + @functools.cached_property + def glottocode_by_iso(self) -> dict[str, str]: # pylint: disable=C0116 return {l[0]: lid for lid, l in self.languoid_details.items() if l[0]} - @lazyproperty - def macroareas_by_glottocode(self) -> typing.Dict[str, typing.List[Macroarea]]: + @functools.cached_property + def macroareas_by_glottocode(self) -> dict[str, list[Macroarea]]: # pylint: disable=C0116 return {lid: l[1] for lid, l in self.languoid_details.items()} - def get_language(self, 
languoid: typing.Union[str, Languoid]) \ - -> typing.Union[Languoid, None]: + def get_language(self, languoid: Union[str, Languoid]) -> Optional[Languoid]: """ :param languoid: A languoid specified via Glottocode or passed as `Languoid` instance. :return: Language-level languoid associated with `languoid` or `None` if `languoid` is \ @@ -59,34 +59,36 @@ def get_language(self, languoid: typing.Union[str, Languoid]) \ if isinstance(languoid, str): languoid = self.cached_languoids[languoid] if languoid.level == self.languoid_levels.family: - return + return None if languoid.level == self.languoid_levels.language: return languoid for _, gc, _ in reversed(languoid.lineage): parent = self.cached_languoids[gc] if parent.level == self.languoid_levels.language: return parent + return None except ImportError: # pragma: no cover - CachingGlottologAPI = pyglottolog = 'pyglottolog' + CachingGlottologAPI = pyglottolog = 'pyglottolog' # pylint: disable=invalid-name try: # pragma: no cover import pyconcepticon class CachingConcepticonAPI(pyconcepticon.Concepticon): - @lazyproperty - def cached_glosses(self): + """Wraps Concepticon to avoid expensive file reads.""" + @functools.cached_property + def cached_glosses(self) -> dict[int, str]: # pylint: disable=C0116 return {int(cs.id): cs.gloss for cs in self.conceptsets.values()} except ImportError: # pragma: no cover - CachingConcepticonAPI = pyconcepticon = 'pyconcepticon' + CachingConcepticonAPI = pyconcepticon = 'pyconcepticon' # pylint: disable=invalid-name try: # pragma: no cover import pyclts class CLTSAPI(pyclts.api.CLTS): - pass + """Cross-Linguistic Transcription Systems API.""" except ImportError: # pragma: no cover CLTSAPI = pyclts = 'pyclts' diff --git a/src/cldfbench/ci.py b/src/cldfbench/ci.py index 9f440db..8fe8a45 100644 --- a/src/cldfbench/ci.py +++ b/src/cldfbench/ci.py @@ -19,12 +19,12 @@ runs-on: ubuntu-latest strategy: matrix: - python-version: [3.9] + python-version: [3.12] steps: - - uses: actions/checkout@v3 + 
- uses: actions/checkout@v6 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 + uses: actions/setup-python@v6 with: python-version: ${{ matrix.python-version }} - name: Install dependencies @@ -46,14 +46,15 @@ def build_status_badge(dataset): """ if dataset.repo and dataset.repo.github_repo: # pragma: no cover if dataset.dir.joinpath('.github/workflows', CONFIG_FNAME).exists(): - return "[![CLDF validation]" \ - "(https://github.com/{0}/workflows/CLDF-validation/badge.svg)]" \ - "(https://github.com/{0}/actions?query=workflow%3ACLDF-validation)".format( - dataset.repo.github_repo) + repo_url = f"https://github.com/{dataset.repo.github_repo}" + return f"[![CLDF validation]" \ + f"({repo_url}/workflows/CLDF-validation/badge.svg)]" \ + f"({repo_url}/actions?query=workflow%3ACLDF-validation)" return '' -def setup(dataset, force=False): +def setup(dataset, force=False) -> bool: + """Tries to write a CLDF test workflow to the .github directory.""" yml = dataset.dir / '.github' / 'workflows' / CONFIG_FNAME if ((not dataset.repo) or (not dataset.repo.github_repo) or yml.exists()) and not force: return False # pragma: no cover @@ -68,8 +69,8 @@ def setup(dataset, force=False): tests = [] for spec in dataset.cldf_specs_dict.values(): - tests.append(' pytest --cldf-metadata={} test.py'.format( - dataset.cldf_dir.relative_to(dataset.dir) / spec.metadata_fname)) + rel_md_path = dataset.cldf_dir.relative_to(dataset.dir) / spec.metadata_fname + tests.append(f' pytest --cldf-metadata={rel_md_path} test.py') yml.write_text(CONFIG_YML % (branch, branch, '\n'.join(tests)), encoding='utf8') return True diff --git a/src/cldfbench/cldf.py b/src/cldfbench/cldf.py index 02b5477..b625db0 100644 --- a/src/cldfbench/cldf.py +++ b/src/cldfbench/cldf.py @@ -1,13 +1,17 @@ +""" +Functionality to be plugged into cldfbench datasets to make writing of CLDF datasets easier. 
+""" import sys import shutil import pathlib -import warnings +import argparse import collections +import dataclasses +from typing import Optional, Union -import attr -from csvw.metadata import Link +from csvw.metadata import Link, Table, Column import pycldf -from pycldf.dataset import get_modules, MD_SUFFIX, Dataset +from pycldf.dataset import get_module_impl, get_modules, MD_SUFFIX, Dataset, SchemaObjectType from pycldf.util import pkg_path from cldfcatalog import Repository @@ -15,10 +19,9 @@ from cldfbench.util import iter_requirements __all__ = ['CLDFWriter', 'CLDFSpec'] -WITH_ZIPPED = tuple(map(int, pycldf.__version__.split('.')[:2])) >= (1, 29) -class CLDFWriter(object): +class CLDFWriter: """ An object mediating writing data as proper CLDF dataset. @@ -36,15 +39,19 @@ class CLDFWriter(object): >>> with Writer(cldf_spec) as writer: ... writer.objects['ValueTable'].append(...) """ - def __init__(self, cldf_spec=None, args=None, dataset=None, clean=True): + def __init__(self, + cldf_spec: Optional['CLDFSpec'] = None, + args: argparse.Namespace = None, + dataset: Optional[pycldf.Dataset] = None, + clean: bool = True): """ :param cldf_spec: `CLDFSpec` instance :param args: `argparse.Namespace`, passed if the writer is instantiated from a cli command. :param dataset: `cldfbench.Dataset`, passed if instantiated from a dataset method. :param clean: `bool` flag signaling whether to clean the CLDF dir before writing. 
""" - self.cldf_spec = cldf_spec or CLDFSpec(dir=getattr(dataset, 'cldf_dir', '.')) - self.objects = collections.defaultdict(list) + self.cldf_spec: CLDFSpec = cldf_spec or CLDFSpec(dir=getattr(dataset, 'cldf_dir', '.')) + self.objects: dict[str, list] = collections.defaultdict(list) self.args = args self.dataset = dataset self._cldf = None @@ -61,7 +68,7 @@ def cldf(self) -> pycldf.Dataset: raise AttributeError('Writer.cldf is only set when Writer is used in with statement!') return self._cldf - def __getitem__(self, type_): + def __getitem__(self, type_: SchemaObjectType) -> Union[Table, Column]: """ Mirrors `pycldf.Dataset.__getitem__` """ @@ -99,41 +106,50 @@ def __exit__(self, exc_type, exc_val, exc_tb): """ When exiting the writer context, write data (and metadata) to disk. """ - if WITH_ZIPPED: - self.write(zipped=self.cldf_spec.zipped, **self.objects) - else: # pragma: no cover - self.write(**self.objects) - - def write(self, **kw): - self.cldf.properties.setdefault('rdf:type', 'http://www.w3.org/ns/dcat#Distribution') + self.write(zipped=self.cldf_spec.zipped, **self.objects) + + @staticmethod + def _get_sources( + dataset: Optional[Dataset], + args: argparse.Namespace, + props: dict, + ) -> list[dict]: srcs = [] # Let's see whether self.dataset is repository: - if self.dataset: - self.cldf.properties.setdefault('rdf:ID', self.dataset.id) - for k, v in self.dataset.metadata.common_props().items(): - self.cldf.properties.setdefault(k, v) - if self.dataset.repo: - if self.dataset.repo.url: - self.cldf.properties.setdefault('dcat:accessURL', self.dataset.repo.url) + if dataset: + props.setdefault('rdf:ID', dataset.id) + for k, v in dataset.metadata.common_props().items(): + props.setdefault(k, v) + if dataset.repo: + if dataset.repo.url: + props.setdefault('dcat:accessURL', dataset.repo.url) try: - srcs.append(self.dataset.repo.json_ld()) - except: # pragma: no cover # noqa: E722 + srcs.append(dataset.repo.json_ld()) + except: # pragma: no cover # noqa: E722 
# pylint: disable=W0702 # If a git repository has no commit, git describe fails. pass - if self.args: + if args: # We inspect the cli arguments to see whether some `Catalog`'s were used. - for cat in vars(self.args).values(): + for cat in vars(args).values(): if isinstance(cat, Catalog): srcs.append(cat.json_ld()) # And check, whether any repositories have been "mounted" via git submodules in raw/: - if self.dataset and self.dataset.raw_dir.exists(): - for p in self.dataset.raw_dir.iterdir(): + if dataset and dataset.raw_dir.exists(): + for p in dataset.raw_dir.iterdir(): if p.is_dir(): try: repo = Repository(p) except ValueError: continue srcs.append(repo.json_ld()) + return srcs + + def write(self, **kw): + """ + Write the data specified as lists of rows according to the metadata. + """ + self.cldf.properties.setdefault('rdf:type', 'http://www.w3.org/ns/dcat#Distribution') + srcs = self._get_sources(self.dataset, self.args, self.cldf.properties) if srcs: self.cldf.add_provenance(wasDerivedFrom=srcs) reqs = [ @@ -153,8 +169,8 @@ def write(self, **kw): self.cldf.write(**kw) -@attr.s -class CLDFSpec(object): +@dataclasses.dataclass +class CLDFSpec: """ Basic specification to initialize a CLDF Dataset. @@ -168,39 +184,38 @@ class CLDFSpec(object): :ivar zipped: An `iterable` listing component names or csv file names for which the \ corresponding tables should be zipped. 
""" - dir = attr.ib(converter=lambda s: pathlib.Path(s) if s else s) - module = attr.ib( - default='Generic', - converter=lambda cls: getattr(cls, '__name__', cls), - validator=attr.validators.in_([m.id for m in get_modules()]) - ) - default_metadata_path = attr.ib(default=None) - metadata_fname = attr.ib(default=None) - data_fnames = attr.ib(default=attr.Factory(dict)) - writer_cls = attr.ib(default=CLDFWriter) - zipped = attr.ib(default=attr.Factory(set)) - - def __attrs_post_init__(self): - if self.zipped and not WITH_ZIPPED: # pragma: no cover - warnings.warn('Writing zipped tables requires pycldf >= 1.29', category=UserWarning) + dir: pathlib.Path + module: str = 'Generic' + default_metadata_path: Optional[pathlib.Path] = None + metadata_fname: Optional[str] = None + data_fnames: Optional[dict[str, str]] = dataclasses.field(default_factory=dict) + writer_cls: type = CLDFWriter + zipped: Union[set[str], list[str]] = dataclasses.field(default_factory=set) + + def __post_init__(self): + self.dir = pathlib.Path(self.dir) + self.module = getattr(self.module, '__name__', self.module) + if self.module not in {m.id for m in get_modules()}: + raise ValueError(f'Invalid module: {self.module}') + if self.default_metadata_path: self.default_metadata_path = pathlib.Path(self.default_metadata_path) try: Dataset.from_metadata(self.default_metadata_path) - except Exception: - raise ValueError('invalid default metadata: {0}'.format(self.default_metadata_path)) + except Exception as e: + raise ValueError(f'invalid default metadata: {self.default_metadata_path}') from e else: - self.default_metadata_path = pkg_path( - 'modules', '{0}{1}'.format(self.module, MD_SUFFIX)) + self.default_metadata_path = pkg_path('modules', f'{self.module}{MD_SUFFIX}') if not self.metadata_fname: self.metadata_fname = self.default_metadata_path.name @property - def metadata_path(self): + def metadata_path(self) -> pathlib.Path: # pylint: disable=C0116 return (self.dir / self.metadata_fname) if 
self.dir else pathlib.Path(self.metadata_fname) def make_clean(self): + """Clean out the cldf directory (typically preparing a new run of `makecldf`).""" self.dir.mkdir(exist_ok=True) for p in self.dir.iterdir(): if p.is_file() and p.name not in ['.gitattributes', 'README.md']: @@ -211,17 +226,20 @@ def make_clean(self): fp.write('*.csv text eol=crlf') def copy_metadata(self): + """Copy the default metadata to the location specified in spec.""" shutil.copy(str(self.default_metadata_path), str(self.metadata_path)) - def get_dataset(self): - # Initialize a CLDF Dataset: + def get_dataset(self) -> pycldf.Dataset: + """Initialized CLDF Dataset""" return self.cls.from_metadata(self.metadata_path) - def get_writer(self, args=None, dataset=None, clean=True): + def get_writer(self, args=None, dataset=None, clean=True) -> CLDFWriter: + """An initialized CLDFWriter.""" return self.writer_cls(cldf_spec=self, args=args, dataset=dataset, clean=clean) @property - def cls(self): - for m in get_modules(): - if m.id == self.module: - return m.cls + def cls(self) -> type: + """A suitable Dataset subclass to represent the module.""" + res = get_module_impl(Dataset, self.module) + assert res, self.module + return res diff --git a/src/cldfbench/cli_util.py b/src/cldfbench/cli_util.py index d472c3f..9035856 100644 --- a/src/cldfbench/cli_util.py +++ b/src/cldfbench/cli_util.py @@ -1,17 +1,24 @@ +""" +Utilities used in cldfbench commands. 
+""" import json -import typing +import logging +import pathlib +from typing import Union, Any, Optional from time import time +import functools import argparse from clldutils.clilib import ParserError -import termcolor - import pycldf import cldfbench from cldfbench import ENTRY_POINT from cldfbench import get_dataset as _get from cldfbench import get_datasets as _gets +from cldfbench.catalogs import Catalog +from cldfbench.metadata import get_creators_and_contributors +from .util import colored __all__ = ['DatasetNotFoundException', 'add_entry_point', 'add_dataset_spec', 'add_catalog_spec', @@ -19,13 +26,15 @@ 'with_dataset', 'with_datasets'] IGNORE_MISSING = '-' +red = functools.partial(colored, 'red') class DatasetNotFoundException(Exception): - pass + """Custom exception which can be used by dataset locators.""" def add_entry_point(parser: argparse.ArgumentParser, ep: str = ENTRY_POINT): + """Add option to specify an entry point group.""" parser.add_argument( '--entry-point', help='Name of entry_points to identify datasets', @@ -70,11 +79,10 @@ def get_dataset(args: argparse.Namespace) -> cldfbench.Dataset: ds = _get(args.dataset, ep=args.entry_point) if ds: return ds - raise ParserError(termcolor.colored( - '\nInvalid dataset spec: <{0}> {1}\n'.format(args.entry_point, args.dataset), "red")) + raise ParserError(red(f'\nInvalid dataset spec: <{args.entry_point}> {args.dataset}\n')) -def get_datasets(args: argparse.Namespace) -> typing.List[cldfbench.Dataset]: +def get_datasets(args: argparse.Namespace) -> list[cldfbench.Dataset]: """ Get the `cldfbench.Dataset` s specified by `args`. 
@@ -85,8 +93,7 @@ def get_datasets(args: argparse.Namespace) -> typing.List[cldfbench.Dataset]: res = _gets(args.dataset, ep=args.entry_point, glob=args.glob) if res: return res - raise ParserError(termcolor.colored( - '\nInvalid dataset spec: <{0}> {1}\n'.format(args.entry_point, args.dataset), "red")) + raise ParserError(red(f'\nInvalid dataset spec: <{args.entry_point}> {args.dataset}\n')) def get_cldf_dataset(args: argparse.Namespace, cldf_spec=None) -> pycldf.Dataset: @@ -129,17 +136,17 @@ def add_catalog_spec( parser.add_argument( '--' + name, metavar=name.upper(), - help='Path to repository clone of {0} data'.format(name.capitalize()), + help=f'Path to repository clone of {name.capitalize()} data', default=default) if with_version: parser.add_argument( - '--{0}-version'.format(name), - help='Version of {0} data to checkout'.format(name.capitalize()), + f'--{name}-version', + help=f'Version of {name.capitalize()} data to checkout', default=None) -def with_dataset(args: argparse.Namespace, func: typing.Union[callable, str], dataset=None) \ - -> typing.Any: +def with_dataset(args: argparse.Namespace, func: Union[callable, str], dataset=None) \ + -> Any: """ Run a callable, passing a dataset and `args` as arguments, returning it's result. @@ -155,11 +162,11 @@ def with_dataset(args: argparse.Namespace, func: typing.Union[callable, str], da if isinstance(func, str): func_ = getattr(dataset, '_cmd_' + func, getattr(dataset, 'cmd_' + func, None)) if not func_: - raise ParserError('Dataset {0} has no {1} command'.format(dataset.id, func)) + raise ParserError(f'Dataset {dataset.id} has no {func} command') func, arg = func_, [] - args.log.info('running {0} on {1} ...'.format(getattr(func, '__name__', func), dataset.id)) + args.log.info('running %s on %s ...', getattr(func, '__name__', func), dataset.id) res = func(*arg, args) - args.log.info('... done %s [%.1f secs]' % (dataset.id, time() - s)) + args.log.info('... 
done %s [%.1f secs]', dataset.id, time() - s) return res @@ -173,3 +180,33 @@ def with_datasets(args, func): for ds in get_datasets(args): res.append(with_dataset(args, func, dataset=ds)) return res + + +def instantiate_catalog( + cat: type, + path: Union[str, pathlib.Path], + log: logging.Logger, +) -> Optional[Catalog]: + """Try to instantiate a catalog.""" + try: + return cat(path) + except ValueError as e: # pragma: no cover + log.warning(str(e)) + return None + + +def set_creators_and_contributors(ds, md: dict[str, Any]): + """Sets the creators and contributors keys in Zenodo metadata.""" + contribs = ds.dir / 'CONTRIBUTORS.md' + if contribs.exists(): + creators, contributors = get_creators_and_contributors( + contribs.read_text(encoding='utf8'), strict=False) + for key, items in [('creators', creators), ('contributors', contributors)]: + if items: + md[key] = [_contrib(p) for p in items] + + +def _contrib(d): + return { + k: v for k, v in d.items() + if k in {'name', 'affiliation', 'orcid', 'type'} and (v or k != 'orcid')} diff --git a/src/cldfbench/commands/catconfig.py b/src/cldfbench/commands/catconfig.py index cd981f3..6a267a4 100644 --- a/src/cldfbench/commands/catconfig.py +++ b/src/cldfbench/commands/catconfig.py @@ -7,11 +7,11 @@ from clldutils.clilib import confirm from cldfcatalog import Config -from cldfbench.cli_util import add_catalog_spec +from cldfbench.cli_util import add_catalog_spec, instantiate_catalog from cldfbench.catalogs import BUILTIN_CATALOGS -def register(parser): +def register(parser): # pylint: disable=C0116 for cat in BUILTIN_CATALOGS: add_catalog_spec(parser, cat.cli_name(), with_version=False) parser.add_argument( @@ -23,27 +23,27 @@ def register(parser): parser.set_defaults(no_catalogs=True) -def run(args): +def run(args): # pylint: disable=C0116 with Config.from_file() as cfg: for cat in BUILTIN_CATALOGS: val = getattr(args, cat.cli_name()) if not val: if cat.default_location().exists(): # pragma: no cover val = 
cat(cat.default_location()).dir - args.log.info('Clone of {0} exists at {1} - skipping'.format( - cat.__github__, cat.default_location())) + args.log.info( + 'Clone of %s exists at %s - skipping', + cat.__github__, + cat.default_location()) elif args.quiet or confirm( - 'clone {0}?'.format(cat.__github__), default=False): # pragma: no cover - url = 'https://github.com/{0}.git'.format(cat.__github__) - args.log.info('Cloning {0} into {1} ...'.format(url, cat.default_location())) + f'clone {cat.__github__}?', default=False): # pragma: no cover + url = f'https://github.com/{cat.__github__}.git' + args.log.info('Cloning %s into %s ...', url, cat.default_location()) val = cat.clone(url).dir args.log.info('... done') else: - try: - cat(val) - except ValueError as e: # pragma: no cover - args.log.warning(str(e)) + if not instantiate_catalog(cat, val, args.log): + continue # pragma: no cover if val: cfg.add_clone(cat.cli_name(), val) - args.log.info('Config written to {0}'.format(cfg.fname())) + args.log.info('Config written to %s', cfg.fname()) diff --git a/src/cldfbench/commands/catinfo.py b/src/cldfbench/commands/catinfo.py index 4a1b64c..64e6c6b 100644 --- a/src/cldfbench/commands/catinfo.py +++ b/src/cldfbench/commands/catinfo.py @@ -1,16 +1,20 @@ """ Display information about catalogs in the system """ -import termcolor +import functools from cldfcatalog import Config -from cldfbench.cli_util import add_catalog_spec +from cldfbench.cli_util import add_catalog_spec, instantiate_catalog from cldfbench.catalogs import BUILTIN_CATALOGS -from cldfbench.util import iter_aligned +from cldfbench.util import iter_aligned, colored -def register(parser): +bold = functools.partial(colored, 'black', attrs=['bold']) +bold_underlined = functools.partial(colored, 'black', attrs=['bold', 'underline']) + + +def register(parser): # pylint: disable=C0116 for cat in BUILTIN_CATALOGS: add_catalog_spec(parser, cat.cli_name(), with_version=False) parser.add_argument( @@ -21,19 +25,16 @@ def 
register(parser): parser.set_defaults(no_catalogs=True) -def print_kv(k, v=''): - print('{0} {1}'.format(termcolor.colored('{0}:'.format(k), attrs=['bold']), v)) - +def run(args): # pylint: disable=C0116 + def print_kv(k: str, v: str = ''): + print(f'{bold(str(k))}\t{v}') -def run(args): cfg = Config.from_file() for cat in BUILTIN_CATALOGS: name = cat.cli_name() print() - print(termcolor.colored( - '{0} - https://github.com/{1}'.format(name, cat.__github__), - attrs=['bold', 'underline'])) + print(bold_underlined(f'{name} - https://github.com/{cat.__github__}')) print() path, from_cfg = getattr(args, name), False @@ -44,19 +45,17 @@ def run(args): args.log.warning(str(e)) continue - try: - cat = cat(path) - except ValueError as e: # pragma: no cover - args.log.warning(str(e)) + catinst = instantiate_catalog(cat, path, args.log) + if not catinst: continue - print_kv('local clone', cat.dir.resolve()) + print_kv('local clone', str(catinst.dir.resolve())) if from_cfg: - print_kv('config at', cfg.fname()) + print_kv('config at', str(cfg.fname())) print_kv('versions') - for i, version in enumerate(iter_aligned(cat.iter_versions(), prefix=' ')): - if i < args.max_versions: - print(version) + versions = [v for i, v in enumerate(catinst.iter_versions()) if i < args.max_versions] + for version in iter_aligned(versions, prefix=' ', minspace=4): + print(version) if cat.__api__: - print_kv('API', '{0.__name__} {0.__version__}'.format(cat.__api_pkg__)) + print_kv('API', f'{cat.__api_pkg__.__name__} {cat.__api_pkg__.__version__}') print() diff --git a/src/cldfbench/commands/catupdate.py b/src/cldfbench/commands/catupdate.py index 779fb3f..cb66f8d 100644 --- a/src/cldfbench/commands/catupdate.py +++ b/src/cldfbench/commands/catupdate.py @@ -6,17 +6,17 @@ """ from cldfcatalog import Config -from cldfbench.cli_util import add_catalog_spec +from cldfbench.cli_util import add_catalog_spec, instantiate_catalog from cldfbench.catalogs import BUILTIN_CATALOGS -def register(parser): +def 
register(parser): # pylint: disable=C0116 for cat in BUILTIN_CATALOGS: add_catalog_spec(parser, cat.cli_name(), with_version=False) parser.set_defaults(no_catalogs=True) -def run(args): +def run(args): # pylint: disable=C0116 cfg = Config.from_file() for cat in BUILTIN_CATALOGS: name = cat.cli_name() @@ -29,10 +29,9 @@ def run(args): continue if path: - try: - cat = cat(path) - except ValueError as e: # pragma: no cover - args.log.warning(str(e)) - continue - for fetch_info in cat.update(): # pragma: no cover - args.log.info('{0}: fetch {1.ref} {1.note}'.format(name, fetch_info)) + catinst = instantiate_catalog(cat, path, args.log) + if not catinst: + continue # pragma: no cover + + for fetch_info in catinst.update(): # pragma: no cover + args.log.info('%s: fetch %s %s', name, fetch_info.ref, fetch_info.note) diff --git a/src/cldfbench/commands/check.py b/src/cldfbench/commands/check.py index 7eb5fd8..a44161c 100644 --- a/src/cldfbench/commands/check.py +++ b/src/cldfbench/commands/check.py @@ -3,44 +3,45 @@ Returns 1 on validation error, else 2 if there are warnings or 0. 
""" -import attr +import argparse +import dataclasses + import pytest +from cldfbench import Dataset from cldfbench.cli_util import add_dataset_spec, with_datasets -def register(parser): +def register(parser): # pylint: disable=C0116 add_dataset_spec(parser, multiple=True) parser.add_argument('--with-tests', action='store_true', default=False) parser.add_argument('--with-validation', action='store_true', default=False) -def run(args): +def run(args): # pylint: disable=C0116 res = with_datasets(args, check) return 1 if 1 in res else (2 if 2 in res else 0) -def check(ds, args): +def check(ds: Dataset, args: argparse.Namespace) -> int: + """Check one dataset.""" success, warnings = True, [] if args.with_tests: # pragma: no cover testfile = ds.dir / "test.py" if testfile.is_file(): args.log.info("Running tests...") - pytest.main([ - '--cldf-metadata=%s' % ds.default_cldf_spec.metadata_path, - testfile - ]) + pytest.main([f'--cldf-metadata={ds.default_cldf_spec.metadata_path}', testfile]) else: args.log.warning("No tests found") if args.with_validation: args.log.info("Validating CLDF...") - for key, cldf_spec in ds.cldf_specs_dict.items(): + for cldf_spec in ds.cldf_specs_dict.values(): cldf = cldf_spec.get_dataset() success = success and cldf.validate(log=args.log) - for field in attr.fields(ds.metadata.__class__): + for field in dataclasses.fields(ds.metadata.__class__): if field.metadata.get('required', False) and not getattr(ds.metadata, field.name): - args.log.warning('Empty field "{0}" in metadata'.format(field.name)) + args.log.warning('Empty field "%s" in metadata', field.name) warnings.append(field.name) return (2 if warnings else 0) if success else 1 diff --git a/src/cldfbench/commands/ci.py b/src/cldfbench/commands/ci.py index 8868d6a..e57b780 100644 --- a/src/cldfbench/commands/ci.py +++ b/src/cldfbench/commands/ci.py @@ -9,12 +9,12 @@ from cldfbench.ci import setup, build_status_badge -def register(parser): +def register(parser): # pylint: disable=C0116 
add_dataset_spec(parser, multiple=True) parser.add_argument('--test', help=argparse.SUPPRESS, action='store_true', default=False) -def run(args): +def run(args): # pylint: disable=C0116 dataset = get_dataset(args) if setup(dataset, force=args.test): if not args.test: # pragma: no cover diff --git a/src/cldfbench/commands/cldfreadme.py b/src/cldfbench/commands/cldfreadme.py index db62140..afdc914 100644 --- a/src/cldfbench/commands/cldfreadme.py +++ b/src/cldfbench/commands/cldfreadme.py @@ -2,37 +2,41 @@ Write markdown versions of the CLDF datasets to cldf/README.md """ from clldutils.misc import slug -from pycldf.util import metadata2markdown +from pycldf.markdown import metadata2markdown from cldfbench.cli_util import add_dataset_spec, get_dataset -def register(parser): +def register(parser): # pylint: disable=C0116 add_dataset_spec(parser) -def run(args): +def run(args): # pylint: disable=C0116 ds = get_dataset(args) - md = [] + ds.cldf_dir.joinpath('README.md').write_text('\n'.join(_iter_markdown(ds)), encoding='utf8') + + +def _iter_markdown(ds): cldfs = list(ds.cldf_specs_dict.values()) if len(cldfs) > 1: + # We write a short table-of-contents. 
def label(spec): res = spec.module dataset = spec.get_dataset() if dataset.properties.get('dc:title'): - res += ': {}'.format(dataset.properties['dc:title']) + res += f": {dataset.properties['dc:title']}" return res - md.append("# CLDF datasets\n") - md.extend([ - '- [{}](#ds-{})'.format(label(cldf), slug(cldf.metadata_fname)) for cldf in cldfs]) - md.append('') + + yield "# CLDF datasets\n" + for cldf in cldfs: + if cldf.metadata_path.exists(): + yield f'- [{label(cldf)}](#ds-{slug(cldf.metadata_fname)})' + yield '' + for cldf in cldfs: if cldf.metadata_path.exists(): kw = {} if cldf.metadata_path.parent != ds.cldf_dir: # pragma: no cover - kw['rel_path'] = '{}/'.format(cldf.metadata_path.parent.relative_to(ds.cldf_dir)) - md.append(' \n'.format(slug(cldf.metadata_fname))) + kw['rel_path'] = f'{cldf.metadata_path.parent.relative_to(ds.cldf_dir)}/' + yield f' \n' res = metadata2markdown(cldf.get_dataset(), cldf.metadata_path, **kw) - md.append(res.replace('# ', '# {} '.format(cldf.module), 1)) - md.append('\n') - - ds.cldf_dir.joinpath('README.md').write_text('\n'.join(md), encoding='utf8') + yield res.replace('# ', f'# {cldf.module} ', 1) + '\n' diff --git a/src/cldfbench/commands/diff.py b/src/cldfbench/commands/diff.py index c3eb48b..8ba69b1 100644 --- a/src/cldfbench/commands/diff.py +++ b/src/cldfbench/commands/diff.py @@ -16,12 +16,12 @@ from cldfbench.cli_util import with_dataset, add_dataset_spec -def register(parser): +def register(parser): # pylint: disable=C0116 add_dataset_spec(parser) parser.add_argument('--verbose', action='store_true', default=False) -def run(args): +def run(args): # pylint: disable=C0116 res = with_dataset(args, diff) if res == 2: args.log.info('----------------------------------------------------------------------') @@ -30,48 +30,50 @@ def run(args): return res -def print_diff(diff, d): # pragma: no cover - a = diff.a_blob.data_stream.read().decode('utf-8').splitlines() - b = 
d.joinpath(diff.a_path).read_text(encoding='utf8').splitlines() - print('\n'.join(difflib.unified_diff(a, b, fromfile=diff.a_path, lineterm='', n=1))) +def print_diff(diff_, d): # pragma: no cover + """Print file diff.""" + a = diff_.a_blob.data_stream.read().decode('utf-8').splitlines() + b = d.joinpath(diff_.a_path).read_text(encoding='utf8').splitlines() + print('\n'.join(difflib.unified_diff(a, b, fromfile=diff_.a_path, lineterm='', n=1))) -def diff(ds, args): +def diff(ds, args) -> int: + """Inspect repository differences.""" try: repo = git.Repo(str(ds.dir)) except git.InvalidGitRepositoryError: # pragma: no cover - args.log.warning('{} is not a git repository. Cannot diff'.format(ds.dir)) - return + args.log.warning('%s is not a git repository. Cannot diff', ds.dir) + return 0 md_changed = None print(repo.git.status('cldf')) - diff = repo.index.diff(None) + diff_ = repo.index.diff(None) if args.verbose: # pragma: no cover - for diff_item in diff.iter_change_type('M'): + for diff_item in diff_.iter_change_type('M'): print_diff(diff_item, ds.dir) - for item in diff: + for item in diff_: if item.a_path.startswith('cldf/'): p = pathlib.Path(item.a_path) if (not p.name.startswith('.')) and p.name != 'requirements.txt': if p.name.endswith('metadata.json'): md_changed = item.a_path else: # pragma: no cover - args.log.warning('Data file {} changed!'.format(p)) + args.log.warning('Data file %s changed!', p) return 2 def log_diff(dold, dnew, thing='metadata'): - diff = False + local_diff = False for k, v in dnew.items(): if k not in dold: - args.log.warning('New {}: {}: {}'.format(thing, k, v)) - diff = True + args.log.warning('New %s: %s: %s', thing, k, v) + local_diff = True elif v != dold[k]: - args.log.warning('Changed {}: {}: {} -> {}'.format(thing, k, dold[k], v)) - diff = True - return diff + args.log.warning('Changed %s: %s: %s -> %s', thing, k, dold[k], v) + local_diff = True + return local_diff def derived_to_dict(d): return { @@ -81,13 +83,14 @@ def 
derived_to_dict(d): if md_changed: exclude = {'tables', 'prov:wasGeneratedBy', 'prov:wasDerivedFrom'} - old = json.loads(repo.git.show('HEAD:{0}'.format(md_changed))) + old = json.loads(repo.git.show(f'HEAD:{md_changed}')) new = jsonlib.load(ds.dir / md_changed) - diff = any([ + diff_ = any([ log_diff(derived_to_dict(old), derived_to_dict(new), thing='repository version'), log_diff( {k: v for k, v in old.items() if k not in exclude}, {k: v for k, v in new.items() if k not in exclude}, )]) - return 2 if diff else 0 + return 2 if diff_ else 0 + return 0 # pragma: no cover diff --git a/src/cldfbench/commands/download.py b/src/cldfbench/commands/download.py index 71f6659..0c89ce7 100644 --- a/src/cldfbench/commands/download.py +++ b/src/cldfbench/commands/download.py @@ -4,9 +4,9 @@ from cldfbench.cli_util import with_dataset, add_dataset_spec -def register(parser): +def register(parser): # pylint: disable=C0116 add_dataset_spec(parser) -def run(args): +def run(args): # pylint: disable=C0116 with_dataset(args, 'download') diff --git a/src/cldfbench/commands/geojson.py b/src/cldfbench/commands/geojson.py index 00b490d..dc35226 100644 --- a/src/cldfbench/commands/geojson.py +++ b/src/cldfbench/commands/geojson.py @@ -8,11 +8,11 @@ from cldfbench.cli_util import add_dataset_spec, get_dataset -def register(parser): +def register(parser): # pylint: disable=C0116 add_dataset_spec(parser) -def run(args): +def run(args): # pylint: disable=C0116 ds = get_dataset(args) languages_with_coordinates = {} for spec in ds.cldf_specs_dict.values(): @@ -30,7 +30,7 @@ def run(args): float(language.pop(lon.name)), language) geojson = {"type": "FeatureCollection", "features": []} - for id, (lat, lon, props) in languages_with_coordinates.items(): + for lat, lon, props in languages_with_coordinates.values(): geojson['features'].append({ "type": "Feature", "geometry": {"type": "Point", "coordinates": [lon, lat]}, diff --git a/src/cldfbench/commands/info.py b/src/cldfbench/commands/info.py 
index de065f7..bf4a038 100644 --- a/src/cldfbench/commands/info.py +++ b/src/cldfbench/commands/info.py @@ -4,7 +4,7 @@ from cldfbench.cli_util import with_datasets, add_dataset_spec -def register(parser): +def register(parser): # pylint: disable=C0116 add_dataset_spec(parser, multiple=True) parser.add_argument( '--cldf', @@ -13,7 +13,7 @@ def register(parser): default=False) -def run(args): +def run(args): # pylint: disable=C0116 def _print(ds, args): if args.cldf: for cldf in ds.cldf_specs_dict.values(): diff --git a/src/cldfbench/commands/ls.py b/src/cldfbench/commands/ls.py index f0fef0e..4611bb6 100644 --- a/src/cldfbench/commands/ls.py +++ b/src/cldfbench/commands/ls.py @@ -1,15 +1,17 @@ """ List installed datasets. """ +import io import inspect -from clldutils.markup import Table +from clldutils.clilib import Table, add_format from cldfbench.cli_util import add_dataset_spec, get_datasets -def register(parser): +def register(parser): # pylint: disable=C0116 add_dataset_spec(parser, multiple=True) + add_format(parser, 'simple') parser.add_argument( '--modules', help="List only python modules, suitable as DATASET arguments for other commands.", @@ -17,12 +19,11 @@ def register(parser): default=False) -def run(args): - t = Table('id', 'dir', 'title') - for ds in get_datasets(args): - if args.modules: - print(inspect.getfile(ds.__class__)) - continue - t.append((ds.id, ds.dir, getattr(ds.metadata, 'title', ''))) - if not args.modules: - print(t.render(tablefmt='simple')) +def run(args): # pylint: disable=C0116 + kw = {'file': io.StringIO()} if args.modules else {} + with Table(args, 'id', 'dir', 'title', **kw) as t: + for ds in get_datasets(args): + if args.modules: + print(inspect.getfile(ds.__class__)) + continue + t.append((ds.id, ds.dir, getattr(ds.metadata, 'title', ''))) diff --git a/src/cldfbench/commands/makecldf.py b/src/cldfbench/commands/makecldf.py index fe85404..19921b1 100644 --- a/src/cldfbench/commands/makecldf.py +++ 
b/src/cldfbench/commands/makecldf.py @@ -5,7 +5,7 @@ from cldfbench.commands import cldfreadme, zenodo -def register(parser): +def register(parser): # pylint: disable=C0116 parser.add_argument( '--with-cldfreadme', help="Run 'cldfbench cldfreadme' after successfull CLDF creation", @@ -28,7 +28,7 @@ def register(parser): add_catalog_spec(parser, 'glottolog') -def run(args): +def run(args): # pylint: disable=C0116 with_dataset(args, 'makecldf') if getattr(args, 'with_cldfreadme', None): cldfreadme.run(args) diff --git a/src/cldfbench/commands/media.py b/src/cldfbench/commands/media.py index e267d8f..42e9687 100644 --- a/src/cldfbench/commands/media.py +++ b/src/cldfbench/commands/media.py @@ -23,34 +23,39 @@ last slash) - it is necessary to log in via correct zenodo user and to have the corresponding access token in your environment - - it is only necessary to fill in required fields with provisional data - see step (6) - (6) call cldfbench media --upload-zenodo deposit_ID - to update the metadata of the previous uploaded reelease """ +import functools import os +import re import html +import shutil import time import pathlib import zipfile +import itertools import threading import collections +from collections.abc import Generator +import dataclasses +from typing import Optional, Any from datetime import datetime from urllib.request import urlretrieve +from urllib.parse import urlparse -from cldfbench.cli_util import add_dataset_spec, get_dataset -from cldfbench.datadir import DataDir -from cldfbench.metadata import get_creators_and_contributors +import csvw +from csvw.datatypes import anyURI +from csvw.dsv import UnicodeWriter from clldutils import jsonlib from clldutils.clilib import PathType, ParserError from clldutils.misc import format_size, nfilter from clldutils.path import md5, git_describe -from csvw.dsv import UnicodeWriter -from csvw.datatypes import anyURI -from zenodoclient.api import Zenodo, API_URL, API_URL_SANDBOX, ACCESS_TOKEN -from 
zenodoclient.models import PUBLISHED +from pycldf import Dataset as CLDFDataset import tqdm +from cldfbench.cli_util import add_dataset_spec, get_dataset, set_creators_and_contributors +from cldfbench.datadir import DataDir +ZENODO_DOI_PATTERN = re.compile(r'10\.5281/zenodo\.(?P[0-9]+)$') MEDIA = 'media' ZENODO_FILE_NAME = 'zenodo.json' COMMUNITIES = ['lexibank'] @@ -73,7 +78,7 @@ DESCRIPTION = "{title}{formats}{supplement_to} {descr} {online}" -def register(parser): +def register(parser): # pylint: disable=C0116 add_dataset_spec(parser, multiple=True) parser.add_argument( '-m', '--mimetype', @@ -104,21 +109,6 @@ def register(parser): help='DOI to which this release refers (format 10.5281/zenodo.1234567). It is required ' 'for --create-release.', ) - parser.add_argument( - '--create-release', - help='Switch to create ID_{0} directory containing {0}.zip, README.md and {1} for ' - 'releasing on zenodo. Cannot be used with --update-zenodo.'.format( - MEDIA, ZENODO_FILE_NAME), # noqa: E122 - action='store_true', - default=False, - ) - parser.add_argument( - '--update-zenodo', - help="Deposit ID (number after DOI's last slash) to update metadata by using ID_{0}/{1}. " - "Cannot be used with --create-release.".format( - MEDIA, ZENODO_FILE_NAME), # noqa: E122 - default=None, - ) parser.add_argument( '--debug', help='Switch to work with max. 
500 media files and with sandbox.zenodo for testing ONLY', @@ -127,10 +117,7 @@ def register(parser): ) -def _create_download_thread(url, target): - global download_threads - download_threads = [] - +def _create_download_thread(url, target, download_threads): def _download(url, target): assert not target.exists() urlretrieve(url, str(target)) @@ -143,212 +130,257 @@ def _download(url, target): download_threads.append(download_thread) -def run(args): +@dataclasses.dataclass(frozen=True) +class Row: + """A row in a media table with info about the location of the associated file.""" + id: str + mimetype: str + data: dict[str, Any] + url: Optional[str] = None + local_path: Optional[pathlib.Path] = None + + @property + def ext(self) -> str: + """Filename extension gleaned from the URL""" + return urlparse(self.data['URL']).path.split('.')[-1].lower() + + def download(self, target: pathlib.Path, download_threads: list): + """Retrieve the associated media file either by copy or by download.""" + if self.local_path: + shutil.copy(self.local_path, target) + else: + _create_download_thread(self.url, target, download_threads) + + +@dataclasses.dataclass +class MediaTableSpec: + """A table together with column access info.""" + table: csvw.Table + id_col: str + media_type_col: str + _ds: CLDFDataset + + @classmethod + def from_dataset(cls, ds_cldf) -> 'MediaTableSpec': + """ + A dataset may contain a regular MediaTable component, or just a table with url media.csv. 
+ """ + media_table = ds_cldf.get('MediaTable', ds_cldf.get('media.csv', None)) + if media_table is None: + raise ValueError() # pragma: no cover + + col_names = {'Media_Type': 'mimetype', 'id': 'ID'} + for prop in col_names: + col = ds_cldf.get(('MediaTable', prop)) + if col: + col_names[prop] = col.name + return cls(media_table, col_names['id'], col_names['Media_Type'], _ds=ds_cldf) + + def __iter__(self) -> Generator[Row, None, None]: + for row in self.table: + row['URL'] = anyURI.to_string(self._ds.get_row_url(self.table, row)) + url, local_src = row['URL'], None + if not row['URL'].startswith('http'): + url = None + local_src = self._ds.directory / row['URL'] + if not local_src.exists(): + continue + yield Row(row[self.id_col], row[self.media_type_col], row, url, local_src) + + +def _valid_input(args) -> bool: + if args.parent_doi and not ZENODO_DOI_PATTERN.match(args.parent_doi): + args.log.error('Invalid passed DOI') + return False + if not args.list: + if not args.parent_doi: + args.log.error('The corresponding DOI is required (via --parent-doi).') + return False + return True + + +@dataclasses.dataclass(frozen=True) +class File: + """Metadata about a media file.""" + path: pathlib.Path + mimetype: Optional[str] = None + size: Optional[int] = None + + @functools.cached_property + def ext(self) -> str: + """Filename extension, aka suffix without the dot.""" + return self.path.suffix.replace('.', '') + + @property + def key(self) -> str: + """Filetype formatted as human-readable string.""" + return f"{self.mimetype} ({self.ext})" if self.mimetype else None + + +@dataclasses.dataclass +class MediaDir: + """A container for media file metadata.""" + path: pathlib.Path + files: list[File] = dataclasses.field(default_factory=list) + rows: list[dict[str, Any]] = dataclasses.field(default_factory=list) + + def __post_init__(self): + self.path.mkdir(exist_ok=True) + + @property + def index(self) -> pathlib.Path: + """The location of the file index.""" + return 
self.path / INDEX_CSV + + def write_index(self): + """Write the file metadata to a csv file.""" + with UnicodeWriter(self.index) as w: + for i, row in enumerate(self.rows): + if i == 0: + w.writerow(row.keys()) + w.writerow(row.values()) + + def add(self, row) -> pathlib.Path: + """Add a file and return its target path in media_dir.""" + size = row.data.get('size') + d = self.path / row.id[:2] + f = File(d / '.'.join([row.id, row.ext]), row.mimetype, int(size) if size else None) + row.data['local_path'] = pathlib.Path(d.name) / f.path.name + self.rows.append(row.data) + self.files.append(f) + return f.path + + @functools.cached_property + def extensions(self) -> set[str]: + """The set of filename extensions used for the media files in the dataset.""" + return {f.ext for f in self.files} + + def print_stats(self): + """Print summary stats about the media files in the dataset.""" + size_by_mimetype = collections.Counter() + count_by_mimetype = collections.Counter() + for f in self.files: + size_by_mimetype[f.key] += f.size or 0 + count_by_mimetype.update([f.key]) + + for k, v in size_by_mimetype.most_common(): + print('\t'.join([k.ljust(20), str(count_by_mimetype[k]), format_size(v)])) + + +def run(args): # pylint: disable=C0116 ds = get_dataset(args) ds_cldf = ds.cldf_reader() - release_dir = args.out / '{0}_{1}'.format(ds.id, MEDIA) + download_threads = [] - media_table = ds_cldf.get('MediaTable', ds_cldf.get('media.csv', None)) + if not _valid_input(args): + raise ParserError - if media_table is None: # pragma: no cover + try: + media_table = MediaTableSpec.from_dataset(ds_cldf) + except ValueError as e: # pragma: no cover args.log.error('Dataset has no MediaTable or media.csv') - raise ParserError - if args.parent_doi and not Zenodo.DOI_PATTERN.match(args.parent_doi): - args.log.error('Invalid passed DOI') - raise ParserError - if args.update_zenodo: # pragma: no cover - if not release_dir.exists(): - args.log.error('"{0}" not found -- run --create-release 
first?'.format( - release_dir)) - raise ParserError - if not (release_dir / ZENODO_FILE_NAME).exists(): - args.log.error('"{0}" not found -- run --create-release first?'.format( - release_dir / ZENODO_FILE_NAME)) - raise ParserError - if args.create_release: - args.log.error('You cannot create the release and update zenodo at the same time.') - raise ParserError - if args.create_release: - if not args.parent_doi: - args.log.error('The corresponding DOI is required (via --parent-doi).') - raise ParserError - - mime_types = None - if args.mimetype: - mime_types = [m.strip() for m in nfilter(args.mimetype.split(','))] - - size = collections.Counter() - number = collections.Counter() - media_dir = args.out / MEDIA - media = [] - used_file_extensions = set() - - if not args.update_zenodo: - media_dir.mkdir(exist_ok=True) - with UnicodeWriter(media_dir / INDEX_CSV if not args.list else None) as w: - for i, row in enumerate(tqdm.tqdm( - [r for r in media_table], desc='Getting {0} items'.format(MEDIA))): - row['URL'] = url = anyURI.to_string(ds_cldf.get_row_url(media_table, row)) - # - # FIXME: Don't assume URLs without query! 
- # - f_ext = url.split('.')[-1].lower() - if args.debug and i > 500: - break # pragma: no cover - if (mime_types is None) or f_ext in mime_types\ - or any(row['mimetype'].startswith(x) for x in mime_types): - if args.list: - m = '{0} ({1})'.format(row['mimetype'], f_ext) - size[m] += int(row['size']) - number.update([m]) - else: - used_file_extensions.add(f_ext.lower()) - d = media_dir / row['ID'][:2] - d.mkdir(exist_ok=True) - fn = '.'.join([row['ID'], f_ext]) - target = d / fn - row['local_path'] = pathlib.Path(row['ID'][:2]) / fn - if i == 0: - w.writerow(row) - w.writerow(row.values()) - media.append(target) - if (not target.exists()) or md5(target) != row['ID']: - _create_download_thread(url, target) + raise ParserError from e + + mime_types = [m.strip() for m in nfilter(args.mimetype.split(','))] if args.mimetype else [] + media_dir = MediaDir(args.out / MEDIA) + + for i, row in enumerate(tqdm.tqdm(media_table, desc='Getting media items')): + if args.debug and i > 500: + break # pragma: no cover + + if any((not mime_types, + row.ext in mime_types, + any(row.mimetype.startswith(x) for x in mime_types))): + target = media_dir.add(row) + if not args.list: + # We do not only list stats about the media files, but retrieve them. 
+ target.parent.mkdir(exist_ok=True) + if (not target.exists()) or md5(target) != row.id: + row.download(target, download_threads) if args.list: - for k, v in size.most_common(): - print('\t'.join([k.ljust(20), str(number[k]), format_size(v)])) + media_dir.print_stats() return # Waiting for the download threads to finish - if 'download_threads' in globals(): - for t in download_threads: - t.join() - - if args.create_release: - assert media_dir.exists(), 'No folder "{0}" found in {1}'.format(MEDIA, media_dir.resolve()) - release_dir.mkdir(exist_ok=True) - media.append(media_dir / INDEX_CSV) - - try: - zipf = zipfile.ZipFile( - str(release_dir / '{0}.zip'.format(MEDIA)), 'w', zipfile.ZIP_DEFLATED) - fp = args.out - for f in tqdm.tqdm(media, desc='Creating {0}.zip'.format(MEDIA)): - zipf.write(str(f), str(os.path.relpath(str(f), str(fp)))) - zipf.close() - except Exception as e: # pragma: no cover - args.log.error(e) - raise - - def _contrib(d): - return {k: v for k, v in d.items() if k in {'name', 'affiliation', 'orcid', 'type'}} - - version_v = git_describe('.').split('-')[0] - version = version_v.replace('v', '') - git_url = [r for r in ds.repo.repo.remotes if r.name == 'origin'][0].url.replace('.git', '') - with jsonlib.update( - release_dir / ZENODO_FILE_NAME, indent=4, default=collections.OrderedDict()) as md: - contribs = ds.dir / 'CONTRIBUTORS.md' - creators, contributors = get_creators_and_contributors( - contribs.read_text(encoding='utf8') if contribs.exists() else '', strict=False) - if creators: - md['creators'] = [_contrib(p) for p in creators] - if contributors: - md['contributors'] = [_contrib(p) for p in contributors] - communities = [r["identifier"] for r in md.get("communities", [])] + \ - [c.strip() for c in nfilter(args.communities.split(','))] + \ - COMMUNITIES - if communities and not args.debug: - md['communities'] = [ - {"identifier": community_id} for community_id in sorted(set(communities))] - md.update( - { - 'title': '{0} {1} 
Files'.format(ds.metadata.title, MEDIA.title()), - 'access_right': 'open', - 'keywords': sorted(set(md.get('keywords', []) + ['linguistics'])), - 'upload_type': 'dataset', - 'publication_date': datetime.today().strftime('%Y-%m-%d'), - 'version': version, - 'related_identifiers': [ - { - 'scheme': 'url', - 'identifier': '{0}/tree/{1}'.format(git_url, version_v), - 'relation': 'isSupplementTo' - }, - ], - } - ) - if args.parent_doi: - md['related_identifiers'].append({ - 'scheme': 'doi', 'identifier': args.parent_doi, 'relation': 'isPartOf'}) - supplement_to = " - Supplement to dataset " \ - "{1} ".format( - args.parent_doi, ds.metadata.title) # noqa: E122 - if ds.metadata.url: - md['related_identifiers'].append({ - 'scheme': 'url', - 'identifier': ds.metadata.url, - 'relation': 'isAlternateIdentifier'}) - - formats = ', '.join(sorted(used_file_extensions)) - descr = '

' + ds.metadata.description if ds.metadata.description else '' - online_url, online = '', '' - if ds.metadata.url: - online_url = ds.metadata.url - online = "

Available online at: {0}".format(online_url) - md['description'] = html.escape(DESCRIPTION.format( - url=online_url, - formats=' ({0})'.format(formats) if formats else '', - title=md['title'], - supplement_to=supplement_to, - descr=descr, - online=online)) - - license_md = '' - if ds.metadata.zenodo_license: - md['license'] = {'id': ds.metadata.zenodo_license} - license_md = LICENCE.format(ds.metadata.zenodo_license) - - DataDir(release_dir).write('README.md', README.format( - title=md['title'], - doi='https://doi.org/{0}'.format(args.parent_doi), - ds_title=ds.metadata.title, - license=license_md, - formats=' ({0})'.format(formats) if formats else '', - media=MEDIA, - index=INDEX_CSV)) - - if args.update_zenodo: # pragma: no cover - md = jsonlib.load(release_dir / ZENODO_FILE_NAME) - - if args.debug: - api_url = API_URL_SANDBOX - access_token = os.environ.get('ZENODO_SANDBOX_ACCESS_TOKEN') - else: - api_url = API_URL - access_token = ACCESS_TOKEN - zenodo_url = api_url.replace('api/', '') - - args.log.info('Updating Deposit ID {0} on {1} with:'.format(args.update_zenodo, zenodo_url)) - api = Zenodo(api_url=api_url, access_token=access_token) - try: - rec = api.record_from_id('{0}record/{1}'.format(zenodo_url, args.update_zenodo)) - except Exception as e: - args.log.error('Check connection and credentials for accessing Zenodo.\n{0}'.format(e)) - return - latest_version = rec.links['latest'].split('/')[-1] - if latest_version != args.update_zenodo: - args.log.warn('Passed deposit ID does not refer to latest version {0}!'.format( - latest_version)) - args.log.info(' DOI: ' + rec.metadata.doi) - args.log.info(' Title: ' + rec.metadata.title) - args.log.info(' Version: ' + rec.metadata.version) - args.log.info(' Date: ' + rec.metadata.publication_date) - args.log.info(' Files: ' + ', '.join([f.key for f in rec.files])) - p = input("Proceed? 
[y/N]: ") - if p.lower() == 'y': - dep = api.update_deposit(args.update_zenodo, **md) - if dep.state != PUBLISHED: - api.publish_deposit(dep) - args.log.info('Updated successfully') + for t in download_threads: + t.join() + + media_dir.write_index() + release_dir = args.out / f'{ds.id}_{MEDIA}' + release_dir.mkdir(exist_ok=True) + _zip_media(release_dir, [media_dir.index] + [f.path for f in media_dir.files], args) + _release_metadata(release_dir, ds, args, media_dir.extensions) + + +def _zip_media(release_dir, media, args): + try: + with zipfile.ZipFile(release_dir / f'{MEDIA}.zip', 'w', zipfile.ZIP_DEFLATED) as zf: + for f in tqdm.tqdm(media, desc=f'Creating {MEDIA}.zip'): + zf.write(f, str(os.path.relpath(str(f), str(args.out)))) + except Exception as e: # pragma: no cover + args.log.error(e) + raise + + +def _release_metadata(release_dir, ds, args, used_file_extensions): + version_v = git_describe('.').split('-')[0] + git_url = [r for r in ds.repo.repo.remotes if r.name == 'origin'][0].url.replace('.git', '') + with (jsonlib.update( + release_dir / ZENODO_FILE_NAME, indent=4, default=collections.OrderedDict()) as md): + set_creators_and_contributors(ds, md) + communities = list(itertools.chain( + [r["identifier"] for r in md.get("communities", [])], + [c.strip() for c in nfilter(args.communities.split(','))], + COMMUNITIES)) + if communities and not args.debug: + md['communities'] = [ + {"identifier": community_id} for community_id in sorted(set(communities))] + md.update( + { + 'title': f'{ds.metadata.title} {MEDIA.title()} Files', + 'access_right': 'open', + 'keywords': sorted(set(md.get('keywords', []) + ['linguistics'])), + 'upload_type': 'dataset', + 'publication_date': datetime.today().strftime('%Y-%m-%d'), + 'version': version_v.replace('v', ''), + 'related_identifiers': [], + } + ) + _add_rel_id(md, 'url', f'{git_url}/tree/{version_v}', 'isSupplementTo') + + supplement_to = '' + if args.parent_doi: + _add_rel_id(md, 'doi', args.parent_doi, 'isPartOf') + 
supplement_to = f" - Supplement to dataset " \ + f"{ds.metadata.title} " + if ds.metadata.url: + _add_rel_id(md, 'url', ds.metadata.url, 'isAlternateIdentifier') + + formats = ', '.join(sorted(used_file_extensions)) + md['description'] = html.escape(DESCRIPTION.format( + url=ds.metadata.url or '', + formats=' ({formats})' if formats else '', + title=md['title'], + supplement_to=supplement_to, + descr='

' + ds.metadata.description if ds.metadata.description else '', + online=f"

Available online at: " + f"{ds.metadata.url}" if ds.metadata.url else '')) + + if ds.metadata.zenodo_license: + md['license'] = {'id': ds.metadata.zenodo_license} + + DataDir(release_dir).write('README.md', README.format( + title=md['title'], + doi=f'https://doi.org/{args.parent_doi}', + ds_title=ds.metadata.title, + license=LICENCE.format( + ds.metadata.zenodo_license) if ds.metadata.zenodo_license else '', + formats=f' ({formats})' if formats else '', + media=MEDIA, + index=INDEX_CSV)) + + +def _add_rel_id(md, scheme, identifier, relation): + md['related_identifiers'].append( + {'scheme': scheme, 'identifier': identifier, 'relation': relation}) diff --git a/src/cldfbench/commands/new.py b/src/cldfbench/commands/new.py index 71930b4..6bd37c8 100644 --- a/src/cldfbench/commands/new.py +++ b/src/cldfbench/commands/new.py @@ -4,19 +4,20 @@ import pathlib import collections -from cldfbench import scaffold +from cldfbench.scaffold import iter_scaffolds -_templates = None +_templates = None # pylint: disable=invalid-name def get_template_dict(): - global _templates + """Read available dataset templates.""" + global _templates # pylint: disable=W0603 if _templates is None: - _templates = collections.OrderedDict(scaffold.iter_scaffolds()) + _templates = collections.OrderedDict(iter_scaffolds()) return _templates -def register(parser): +def register(parser): # pylint: disable=C0116 templates = list(get_template_dict().keys()) parser.add_argument( '--template', @@ -30,7 +31,7 @@ def register(parser): default=pathlib.Path('.')) -def run(args): +def run(args): # pylint: disable=C0116 tmpl = get_template_dict()[args.template]() md = tmpl.metadata.elicit() tmpl.render(args.out, md) diff --git a/src/cldfbench/commands/readme.py b/src/cldfbench/commands/readme.py index 089ce45..925d73e 100644 --- a/src/cldfbench/commands/readme.py +++ b/src/cldfbench/commands/readme.py @@ -4,9 +4,9 @@ from cldfbench.cli_util import add_dataset_spec, with_datasets -def register(parser): +def 
register(parser): # pylint: disable=C0116 add_dataset_spec(parser, multiple=True) -def run(args): +def run(args): # pylint: disable=C0116 with_datasets(args, 'readme') diff --git a/src/cldfbench/commands/run.py b/src/cldfbench/commands/run.py index 45b1db7..d43306d 100644 --- a/src/cldfbench/commands/run.py +++ b/src/cldfbench/commands/run.py @@ -6,11 +6,11 @@ from cldfbench.cli_util import with_dataset, add_dataset_spec -def register(parser): +def register(parser): # pylint: disable=C0116 add_dataset_spec(parser) parser.add_argument('cmd', help='command to run on the dataset') parser.add_argument('args', nargs=argparse.REMAINDER) -def run(args): +def run(args): # pylint: disable=C0116 with_dataset(args, args.cmd) diff --git a/src/cldfbench/commands/stub.py b/src/cldfbench/commands/stub.py index 334eb23..904c14b 100644 --- a/src/cldfbench/commands/stub.py +++ b/src/cldfbench/commands/stub.py @@ -4,9 +4,9 @@ from cldfbench.cli_util import add_catalog_spec -def register(parser): +def register(parser): # pylint: disable=C0116 add_catalog_spec(parser, 'concepticon') -def run(args): # pragma: no cover - pass +def run(args): # pragma: no cover # pylint: disable=C0116 + print(args) diff --git a/src/cldfbench/commands/zenodo.py b/src/cldfbench/commands/zenodo.py index fb54082..9a7e96d 100644 --- a/src/cldfbench/commands/zenodo.py +++ b/src/cldfbench/commands/zenodo.py @@ -7,11 +7,10 @@ from clldutils.jsonlib import update from clldutils.misc import nfilter -from cldfbench.cli_util import add_dataset_spec, get_dataset -from cldfbench.metadata import get_creators_and_contributors +from cldfbench.cli_util import add_dataset_spec, get_dataset, set_creators_and_contributors -def register(parser): +def register(parser): # pylint: disable=C0116 add_dataset_spec(parser, multiple=True) parser.add_argument( '--communities', @@ -20,17 +19,11 @@ def register(parser): ) -def run(args): +def run(args): # pylint: disable=C0116 dataset = get_dataset(args) with update(dataset.dir / 
'.zenodo.json', indent=4, default=collections.OrderedDict()) as md: modules = ['cldf:' + spec.module for spec in dataset.cldf_specs_dict.values()] - contribs = dataset.dir / 'CONTRIBUTORS.md' - creators, contributors = get_creators_and_contributors( - contribs.read_text(encoding='utf8') if contribs.exists() else '', strict=False) - if creators: - md['creators'] = [contrib(p) for p in creators] - if contributors: - md["contributors"] = [contrib(p) for p in contributors] + set_creators_and_contributors(dataset, md) communities = [r["identifier"] for r in md.get("communities", [])] + \ [c.strip() for c in nfilter(args.communities.split(','))] if communities: @@ -45,14 +38,8 @@ def run(args): } ) if dataset.metadata.citation: - md['description'] = "

Cite the source of the dataset as:

\n\n" \ - "
\n

{}

\n
".format( - html.escape(dataset.metadata.citation)) + md['description'] = \ + f"

Cite the source of the dataset as:

\n\n" \ + f"
\n

{html.escape(dataset.metadata.citation)}

\n
" if dataset.metadata.zenodo_license: md['license'] = {'id': dataset.metadata.zenodo_license} - - -def contrib(d): - return { - k: v for k, v in d.items() - if k in {'name', 'affiliation', 'orcid', 'type'} and (v or k != 'orcid')} diff --git a/src/cldfbench/datadir.py b/src/cldfbench/datadir.py index ec79926..7caf31d 100644 --- a/src/cldfbench/datadir.py +++ b/src/cldfbench/datadir.py @@ -1,16 +1,20 @@ +""" +Functionality to access structured data in the file system. +""" import gzip import shutil -import typing +import logging +from typing import Optional, Union, Literal import pathlib import zipfile +import functools import itertools import contextlib from xml.etree import ElementTree as et import collections +from collections.abc import Iterable import unicodedata - -import requests -import termcolor +import urllib.request try: from odf.opendocument import load as load_odf @@ -33,20 +37,24 @@ from clldutils import jsonlib from pycldf.sources import Source +from .util import colored -__all__ = ['get_url', 'DataDir'] +__all__ = ['DataDir', 'urlopen'] +HTTP_REQUEST_TIMEOUT = 10 ODF_NS_TABLE = 'urn:oasis:names:tc:opendocument:xmlns:table:1.0' ODF_NS_TEXT = 'urn:oasis:names:tc:opendocument:xmlns:text:1.0' +PathType = Union[str, pathlib.Path] +PathDictType = dict[str, pathlib.Path] +LogType = Optional[logging.Logger] def _real_len(seq, pred=bool): for index in range(len(seq) - 1, -1, -1): if pred(seq[index]): return index + 1 - else: - return 0 + return 0 def _ods_value(cell): @@ -77,8 +85,7 @@ def _ods_cells(row): def _pad_list(li, length): if len(li) >= length: return li - else: - return [e for e in itertools.chain(li, itertools.repeat('', length - len(li)))] + return list(itertools.chain(li, itertools.repeat('', length - len(li)))) def _ods_to_list(table): @@ -102,20 +109,28 @@ def _ods_to_list(table): for cloned_row in itertools.repeat(row, number)] -def get_url(url: str, log=None, **kw) -> requests.Response: - res = requests.get(url, **kw) - if log: - level = 
log.info if res.status_code == 200 else log.warning - level('HTTP {0} for {1}'.format( - termcolor.colored(res.status_code, 'blue'), termcolor.colored(url, 'blue'))) - return res +@contextlib.contextmanager +def urlopen(url, timeout=HTTP_REQUEST_TIMEOUT): + """ + Open URLs + - without raising an exception on HTTP errors, + - passing a specific User-Agent header, + - specifying a timeout. + """ + class NonRaisingHTTPErrorProcessor(urllib.request.HTTPErrorProcessor): + """Don't raise exceptions on HTTP errors.""" + http_response = https_response = lambda self, req, res: res # pylint: disable=C3001 + + opener = urllib.request.build_opener(NonRaisingHTTPErrorProcessor) + opener.addheaders = [('User-agent', 'cldfbench/2.0.0')] + yield opener.open(urllib.request.Request(url), timeout=timeout) class DataDir(type(pathlib.Path())): """ A `pathlib.Path` augmented with functionality to read common data formats. """ - def _path(self, fname: typing.Union[str, pathlib.Path]) -> pathlib.Path: + def _path(self, fname: PathType) -> pathlib.Path: """ Interpret strings without "/" as names of files in `self`. @@ -126,12 +141,14 @@ def _path(self, fname: typing.Union[str, pathlib.Path]) -> pathlib.Path: return self / fname return pathlib.Path(fname) - def read(self, - fname: typing.Union[str, pathlib.Path], - aname: str = None, - normalize: str = None, - suffix: str = None, - encoding: str = 'utf8') -> str: + def read( # pylint: disable=R0913,R0917 + self, + fname: PathType, + aname: str = None, + normalize: Optional[Literal['NFC', 'NFKC', 'NFD', 'NFKD']] = None, + suffix: str = None, + encoding: str = 'utf8', + ) -> str: """ Read text data from a file. 
@@ -144,8 +161,8 @@ def read(self, p = self._path(fname) suffix = suffix or p.suffix if suffix == '.zip': - zip = zipfile.ZipFile(str(p)) - text = zip.read(aname or zip.namelist()[0]).decode(encoding) + with zipfile.ZipFile(str(p)) as zipf: + text = zipf.read(aname or zipf.namelist()[0]).decode(encoding) elif suffix == '.gz': with gzip.open(p) as fp: text = fp.read().decode(encoding) @@ -156,7 +173,7 @@ def read(self, text = unicodedata.normalize(normalize, text) return text - def write(self, fname: typing.Union[str, pathlib.Path], text: str, encoding='utf8'): + def write(self, fname: PathType, text: str, encoding='utf8'): """ Write text data to a file. @@ -165,53 +182,53 @@ def write(self, fname: typing.Union[str, pathlib.Path], text: str, encoding='utf self._path(fname).write_text(text, encoding=encoding) return fname - def read_csv(self, - fname: typing.Union[str, pathlib.Path], - normalize=None, **kw) -> typing.List[typing.Union[dict, list]]: + def read_csv( + self, + fname: PathType, + normalize: Optional[Literal['NFC', 'NFKC', 'NFD', 'NFKD']] = None, + **kw, + ) -> list[Union[dict[str, str], list[str]]]: """ Read CSV data from a file. 
""" + reader = dsv.reader(self._path(fname), **kw) + if not normalize: - return list(dsv.reader(self._path(fname), **kw)) - if kw.get('dicts'): - return [collections.OrderedDict( - [(k, unicodedata.normalize(normalize, v)) for k, v in row.items()] - ) for row in dsv.reader(self._path(fname), **kw)] - else: - return [[unicodedata.normalize(normalize, k) for k in row] - for row in dsv.reader(self._path(fname), **kw)] + return list(reader) - def write_csv(self, - fname: typing.Union[str, pathlib.Path], - rows: typing.Iterable[typing.List[str]], **kw): + norm = functools.partial(unicodedata.normalize, normalize) + + if not kw.get('dicts'): + return [[norm(k) for k in row] for row in reader] + + return [collections.OrderedDict([(k, norm(v)) for k, v in row.items()]) for row in reader] + + def write_csv(self, fname: PathType, rows: Iterable[list[str]], **kw): """ Write CSV data to a file. """ with dsv.UnicodeWriter(self._path(fname), **kw) as writer: writer.writerows(rows) - def read_xml(self, fname: typing.Union[str, pathlib.Path], wrap=True) -> et.Element: + def read_xml(self, fname: PathType, wrap=True) -> et.Element: """ Reads and parses XML from a file. 
""" xml = xmlchars(self.read(fname)) if wrap: - xml = '{0}'.format(xml) + xml = f'{xml}' return et.fromstring(xml.encode('utf8')) - def read_json(self, - fname: typing.Union[str, pathlib.Path], - **kw) -> typing.Union[str, list, dict]: + def read_json(self, fname: PathType, **_) -> Union[str, list, dict]: + """Read a JSON file.""" return jsonlib.load(self._path(fname)) - def read_bib(self, - fname: typing.Union[str, pathlib.Path] = 'sources.bib') -> typing.List[Source]: + def read_bib(self, fname: PathType = 'sources.bib') -> list[Source]: + """Read a BibTeX file.""" bib = simplepybtex.database.parse_string(self.read(fname), bib_format='bibtex') return [Source.from_entry(k, e) for k, e in bib.entries.items()] - def ods2csv(self, - fname: typing.Union[str, pathlib.Path], - outdir: typing.Optional[pathlib.Path] = None) -> typing.Dict[str, pathlib.Path]: + def ods2csv(self, fname: PathType, outdir: Optional[pathlib.Path] = None) -> PathDictType: """ Dump the data from an OpenDocument Spreadsheet (suffix .ODS) file to CSV. @@ -234,17 +251,13 @@ def ods2csv(self, res = {} for table in tables: table_name = table.attributes[ODF_NS_TABLE, 'name'] - csv_path = outdir / '{}.{}.csv'.format( - fname.stem, - slug(table_name, lowercase=False)) + csv_path = outdir / f'{fname.stem}.{slug(table_name, lowercase=False)}.csv' with dsv.UnicodeWriter(csv_path) as writer: writer.writerows(_ods_to_list(table)) res[table_name] = csv_path return res - def xls2csv(self, - fname: typing.Union[str, pathlib.Path], - outdir: typing.Optional[pathlib.Path] = None) -> typing.Dict[str, pathlib.Path]: + def xls2csv(self, fname: PathType, outdir: Optional[pathlib.Path] = None) -> PathDictType: """ Dump the data from an Excel XLS file to CSV. 
@@ -263,7 +276,7 @@ def xls2csv(self, wb = xlrd.open_workbook(str(fname)) except xlrd.biffh.XLRDError as e: if 'xlsx' in str(e): - raise ValueError('To read xlsx files, call xlsx2csv!') + raise ValueError('To read xlsx files, call xlsx2csv!') from e raise # pragma: no cover for sname in wb.sheet_names(): sheet = wb.sheet_by_name(sname) @@ -275,9 +288,7 @@ def xls2csv(self, res[sname] = path return res - def xlsx2csv(self, - fname: typing.Union[str, pathlib.Path], - outdir: typing.Optional[pathlib.Path] = None) -> typing.Dict[str, pathlib.Path]: + def xlsx2csv(self, fname: PathType, outdir: Optional[pathlib.Path] = None) -> PathDictType: """ Dump the data from an Excel XLSX file to CSV. @@ -297,8 +308,8 @@ def _excel_value(x): # Since Excel does not have an integer type, integers are rendered as "n.0", # which in turn confuses type detection of tools like csvkit. Thus, we normalize # numbers of the form "n.0" to "n". - return '{0}'.format(int(x)) # pragma: no cover - return '{0}'.format(x).strip() + return f'{int(x)}' # pragma: no cover + return f'{x}'.strip() fname = self._path(fname) res = {} @@ -314,10 +325,7 @@ def _excel_value(x): return res @contextlib.contextmanager - def temp_download(self, - url: str, - fname: typing.Union[str, pathlib.Path], - log=None) -> pathlib.Path: + def temp_download(self, url: str, fname: PathType, log: LogType = None) -> pathlib.Path: """ Context manager to use when downloaded data needs to be manipulated before storage \ (e.g. to anonymize it). @@ -337,22 +345,27 @@ def temp_download(self, if p and p.exists(): p.unlink() - def download(self, - url: str, - fname: typing.Union[str, pathlib.Path], - log=None, - skip_if_exists=False): + def download( + self, + url: str, + fname: PathType, + log: LogType = None, + skip_if_exists: bool = False, + ) -> pathlib.Path: """ Download data from a URL to the directory. 
""" p = self._path(fname) if p.exists() and skip_if_exists: return p - res = get_url(url, log=log, stream=True) - with p.open('wb') as fp: - for chunk in res.iter_content(chunk_size=1024): - if chunk: # filter out keep-alive new chunks - fp.write(chunk) + + with urlopen(url) as fp: + if log: + blue = functools.partial(colored, 'blue') + level = log.info if fp.status == 200 else log.warning + level(f'HTTP {blue(fp.status)} for {blue(url)}') + p.write_bytes(fp.read()) + return p def download_and_unpack(self, url: str, *paths: str, **kw): diff --git a/src/cldfbench/dataset.py b/src/cldfbench/dataset.py index 541a040..5f91f8d 100644 --- a/src/cldfbench/dataset.py +++ b/src/cldfbench/dataset.py @@ -1,21 +1,20 @@ +""" +A cldfbench Dataset provides scaffolding to automatically create one or more CLDF Datasets. +""" import sys -import typing +from typing import Union, Optional import inspect import pathlib import logging import argparse +import functools import importlib import subprocess -from datetime import datetime -try: # pragma: no cover - from datetime import UTC - now = lambda: datetime.now(UTC) # noqa: E731 -except ImportError: # pragma: no cover - now = lambda: datetime.utcnow() # noqa: E731 +from collections.abc import Generator import pycldf from clldutils.path import sys_path -from clldutils.misc import lazyproperty, nfilter +from clldutils.misc import nfilter from cldfcatalog import Repository from cldfbench.cldf import CLDFSpec, CLDFWriter @@ -23,13 +22,17 @@ from cldfbench.metadata import Metadata from cldfbench.ci import build_status_badge from cldfbench.util import get_entrypoints +from cldfbench._compat import utcnow __all__ = ['iter_datasets', 'get_dataset', 'get_datasets', 'Dataset', 'ENTRY_POINT'] ENTRY_POINT = 'cldfbench.dataset' NOOP = -1 +PathType = Union[str, pathlib.Path] +SpecDictKeyType = Union[str, None] +SpecDictType = dict[SpecDictKeyType, CLDFSpec] -class Dataset(object): +class Dataset: """ A cldfbench dataset ties together @@ -64,9 +67,9 
@@ def __init__(self): self.metadata.id = self.id def __str__(self): - return '{0.__class__.__name__} "{0.id}" at {1}'.format(self, self.dir.resolve()) + return f'{self.__class__.__name__} "{self.id}" at {self.dir.resolve()}' - @lazyproperty + @functools.cached_property def cldf_dir(self) -> DataDir: """ Directory where CLDF data generated from the Dataset will be stored (unless specified @@ -74,21 +77,21 @@ def cldf_dir(self) -> DataDir: """ return self.dir / 'cldf' - @lazyproperty + @functools.cached_property def raw_dir(self) -> DataDir: """ Directory where cldfbench expects the raw or source data. """ return self.dir / 'raw' - @lazyproperty + @functools.cached_property def etc_dir(self) -> DataDir: """ Directory where cldfbench expects additional configuration or metadata. """ return self.dir / 'etc' - def cldf_specs(self) -> typing.Union[CLDFSpec, typing.Dict[str, CLDFSpec]]: + def cldf_specs(self) -> Union[CLDFSpec, SpecDictType]: """ A `Dataset` must declare all CLDF datasets that are derived from it. @@ -99,7 +102,7 @@ def cldf_specs(self) -> typing.Union[CLDFSpec, typing.Dict[str, CLDFSpec]]: return CLDFSpec(dir=self.cldf_dir) @property - def cldf_specs_dict(self) -> typing.Dict[typing.Union[str, None], CLDFSpec]: + def cldf_specs_dict(self) -> SpecDictType: """ Turn :meth:`cldf_specs` into a `dict` for simpler lookup. @@ -116,13 +119,14 @@ def update_submodules(self): Convenience method to be used in a `Dataset`'s `cmd_download` to update raw data curated as git submodules. 
""" - subprocess.check_call( - 'git -C {} submodule update --remote'.format(self.dir.resolve()), shell=True) - - def cldf_writer(self, - args: argparse.Namespace, - cldf_spec: typing.Optional[typing.Union[str, CLDFSpec]] = None, - clean: bool = True) -> CLDFWriter: + subprocess.check_call(f'git -C {self.dir.resolve()} submodule update --remote', shell=True) + + def cldf_writer( + self, + args: argparse.Namespace, + cldf_spec: Union[CLDFSpec, SpecDictKeyType] = None, + clean: bool = True, + ) -> CLDFWriter: """ :param args: Namespace passed in when initializing the `CLDFWriter` instance. :param cldf_spec: Key of the relevant `CLDFSpec` in `Dataset.cldf_specs` @@ -137,8 +141,10 @@ def cldf_writer(self, cldf_spec = self.cldf_specs_dict[cldf_spec] return cldf_spec.get_writer(args=args, dataset=self, clean=clean) - def cldf_reader(self, - cldf_spec: typing.Union[str, CLDFSpec, None] = None) -> pycldf.Dataset: + def cldf_reader( + self, + cldf_spec: Union[CLDFSpec, SpecDictKeyType] = None, + ) -> pycldf.Dataset: """ :param cldf_spec: Key of the relevant `CLDFSpec` in `Dataset.cldf_specs`. :return: a `pycldf.Dataset` instance, for read-access to the CLDF data. @@ -147,31 +153,31 @@ def cldf_reader(self, cldf_spec = self.cldf_specs_dict[cldf_spec] return cldf_spec.get_dataset() - @lazyproperty - def repo(self) -> typing.Union[Repository, None]: + @functools.cached_property + def repo(self) -> Optional[Repository]: """ The git repository cloned to the dataset's directory (or `None`). 
""" try: return Repository(self.dir) except ValueError: # pragma: no cover - return + return None def _cmd_download(self, args): self.raw_dir.mkdir(exist_ok=True) self.cmd_download(args) (self.raw_dir / 'README.md').write_text( - 'Raw data downloaded {0}'.format(now().isoformat()), encoding='utf8') + f'Raw data downloaded {utcnow().isoformat()}', encoding='utf8') def cmd_download(self, args: argparse.Namespace): """ Implementations of this methods should populate the dataset's `raw_dir` with the source data. """ - args.log.warning('cmd_{0} not implemented for dataset {1}'.format('download', self.id)) + args.log.warning('cmd_download not implemented for dataset %s', self.id) return NOOP - def _cmd_readme(self, args): + def _cmd_readme(self, args: argparse.Namespace): if self.metadata: badge = build_status_badge(self) md = self.cmd_readme(args) @@ -184,22 +190,21 @@ def _cmd_readme(self, args): lines.extend(['', badge]) md = '\n'.join(lines) + rel_cldf_dir = self.cldf_dir.resolve().relative_to(self.dir.resolve()) section = [ '\n\n## CLDF Datasets\n', - 'The following CLDF datasets are available in [{0}]({0}):\n'.format( - self.cldf_dir.resolve().relative_to(self.dir.resolve()) - ) + f'The following CLDF datasets are available in [{rel_cldf_dir}]({rel_cldf_dir}):\n' ] for ds in self.cldf_specs_dict.values(): if ds.metadata_path.exists(): - p = ds.metadata_path.resolve().relative_to(self.dir.resolve()) - section.append( - '- CLDF [{0}](https://github.com/cldf/cldf/tree/master/modules/{0}) ' - 'at [{1}]({1})'.format(ds.module, p)) + rel_p = ds.metadata_path.resolve().relative_to(self.dir.resolve()) + module_link = (f'[{ds.module}](https://github.com/cldf/cldf/tree/master' + f'/modules/{ds.module})') + section.append(f'- CLDF {module_link} at [{rel_p}]({rel_p})') self.dir.joinpath('README.md').write_text(md + '\n'.join(section), encoding='utf8') - def cmd_readme(self, args: argparse.Namespace) -> str: + def cmd_readme(self, _: argparse.Namespace) -> str: """ 
Implementations of this method should create the content for the dataset's README.md and return it as markdown formatted string. @@ -228,25 +233,25 @@ def cmd_makecldf(self, args: argparse.Namespace): :param args: An `argparse.Namespace` including attributes: \ - `writer`: :class:`CLDFWriter` instance """ - args.log.warning('cmd_{0} not implemented for dataset {1}'.format('makecldf', self.id)) + args.log.warning('cmd_makecldf not implemented for dataset %s', self.id) return NOOP -def iter_datasets(ep: str = ENTRY_POINT) -> typing.Generator[Dataset, None, None]: +def iter_datasets(ep: str = ENTRY_POINT) -> Generator[Dataset, None, None]: """ Yields `Dataset` instances registered for the specified entry point. :param ep: Name of the entry point. """ - for ep in get_entrypoints(ep): + for p in get_entrypoints(ep): try: - cls = ep.load() + cls = p.load() yield cls() # yield an initialized `Dataset` object. except ImportError as e: # pragma: no cover - logging.getLogger('cldfbench').warning('Error importing {0}: {1}'.format(ep.name, e)) + logging.getLogger('cldfbench').warning('Error importing %s: %s', p.name, e) -def get_dataset(spec, ep=ENTRY_POINT) -> Dataset: +def get_dataset(spec, ep: str = ENTRY_POINT) -> Optional[Dataset]: """ Get an initialised `Dataset` instance. 
@@ -264,9 +269,10 @@ def get_dataset(spec, ep=ENTRY_POINT) -> Dataset: ds = dataset_from_module(spec) if ds: return ds + return None -def get_datasets(spec, ep=ENTRY_POINT, glob: bool = False) -> typing.List[Dataset]: +def get_datasets(spec, ep=ENTRY_POINT, glob: bool = False) -> list[Dataset]: """ :param spec: Either `'*'` to get all datasets for a specific entry point, or glob pattern \ matching dataset modules in the current directory (if `glob == True`), or a `str` as accepted \ @@ -279,7 +285,7 @@ def get_datasets(spec, ep=ENTRY_POINT, glob: bool = False) -> typing.List[Datase return nfilter([get_dataset(spec, ep=ep)]) -def dataset_from_module(path) -> typing.Union[Dataset, None]: +def dataset_from_module(path: PathType) -> Optional[Dataset]: """ load the first `Dataset` subclass found in the module which does not have any subclasses. """ @@ -294,3 +300,4 @@ def dataset_from_module(path) -> typing.Union[Dataset, None]: for _, obj in inspect.getmembers(mod): if inspect.isclass(obj) and issubclass(obj, Dataset) and not obj.__subclasses__(): return obj() + return None diff --git a/src/cldfbench/metadata.py b/src/cldfbench/metadata.py index 24a6679..d6b944e 100644 --- a/src/cldfbench/metadata.py +++ b/src/cldfbench/metadata.py @@ -2,11 +2,11 @@ Dataset metadata """ import json -import collections import pathlib -import typing +from typing import Optional +import collections +import dataclasses -import attr from clldutils import licenses from clldutils.misc import nfilter from clldutils.markup import iter_markdown_tables @@ -342,8 +342,8 @@ } -@attr.s -class Metadata(object): +@dataclasses.dataclass +class Metadata: """ Dataset metadata is used as follows: @@ -358,23 +358,23 @@ class Metadata(object): - add more `attr.ib` s, - register the subclass with the dataset by assigning it to `cldfbench.Dataset.metadata_cls`. 
""" - id = attr.ib( + id: str = dataclasses.field( default=None, - metadata=dict(elicit=True, required=True)) - title = attr.ib( + metadata=dict(elicit=True, required=True)) # pylint: disable=R1735 + title: str = dataclasses.field( default=None, - metadata=dict(elicit=True, required=True)) - description = attr.ib( + metadata=dict(elicit=True, required=True)) # pylint: disable=R1735 + description: str = dataclasses.field( default=None) - license = attr.ib( + license: str = dataclasses.field( default=None, - metadata=dict(elicit=True, required=True)) - url = attr.ib( + metadata=dict(elicit=True, required=True)) # pylint: disable=R1735 + url: str = dataclasses.field( default=None, - metadata=dict(elicit=True)) - citation = attr.ib( + metadata=dict(elicit=True)) # pylint: disable=R1735 + citation: str = dataclasses.field( default=None, - metadata=dict(elicit=True, required=True)) + metadata=dict(elicit=True, required=True)) # pylint: disable=R1735 @classmethod def elicit(cls) -> 'Metadata': @@ -382,10 +382,10 @@ def elicit(cls) -> 'Metadata': Factory method, called when creating a new dataset directory. 
""" kw = {} - for field in attr.fields(cls): + for field in dataclasses.fields(cls): if field.metadata.get('elicit', False): - res = input('{0}: '.format(field.name)) - if (not res) and field.default is not attr.NOTHING: + res = input(f'{field.name}: ') + if (not res) and field.default: res = field.default kw[field.name] = res return cls(**kw) @@ -397,25 +397,33 @@ def from_file(cls, fname: pathlib.Path) -> 'Metadata': """ with fname.open('r', encoding='utf-8') as fp: try: - return cls(**json.load(fp)) + fields = {f.name for f in dataclasses.fields(cls)} + return cls(**{k: v for k, v in json.load(fp).items() if k in fields}) except json.decoder.JSONDecodeError as e: # pragma: no cover - raise ValueError('Invalid JSON file: {}\n{}'.format(fname.resolve(), e)) + raise ValueError(f'Invalid JSON file: {fname.resolve()}\n{e}') from e def write(self, fname: pathlib.Path): + """Dump the metadata as JSON to disk.""" with fname.open('w', encoding='utf-8') as fp: - return json.dump(attr.asdict(self), fp, indent=4) + return json.dump(dataclasses.asdict(self), fp, indent=4) @property - def known_license(self) -> typing.Union[None, licenses.License]: + def known_license(self) -> Optional[licenses.License]: + """ + A known license - if one can be matched to self.license. + """ if self.license: return licenses.find(self.license) + return None # pragma: no cover @property - def zenodo_license(self) -> str: + def zenodo_license(self) -> Optional[str]: + """A license ID suitable for inclusion in metadata for Zenodo.""" if self.known_license and self.known_license.id in LICENSES: return self.known_license.id + return None # pragma: no cover - def common_props(self) -> typing.Dict[str, object]: + def common_props(self) -> collections.OrderedDict[str, str]: """ The metadata as JSON-LD object suitable for inclusion in CLDF metadata. 
""" @@ -435,13 +443,14 @@ def common_props(self) -> typing.Dict[str, object]: return res def markdown(self) -> str: + """A human-readable version of the metadata formatted as Markdown.""" lines = [ - '# {0}\n'.format(self.title or 'Dataset {0}'.format(self.id)), + '# ' + (self.title or f'Dataset {self.id}') + '\n', '## How to cite\n\nIf you use these data please cite', ] if self.citation: lines.append('- the original source') - lines.extend([" > {}".format(line) for line in self.citation.split('\n')]) + lines.extend([f" > {line}" for line in self.citation.split('\n')]) lines.extend([ "- the derived dataset using the DOI of the " "[particular released version](../../releases/) you were using" @@ -455,18 +464,25 @@ def markdown(self) -> str: lines.append('\n## Description\n\n') if self.description: - lines.append('{0}\n'.format(self.description)) + lines.append(f'{self.description}\n') if self.license: - lines.append('This dataset is licensed under a %s license\n' % self.license) + lines.append(f'This dataset is licensed under a {self.license} license\n') if self.url: - lines.append('Available online at %s\n' % self.url) + lines.append(f'Available online at {self.url}\n') return '\n'.join(lines) -def get_creators_and_contributors(text, strict=True) -> typing.Tuple[list, list]: +TableRowsType = list[dict[str, str]] + + +def get_creators_and_contributors( + text: str, + strict: bool = True, +) -> tuple[TableRowsType, TableRowsType]: + """Read contributor info from a markdown formatted table.""" ctypes = {c.lower(): c for c in CONTRIBUTOR_TYPES} creators, contributors = [], [] # Read first table in CONTRIBUTORS.md diff --git a/src/cldfbench/scaffold.py b/src/cldfbench/scaffold.py index 944dd88..c0c2660 100644 --- a/src/cldfbench/scaffold.py +++ b/src/cldfbench/scaffold.py @@ -13,35 +13,33 @@ import shutil import pathlib import warnings +import dataclasses +from collections.abc import Generator -import attr - -import cldfbench from cldfbench.metadata import Metadata from 
cldfbench.util import get_entrypoints __all__ = ['Template'] -def iter_scaffolds(): +def iter_scaffolds() -> Generator[tuple[str, type], None, None]: + """Yield registered cldfbench templates.""" yield 'cldfbench', Template for ep in get_entrypoints('cldfbench.scaffold'): try: # pragma: no cover yield ep.name, ep.load() - except Exception as e: # pragma: no cover - warnings.warn( - '{0} loading cldfbench.scaffold {1}: {2}'.format( - e.__class__.__name__, ep.name, e)) + except Exception as e: # pragma: no cover # pylint: disable=W0718 + warnings.warn(f'{e.__class__.__name__} loading cldfbench.scaffold {ep.name}: {e}') -class Template(object): +class Template: # pylint: disable=R0903 """A CLDF dataset suitable for curation in a GitHub repository""" - prefix = cldfbench.__name__ - package = cldfbench.__name__ + prefix = 'cldfbench' + package = 'cldfbench' # To overwite individual template files, provide a secondary template directory which # contains only the specialized template files. - dirs = [pathlib.Path(cldfbench.__file__).parent / 'dataset_template'] + dirs = [pathlib.Path(__file__).parent / 'dataset_template'] id_pattern = re.compile('[a-z_0-9]+$') """ @@ -54,20 +52,24 @@ class Template(object): - assign the derived class to your template's `metadata` attribute. E.g. - >>> @attr.s + >>> @dataclasses.dataclass ... class CustomMetadata(Metadata): - ... custom_var = attr.ib(default=None, metadata=dict(elicit=True)) + ... custom_var: str = dataclasses.field(default=None, metadata=dict(elicit=True)) ... >>> class CustomTemplate(Template): ... metadata = CustomMetadata """ metadata = Metadata - def render(self, outdir, metadata): - # The cli will have used the class in `self.metadata` to elicit info from the user, - # and pass `self.metadata(...)` as `metadata` + def render(self, outdir: pathlib.Path, metadata: Metadata): + """ + .. 
note:: + + The cli will have used the class in `self.metadata` to elicit info from the user, + and pass `self.metadata(...)` as `metadata` + """ - ctx = attr.asdict(metadata) + ctx = dataclasses.asdict(metadata) ctx.update(prefix=self.prefix, package=self.package) if outdir.name != ctx['id']: outdir = outdir / ctx['id'] diff --git a/src/cldfbench/util.py b/src/cldfbench/util.py index 994eb48..03436a1 100644 --- a/src/cldfbench/util.py +++ b/src/cldfbench/util.py @@ -1,29 +1,52 @@ +""" +Utilities. +""" import sys import pathlib +import platform import subprocess import importlib.metadata -import platform +from typing import Literal, Union +from collections.abc import Iterable, Generator + +import termcolor + +from ._compat import entry_points_select -def get_entrypoints(group): - eps = importlib.metadata.entry_points() - return eps.select(group=group) if hasattr(eps, 'select') else eps.get(group, []) +def colored(color: Literal['red', 'blue'], text, **kw): + """Make termcolor.colored amenable to currying via functools.partial.""" + return termcolor.colored(text, color, **kw) -def iter_aligned(pairs, prefix=''): +def get_entrypoints(group: str) -> Iterable[importlib.metadata.EntryPoint]: + """Get registered entry points for a group.""" + return entry_points_select(importlib.metadata.entry_points(), group) + + +def iter_aligned( + pairs: Iterable[Union[tuple[str, str], list[str]]], + prefix: str = '', + minspace: int = 1, +) -> Generator[str, None, None]: + """ + >>> print("\n".join(iter_aligned([('abc', '12'), ('x', '1234')], prefix='+'))) + +abc 12 + +x 1234 + """ pairs = list(pairs) # make sure we can iterate twice over `pairs` if pairs: - maxlabel = max(len(p[0]) for p in pairs) + maxlabel = max(len(p[0]) for p in pairs) + minspace for p in pairs: - yield '{0}{1} {2}'.format(prefix, p[0].ljust(maxlabel), p[1] or '') + yield f"{prefix}{p[0].ljust(maxlabel)}{p[1] or ''}" -def iter_requirements(): +def iter_requirements() -> Generator[str, None, None]: """ :return: 
generator of lines in pip's requirements.txt format, specifying packages which are \ imported in the current python process. """ - imported = set(m.split('.')[0].lower() for m in sys.modules.keys()) + imported = set(m.split('.')[0].lower() for m in sys.modules) pip = pathlib.Path(sys.executable).parent / 'pip' if platform.system() == "Windows": @@ -39,8 +62,8 @@ def iter_requirements(): try: installed = subprocess.check_output([str(pip), 'freeze']) - except subprocess.CalledProcessError: # pragma: no cover - raise ValueError() + except subprocess.CalledProcessError as e: # pragma: no cover + raise ValueError() from e for req in installed.decode('utf-8').split('\n'): if '==' in req: diff --git a/tests/fixtures/module_media_local.py b/tests/fixtures/module_media_local.py new file mode 100644 index 0000000..cddf9cc --- /dev/null +++ b/tests/fixtures/module_media_local.py @@ -0,0 +1,35 @@ +from cldfbench import Dataset, CLDFSpec + + +class t_a: + name = 'origin' + url = 'https://github.com/lexibank/dataset.git' + + +class t_b: + remotes = [t_a()] + + +class t_c: + repo = t_b() + url = 'https://github.com/lexibank/dataset.git' + + def json_ld(self): + pass # pragma: no cover + + +class Thing(Dataset): + id = 'medialocal' + repo = t_c() + + def cldf_specs(self): # pragma: no cover + return {None: Dataset.cldf_specs(self)} + + def cmd_makecldf(self, args): # pragma: no cover + args.writer.cldf.add_component('MediaTable') + args.writer.objects['MediaTable'].append( + {'ID': '12345', 'Download_URL': 'Generic-metadata.json', 'Media_Type': 'application/json'} + ) + args.writer.objects['MediaTable'].append( + {'ID': '12345', 'Download_URL': 'Generix-metadata.json', 'Media_Type': 'application/json'} + ) diff --git a/tests/test_catalogs.py b/tests/test_catalogs.py index 91f7a25..212fee4 100644 --- a/tests/test_catalogs.py +++ b/tests/test_catalogs.py @@ -1,6 +1,9 @@ +import pytest + from cldfbench.catalogs import * +@pytest.mark.with_catalog def test_Glottolog(glottolog_dir): 
cat = Glottolog(glottolog_dir) assert cat.api.languoids(ids=['abcd1234']) @@ -13,6 +16,7 @@ def test_Glottolog(glottolog_dir): assert 'abcd1234' in cat.api.macroareas_by_glottocode -def testConcepticon(concepticon_dir): +@pytest.mark.with_catalog +def test_Concepticon(concepticon_dir): cat = Concepticon(concepticon_dir) _ = cat.api.cached_glosses diff --git a/tests/test_cldf.py b/tests/test_cldf.py index fbb9939..f886f37 100644 --- a/tests/test_cldf.py +++ b/tests/test_cldf.py @@ -24,8 +24,6 @@ def test_cldf_spec(tmp_path): def test_cldf(tmp_path): - from cldfbench.cldf import WITH_ZIPPED - with pytest.raises(AttributeError): _ = CLDFWriter().cldf @@ -44,7 +42,7 @@ def test_cldf(tmp_path): writer['ValueTable', 'value'].separator = '|' writer.objects['ValueTable'].append( dict(ID=1, Language_ID='l', Parameter_ID='p', Value=[1, 2])) - assert (not WITH_ZIPPED) or tmp_path.joinpath('data.csv.zip').exists() + assert tmp_path.joinpath('data.csv.zip').exists() ds = Dataset.from_metadata(tmp_path / 'StructureDataset-metadata.json') values = list(ds['ValueTable']) assert len(values) == 1 diff --git a/tests/test_cli.py b/tests/test_cli.py index 1c4f2b4..770b607 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -29,10 +29,20 @@ def tmpds_media(fixtures_dir, tmp_path): return tmp_path / 'module_media.py' +@pytest.fixture +def tmpds_media2(fixtures_dir, tmp_path): + for p in fixtures_dir.iterdir(): + if p.is_file(): + shutil.copy(p, tmp_path / p.name) + return tmp_path / 'module_media_local.py' + + def _main(cmd, **kw): + kw.setdefault('log', logging.getLogger(__name__)) return cli.main(shlex.split('--no-config ' + cmd), **kw) +@pytest.mark.with_catalog def test_get_cldf_dataset(tmp_path, tmpds, glottolog_dir): vals = tmp_path.joinpath('values.csv') vals.write_text('ID,Language_ID,Parameter_ID,Value\n1,1,1,1', encoding='utf8') @@ -48,6 +58,7 @@ def test_get_cldf_dataset(tmp_path, tmpds, glottolog_dir): assert ds.module == 'StructureDataset' +@pytest.mark.with_catalog 
def test_cldfreadme(tmp_path, tmpds, glottolog_dir): _main('makecldf ' + str(tmpds) + ' --with-zenodo --with-cldfreadme --glottolog ' + str(glottolog_dir)) @@ -61,6 +72,7 @@ def test_help(capsys): assert 'usage' in out +@pytest.mark.with_catalog def test_misc(tmp_path, mocker, glottolog_dir): with pytest.raises(SystemExit): _main('new --template=xyz') @@ -100,6 +112,7 @@ def test_run(caplog, tmpds): _main('run ' + str(tmpds) + ' raise') +@pytest.mark.with_catalog def test_readme(tmpds, tmp_path, glottolog_dir, mocker): _main('readme ' + str(tmpds)) _main('makecldf ' + str(tmpds) + ' --glottolog ' + str(glottolog_dir)) @@ -139,6 +152,7 @@ def test_download(tmpds): _main('download abc') +@pytest.mark.with_catalog def test_catinfo(capsys, glottolog_dir): _main('catinfo --glottolog {0}'.format(glottolog_dir)) out, _ = capsys.readouterr() @@ -172,12 +186,14 @@ def test_catalog_from_config(glottolog_dir, tmpds, mocker, tmp_path, fixtures_di cli.main(['makecldf', str(tmpds)]) +@pytest.mark.with_catalog def test_workflow(tmpds, glottolog_dir): _main('makecldf ' + str(tmpds) + ' --glottolog ' + str(glottolog_dir)) assert _main('check ' + str(tmpds) + ' --with-validation', log=logging.getLogger(__name__)) == 1 _main('geojson ' + str(tmpds)) +@pytest.mark.with_catalog def test_diff(tmpds, mocker, caplog, glottolog_dir, csvw3): class Item: def __init__(self, p): @@ -223,6 +239,7 @@ def test_check(tmpds, tmp_path): assert _main('check ' + str(tmpds), log=logging.getLogger(__name__)) == 0 +@pytest.mark.with_catalog def test_media(tmpds_media, tmp_path, glottolog_dir, capsys, mocker): releasedir = pathlib.Path('thing_{}'.format(MEDIA)) zipfile_name = pathlib.Path('{}.zip'.format(MEDIA)) @@ -248,15 +265,27 @@ def urlretrieve(*args): assert 'application/pdf' not in capturedout with pytest.raises(SystemExit): - _main('media -m wav --create-release -p 10.5072/zenodo.710757 ' + str(tmpds_media)) - with pytest.raises(SystemExit): - _main('media --create-release --update-zendo ' + 
str(tmpds_media)) + _main('media -m wav -p 10.5072/zenodo.710757 ' + str(tmpds_media)) with pytest.raises(SystemExit): - _main('media --create-release ' + str(tmpds_media)) + _main('media ' + str(tmpds_media)) - _main('media -o ' + str(tmp_path) + ' -m wav --create-release -p 10.5281/zenodo.4350882 ' + str(tmpds_media)) + _main('media -o ' + str(tmp_path) + ' -m wav -p 10.5281/zenodo.4350882 ' + str(tmpds_media)) assert (tmp_path / MEDIA / INDEX_CSV).exists() assert (tmp_path / MEDIA / wav_name[:2] / wav_name).exists() assert (tmp_path / releasedir / zipfile_name).exists() assert (tmp_path / releasedir / 'README.md').exists() assert (tmp_path / releasedir / ZENODO_FILE_NAME).exists() + + +@pytest.mark.with_catalog +def test_media2(tmpds_media2, tmp_path, glottolog_dir, capsys): + _main('makecldf ' + str(tmpds_media2) + ' --glottolog ' + str(glottolog_dir)) + + _main('media -l ' + str(tmpds_media2)) + capturedout = capsys.readouterr().out + assert 'application/json' in capturedout + + _main('media -o ' + str(tmp_path) + ' -p 10.5281/zenodo.4350882 ' + str(tmpds_media2)) + assert (tmp_path / MEDIA / INDEX_CSV).exists() + assert 'local_path' in (tmp_path / MEDIA / INDEX_CSV).read_text(encoding='utf8') + assert (tmp_path / MEDIA / '12' / '12345.json').exists() diff --git a/tests/test_datadir.py b/tests/test_datadir.py index 8a31962..ce275ae 100644 --- a/tests/test_datadir.py +++ b/tests/test_datadir.py @@ -1,6 +1,9 @@ +import logging import sys import gzip import shutil +import contextlib +import urllib.error import pytest @@ -15,9 +18,13 @@ def datadir(tmp_path, fixtures_dir): return DataDir(tmp_path) -def test_get_url(mocker): - mocker.patch('cldfbench.datadir.requests', mocker.Mock(get=mocker.Mock())) - get_url(None, log=mocker.Mock(warn=mocker.Mock())) +@pytest.mark.with_internet +def test_urlopen(): + try: + with urlopen('https://httpbin.org/delay/2', timeout=0.01) as res: + assert res.status in (404, 201) # pragma: no cover + except urllib.error.URLError as e: + 
assert ('timed out' in str(e)) or ('failure in name resolution' in str(e)) def test_datadir(datadir): @@ -81,14 +88,18 @@ def test_datadir_ods(datadir): assert len(data3) == 4 -def test_datadir_download_and_unpack(datadir, mocker): - mocker.patch( - 'cldfbench.datadir.get_url', - mocker.Mock( - return_value=mocker.Mock( - iter_content=mocker.Mock( - return_value=[datadir.joinpath('test.zip').open('rb').read()])))) - datadir.download_and_unpack(None) +def test_datadir_download_and_unpack(datadir, mocker, caplog): + @contextlib.contextmanager + def mock_urlopen(*args, **kw): + yield mocker.Mock(status=201, read=lambda: datadir.joinpath('test.zip').open('rb').read()) + + mocker.patch('cldfbench.datadir.urlopen', mock_urlopen) + datadir.download_and_unpack('') assert datadir.joinpath('setup.py').exists() - datadir.download(None, 'fname') - datadir.download(None, 'fname', skip_if_exists=True) + with caplog.at_level(logging.INFO): + datadir.download('x', 'fname', log=logging.getLogger(__name__)) + assert len(caplog.records) == 1 + assert 'x' in caplog.records[0].message + assert caplog.records[0].levelname == 'warning'.upper() + + datadir.download('', 'fname', skip_if_exists=True) diff --git a/tests/test_metadata.py b/tests/test_metadata.py index e074340..4222f36 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -1,3 +1,5 @@ +from clldutils.jsonlib import update + from cldfbench.metadata import * @@ -5,6 +7,8 @@ def test_Metadata_read_write(tmp_path): fname = tmp_path / 'md.json' md = Metadata() md.write(fname) + with update(fname) as d: + d['key'] = 'value' assert Metadata.from_file(fname) == md diff --git a/tests/test_scaffold.py b/tests/test_scaffold.py index c6bbee8..31e71fc 100644 --- a/tests/test_scaffold.py +++ b/tests/test_scaffold.py @@ -1,13 +1,13 @@ -import attr +import dataclasses from cldfbench.scaffold import Template, Metadata def test_custom_template(tmp_path, mocker, fixtures_dir): - @attr.s + @dataclasses.dataclass class 
CustomMetadata(Metadata): - id = attr.ib(default='abc', metadata=dict(elicit=True)) - custom_var = attr.ib(default='xyz', metadata=dict(elicit=True)) + id: str = dataclasses.field(default='abc', metadata=dict(elicit=True)) + custom_var: str = dataclasses.field(default='xyz', metadata=dict(elicit=True)) class Custom(Template): package = 'pylexibank'