[0-9]+)$')
MEDIA = 'media'
ZENODO_FILE_NAME = 'zenodo.json'
COMMUNITIES = ['lexibank']
@@ -73,7 +78,7 @@
DESCRIPTION = "{title}{formats}{supplement_to} {descr} {online}"
-def register(parser):
+def register(parser): # pylint: disable=C0116
add_dataset_spec(parser, multiple=True)
parser.add_argument(
'-m', '--mimetype',
@@ -104,21 +109,6 @@ def register(parser):
help='DOI to which this release refers (format 10.5281/zenodo.1234567). It is required '
'for --create-release.',
)
- parser.add_argument(
- '--create-release',
- help='Switch to create ID_{0} directory containing {0}.zip, README.md and {1} for '
- 'releasing on zenodo. Cannot be used with --update-zenodo.'.format(
- MEDIA, ZENODO_FILE_NAME), # noqa: E122
- action='store_true',
- default=False,
- )
- parser.add_argument(
- '--update-zenodo',
- help="Deposit ID (number after DOI's last slash) to update metadata by using ID_{0}/{1}. "
- "Cannot be used with --create-release.".format(
- MEDIA, ZENODO_FILE_NAME), # noqa: E122
- default=None,
- )
parser.add_argument(
'--debug',
help='Switch to work with max. 500 media files and with sandbox.zenodo for testing ONLY',
@@ -127,10 +117,7 @@ def register(parser):
)
-def _create_download_thread(url, target):
- global download_threads
- download_threads = []
-
+def _create_download_thread(url, target, download_threads):
def _download(url, target):
assert not target.exists()
urlretrieve(url, str(target))
@@ -143,212 +130,257 @@ def _download(url, target):
download_threads.append(download_thread)
-def run(args):
+@dataclasses.dataclass(frozen=True)
+class Row:
+    """
+    A row in a media table with info about the location of the associated file.
+
+    `data` holds the complete table row (including its resolved `URL`); `url` is set for
+    files that must be downloaded and `local_path` for files that can be copied.
+    """
+    id: str
+    mimetype: str
+    data: dict[str, Any]
+    url: Optional[str] = None
+    local_path: Optional[pathlib.Path] = None
+
+    @property
+    def ext(self) -> str:
+        """
+        Lowercased filename extension gleaned from the path component of the URL.
+
+        Only the last path segment is inspected, so a dot in a directory name or in the query
+        string is never mistaken for an extension. Returns '' if the name contains no dot.
+        """
+        name = pathlib.PurePosixPath(urlparse(self.data['URL']).path).name
+        return name.rpartition('.')[2].lower() if '.' in name else ''
+
+    def download(self, target: pathlib.Path, download_threads: list):
+        """
+        Retrieve the associated media file either by copy or by download.
+
+        Local files are copied to `target` right away; for remote files the retrieval is
+        delegated to `_create_download_thread`, which registers its thread in
+        `download_threads`.
+        """
+        if self.local_path:
+            shutil.copy(self.local_path, target)
+        else:
+            _create_download_thread(self.url, target, download_threads)
+
+
+@dataclasses.dataclass
+class MediaTableSpec:
+    """A media table bundled with the names of the columns needed to read it."""
+    table: csvw.Table
+    id_col: str
+    media_type_col: str
+    _ds: CLDFDataset
+
+    @classmethod
+    def from_dataset(cls, ds_cldf) -> 'MediaTableSpec':
+        """
+        Locate the media table of a dataset: either the MediaTable component or, failing
+        that, a table with url media.csv.
+
+        Raises `ValueError` if the dataset has neither.
+        """
+        fallback = ds_cldf.get('media.csv', None)
+        table = ds_cldf.get('MediaTable', fallback)
+        if table is None:
+            raise ValueError()  # pragma: no cover
+
+        # Start from default column names and replace each one for which the dataset
+        # maps the corresponding property to a column.
+        names = {'Media_Type': 'mimetype', 'id': 'ID'}
+        for prop, default in list(names.items()):
+            col = ds_cldf.get(('MediaTable', prop))
+            names[prop] = col.name if col else default
+        return cls(table, names['id'], names['Media_Type'], _ds=ds_cldf)
+
+    def __iter__(self) -> Generator[Row, None, None]:
+        """
+        Yield a `Row` per table row, with the resolved location stored under `URL`.
+
+        Locations not starting with 'http' are taken relative to the dataset directory;
+        rows pointing to a non-existing local file are skipped.
+        """
+        for row in self.table:
+            location = anyURI.to_string(self._ds.get_row_url(self.table, row))
+            row['URL'] = location
+            if location.startswith('http'):
+                remote, local = location, None
+            else:
+                remote, local = None, self._ds.directory / location
+                if not local.exists():
+                    continue
+            yield Row(row[self.id_col], row[self.media_type_col], row, remote, local)
+
+
+def _valid_input(args) -> bool:
+    """
+    Check the DOI related arguments, logging an error for the first failed check.
+
+    A passed parent DOI must match `ZENODO_DOI_PATTERN`, and unless only a listing was
+    requested (`args.list`), a parent DOI is mandatory.
+    """
+    doi = args.parent_doi
+    if doi and not ZENODO_DOI_PATTERN.match(doi):
+        args.log.error('Invalid passed DOI')
+        return False
+    if not (args.list or doi):
+        args.log.error('The corresponding DOI is required (via --parent-doi).')
+        return False
+    return True
+
+
+@dataclasses.dataclass(frozen=True)
+class File:
+    """Metadata about a media file: its path plus, if known, mimetype and size."""
+    path: pathlib.Path
+    mimetype: Optional[str] = None
+    size: Optional[int] = None
+
+    @functools.cached_property
+    def ext(self) -> str:
+        """Filename extension, aka suffix without the dot."""
+        return self.path.suffix.replace('.', '')
+
+    @property
+    def key(self) -> str:
+        """
+        Filetype formatted as human-readable string.
+
+        Always a string, as annotated: files without mimetype are labelled 'unknown', so
+        that the key can be used for grouping and formatting without a `None` check.
+        """
+        return f"{self.mimetype or 'unknown'} ({self.ext})"
+
+
+@dataclasses.dataclass
+class MediaDir:
+    """
+    A container for media file metadata.
+
+    The directory at `path` is created on instantiation if it does not exist yet.
+    """
+    path: pathlib.Path
+    files: list[File] = dataclasses.field(default_factory=list)
+    rows: list[dict[str, Any]] = dataclasses.field(default_factory=list)
+
+    def __post_init__(self):
+        self.path.mkdir(exist_ok=True)
+
+    @property
+    def index(self) -> pathlib.Path:
+        """The location of the file index."""
+        return self.path / INDEX_CSV
+
+    def write_index(self):
+        """Write the file metadata to a csv file, using the keys of the first row as header."""
+        with UnicodeWriter(self.index) as w:
+            for i, row in enumerate(self.rows):
+                if i == 0:
+                    w.writerow(row.keys())
+                w.writerow(row.values())
+
+    def add(self, row) -> pathlib.Path:
+        """
+        Add a file and return its target path in media_dir.
+
+        Files are grouped in sub-directories named by the first two characters of the row ID.
+        The row's data is extended with a `local_path` relative to the media directory.
+        """
+        size = row.data.get('size')
+        d = self.path / row.id[:2]
+        f = File(d / '.'.join([row.id, row.ext]), row.mimetype, int(size) if size else None)
+        row.data['local_path'] = pathlib.Path(d.name) / f.path.name
+        self.rows.append(row.data)
+        self.files.append(f)
+        return f.path
+
+    @property
+    def extensions(self) -> set[str]:
+        """
+        The set of filename extensions used for the media files in the dataset.
+
+        Computed on each access, so files added after a first access are reflected, too.
+        """
+        return {f.ext for f in self.files}
+
+    def print_stats(self):
+        """Print number of files and accumulated size per file type, biggest type first."""
+        size_by_mimetype = collections.Counter()
+        count_by_mimetype = collections.Counter()
+        for f in self.files:
+            # A file without mimetype may have no key; use a printable label instead of None.
+            key = f.key or 'unknown'
+            size_by_mimetype[key] += f.size or 0
+            count_by_mimetype[key] += 1
+
+        for k, v in size_by_mimetype.most_common():
+            print('\t'.join([k.ljust(20), str(count_by_mimetype[k]), format_size(v)]))
+
+
+def run(args): # pylint: disable=C0116
ds = get_dataset(args)
ds_cldf = ds.cldf_reader()
- release_dir = args.out / '{0}_{1}'.format(ds.id, MEDIA)
+ download_threads = []
- media_table = ds_cldf.get('MediaTable', ds_cldf.get('media.csv', None))
+ if not _valid_input(args):
+ raise ParserError
- if media_table is None: # pragma: no cover
+ try:
+ media_table = MediaTableSpec.from_dataset(ds_cldf)
+ except ValueError as e: # pragma: no cover
args.log.error('Dataset has no MediaTable or media.csv')
- raise ParserError
- if args.parent_doi and not Zenodo.DOI_PATTERN.match(args.parent_doi):
- args.log.error('Invalid passed DOI')
- raise ParserError
- if args.update_zenodo: # pragma: no cover
- if not release_dir.exists():
- args.log.error('"{0}" not found -- run --create-release first?'.format(
- release_dir))
- raise ParserError
- if not (release_dir / ZENODO_FILE_NAME).exists():
- args.log.error('"{0}" not found -- run --create-release first?'.format(
- release_dir / ZENODO_FILE_NAME))
- raise ParserError
- if args.create_release:
- args.log.error('You cannot create the release and update zenodo at the same time.')
- raise ParserError
- if args.create_release:
- if not args.parent_doi:
- args.log.error('The corresponding DOI is required (via --parent-doi).')
- raise ParserError
-
- mime_types = None
- if args.mimetype:
- mime_types = [m.strip() for m in nfilter(args.mimetype.split(','))]
-
- size = collections.Counter()
- number = collections.Counter()
- media_dir = args.out / MEDIA
- media = []
- used_file_extensions = set()
-
- if not args.update_zenodo:
- media_dir.mkdir(exist_ok=True)
- with UnicodeWriter(media_dir / INDEX_CSV if not args.list else None) as w:
- for i, row in enumerate(tqdm.tqdm(
- [r for r in media_table], desc='Getting {0} items'.format(MEDIA))):
- row['URL'] = url = anyURI.to_string(ds_cldf.get_row_url(media_table, row))
- #
- # FIXME: Don't assume URLs without query!
- #
- f_ext = url.split('.')[-1].lower()
- if args.debug and i > 500:
- break # pragma: no cover
- if (mime_types is None) or f_ext in mime_types\
- or any(row['mimetype'].startswith(x) for x in mime_types):
- if args.list:
- m = '{0} ({1})'.format(row['mimetype'], f_ext)
- size[m] += int(row['size'])
- number.update([m])
- else:
- used_file_extensions.add(f_ext.lower())
- d = media_dir / row['ID'][:2]
- d.mkdir(exist_ok=True)
- fn = '.'.join([row['ID'], f_ext])
- target = d / fn
- row['local_path'] = pathlib.Path(row['ID'][:2]) / fn
- if i == 0:
- w.writerow(row)
- w.writerow(row.values())
- media.append(target)
- if (not target.exists()) or md5(target) != row['ID']:
- _create_download_thread(url, target)
+ raise ParserError from e
+
+ mime_types = [m.strip() for m in nfilter(args.mimetype.split(','))] if args.mimetype else []
+ media_dir = MediaDir(args.out / MEDIA)
+
+ for i, row in enumerate(tqdm.tqdm(media_table, desc='Getting media items')):
+ if args.debug and i > 500:
+ break # pragma: no cover
+
+ if any((not mime_types,
+ row.ext in mime_types,
+ any(row.mimetype.startswith(x) for x in mime_types))):
+ target = media_dir.add(row)
+ if not args.list:
+ # We do not only list stats about the media files, but retrieve them.
+ target.parent.mkdir(exist_ok=True)
+ if (not target.exists()) or md5(target) != row.id:
+ row.download(target, download_threads)
if args.list:
- for k, v in size.most_common():
- print('\t'.join([k.ljust(20), str(number[k]), format_size(v)]))
+ media_dir.print_stats()
return
# Waiting for the download threads to finish
- if 'download_threads' in globals():
- for t in download_threads:
- t.join()
-
- if args.create_release:
- assert media_dir.exists(), 'No folder "{0}" found in {1}'.format(MEDIA, media_dir.resolve())
- release_dir.mkdir(exist_ok=True)
- media.append(media_dir / INDEX_CSV)
-
- try:
- zipf = zipfile.ZipFile(
- str(release_dir / '{0}.zip'.format(MEDIA)), 'w', zipfile.ZIP_DEFLATED)
- fp = args.out
- for f in tqdm.tqdm(media, desc='Creating {0}.zip'.format(MEDIA)):
- zipf.write(str(f), str(os.path.relpath(str(f), str(fp))))
- zipf.close()
- except Exception as e: # pragma: no cover
- args.log.error(e)
- raise
-
- def _contrib(d):
- return {k: v for k, v in d.items() if k in {'name', 'affiliation', 'orcid', 'type'}}
-
- version_v = git_describe('.').split('-')[0]
- version = version_v.replace('v', '')
- git_url = [r for r in ds.repo.repo.remotes if r.name == 'origin'][0].url.replace('.git', '')
- with jsonlib.update(
- release_dir / ZENODO_FILE_NAME, indent=4, default=collections.OrderedDict()) as md:
- contribs = ds.dir / 'CONTRIBUTORS.md'
- creators, contributors = get_creators_and_contributors(
- contribs.read_text(encoding='utf8') if contribs.exists() else '', strict=False)
- if creators:
- md['creators'] = [_contrib(p) for p in creators]
- if contributors:
- md['contributors'] = [_contrib(p) for p in contributors]
- communities = [r["identifier"] for r in md.get("communities", [])] + \
- [c.strip() for c in nfilter(args.communities.split(','))] + \
- COMMUNITIES
- if communities and not args.debug:
- md['communities'] = [
- {"identifier": community_id} for community_id in sorted(set(communities))]
- md.update(
- {
- 'title': '{0} {1} Files'.format(ds.metadata.title, MEDIA.title()),
- 'access_right': 'open',
- 'keywords': sorted(set(md.get('keywords', []) + ['linguistics'])),
- 'upload_type': 'dataset',
- 'publication_date': datetime.today().strftime('%Y-%m-%d'),
- 'version': version,
- 'related_identifiers': [
- {
- 'scheme': 'url',
- 'identifier': '{0}/tree/{1}'.format(git_url, version_v),
- 'relation': 'isSupplementTo'
- },
- ],
- }
- )
- if args.parent_doi:
- md['related_identifiers'].append({
- 'scheme': 'doi', 'identifier': args.parent_doi, 'relation': 'isPartOf'})
- supplement_to = " - Supplement to dataset " \
- "{1} ".format(
- args.parent_doi, ds.metadata.title) # noqa: E122
- if ds.metadata.url:
- md['related_identifiers'].append({
- 'scheme': 'url',
- 'identifier': ds.metadata.url,
- 'relation': 'isAlternateIdentifier'})
-
- formats = ', '.join(sorted(used_file_extensions))
- descr = '
' + ds.metadata.description if ds.metadata.description else ''
- online_url, online = '', ''
- if ds.metadata.url:
- online_url = ds.metadata.url
- online = "
Available online at: {0}".format(online_url)
- md['description'] = html.escape(DESCRIPTION.format(
- url=online_url,
- formats=' ({0})'.format(formats) if formats else '',
- title=md['title'],
- supplement_to=supplement_to,
- descr=descr,
- online=online))
-
- license_md = ''
- if ds.metadata.zenodo_license:
- md['license'] = {'id': ds.metadata.zenodo_license}
- license_md = LICENCE.format(ds.metadata.zenodo_license)
-
- DataDir(release_dir).write('README.md', README.format(
- title=md['title'],
- doi='https://doi.org/{0}'.format(args.parent_doi),
- ds_title=ds.metadata.title,
- license=license_md,
- formats=' ({0})'.format(formats) if formats else '',
- media=MEDIA,
- index=INDEX_CSV))
-
- if args.update_zenodo: # pragma: no cover
- md = jsonlib.load(release_dir / ZENODO_FILE_NAME)
-
- if args.debug:
- api_url = API_URL_SANDBOX
- access_token = os.environ.get('ZENODO_SANDBOX_ACCESS_TOKEN')
- else:
- api_url = API_URL
- access_token = ACCESS_TOKEN
- zenodo_url = api_url.replace('api/', '')
-
- args.log.info('Updating Deposit ID {0} on {1} with:'.format(args.update_zenodo, zenodo_url))
- api = Zenodo(api_url=api_url, access_token=access_token)
- try:
- rec = api.record_from_id('{0}record/{1}'.format(zenodo_url, args.update_zenodo))
- except Exception as e:
- args.log.error('Check connection and credentials for accessing Zenodo.\n{0}'.format(e))
- return
- latest_version = rec.links['latest'].split('/')[-1]
- if latest_version != args.update_zenodo:
- args.log.warn('Passed deposit ID does not refer to latest version {0}!'.format(
- latest_version))
- args.log.info(' DOI: ' + rec.metadata.doi)
- args.log.info(' Title: ' + rec.metadata.title)
- args.log.info(' Version: ' + rec.metadata.version)
- args.log.info(' Date: ' + rec.metadata.publication_date)
- args.log.info(' Files: ' + ', '.join([f.key for f in rec.files]))
- p = input("Proceed? [y/N]: ")
- if p.lower() == 'y':
- dep = api.update_deposit(args.update_zenodo, **md)
- if dep.state != PUBLISHED:
- api.publish_deposit(dep)
- args.log.info('Updated successfully')
+ for t in download_threads:
+ t.join()
+
+ media_dir.write_index()
+ release_dir = args.out / f'{ds.id}_{MEDIA}'
+ release_dir.mkdir(exist_ok=True)
+ _zip_media(release_dir, [media_dir.index] + [f.path for f in media_dir.files], args)
+ _release_metadata(release_dir, ds, args, media_dir.extensions)
+
+
+def _zip_media(release_dir, media, args):
+    """
+    Bundle the given files in a deflated zip archive within `release_dir`.
+
+    Archive member names are the file paths relative to `args.out`. Errors are logged via
+    `args.log` and re-raised.
+    """
+    try:
+        archive = release_dir / f'{MEDIA}.zip'
+        with zipfile.ZipFile(archive, 'w', zipfile.ZIP_DEFLATED) as zf:
+            for path in tqdm.tqdm(media, desc=f'Creating {MEDIA}.zip'):
+                arcname = os.path.relpath(str(path), str(args.out))
+                zf.write(path, str(arcname))
+    except Exception as e:  # pragma: no cover
+        args.log.error(e)
+        raise
+
+
+def _release_metadata(release_dir, ds, args, used_file_extensions):
+    """
+    Write the Zenodo metadata file and the README.md for a media release to `release_dir`.
+
+    Existing metadata in the Zenodo file is updated in place; communities and keywords
+    found there are kept and merged with the ones computed here.
+    """
+    version_v = git_describe('.').split('-')[0]
+    origin = [r for r in ds.repo.repo.remotes if r.name == 'origin'][0]
+    # Only strip a trailing ".git"; the substring may legitimately occur elsewhere in the URL.
+    git_url = origin.url.removesuffix('.git')
+    with (jsonlib.update(
+            release_dir / ZENODO_FILE_NAME, indent=4, default=collections.OrderedDict()) as md):
+        set_creators_and_contributors(ds, md)
+        communities = list(itertools.chain(
+            [r["identifier"] for r in md.get("communities", [])],
+            [c.strip() for c in nfilter(args.communities.split(','))],
+            COMMUNITIES))
+        # In debug mode no communities are set.
+        if communities and not args.debug:
+            md['communities'] = [
+                {"identifier": community_id} for community_id in sorted(set(communities))]
+        md.update(
+            {
+                'title': f'{ds.metadata.title} {MEDIA.title()} Files',
+                'access_right': 'open',
+                'keywords': sorted(set(md.get('keywords', []) + ['linguistics'])),
+                'upload_type': 'dataset',
+                'publication_date': datetime.today().strftime('%Y-%m-%d'),
+                'version': version_v.replace('v', ''),
+                'related_identifiers': [],
+            }
+        )
+        _add_rel_id(md, 'url', f'{git_url}/tree/{version_v}', 'isSupplementTo')
+
+        supplement_to = ''
+        if args.parent_doi:
+            _add_rel_id(md, 'doi', args.parent_doi, 'isPartOf')
+            supplement_to = f" - Supplement to dataset " \
+                            f"{ds.metadata.title} "
+        if ds.metadata.url:
+            _add_rel_id(md, 'url', ds.metadata.url, 'isAlternateIdentifier')
+
+        formats = ', '.join(sorted(used_file_extensions))
+        md['description'] = html.escape(DESCRIPTION.format(
+            url=ds.metadata.url or '',
+            # Must be an f-string, otherwise the literal text "({formats})" is emitted.
+            formats=f' ({formats})' if formats else '',
+            title=md['title'],
+            supplement_to=supplement_to,
+            descr='
' + ds.metadata.description if ds.metadata.description else '',
+            online=f"
Available online at: "
+                   f"{ds.metadata.url}" if ds.metadata.url else ''))
+
+        if ds.metadata.zenodo_license:
+            md['license'] = {'id': ds.metadata.zenodo_license}
+
+        DataDir(release_dir).write('README.md', README.format(
+            title=md['title'],
+            doi=f'https://doi.org/{args.parent_doi}',
+            ds_title=ds.metadata.title,
+            license=LICENCE.format(
+                ds.metadata.zenodo_license) if ds.metadata.zenodo_license else '',
+            formats=f' ({formats})' if formats else '',
+            media=MEDIA,
+            index=INDEX_CSV))
+
+
+def _add_rel_id(md, scheme, identifier, relation):
+    """Append a related identifier record to the `related_identifiers` list of `md`."""
+    entry = dict(scheme=scheme, identifier=identifier, relation=relation)
+    md['related_identifiers'].append(entry)
diff --git a/src/cldfbench/commands/new.py b/src/cldfbench/commands/new.py
index 71930b4..6bd37c8 100644
--- a/src/cldfbench/commands/new.py
+++ b/src/cldfbench/commands/new.py
@@ -4,19 +4,20 @@
import pathlib
import collections
-from cldfbench import scaffold
+from cldfbench.scaffold import iter_scaffolds
-_templates = None
+_templates = None # pylint: disable=invalid-name
def get_template_dict():
- global _templates
+ """Read available dataset templates."""
+ global _templates # pylint: disable=W0603
if _templates is None:
- _templates = collections.OrderedDict(scaffold.iter_scaffolds())
+ _templates = collections.OrderedDict(iter_scaffolds())
return _templates
-def register(parser):
+def register(parser): # pylint: disable=C0116
templates = list(get_template_dict().keys())
parser.add_argument(
'--template',
@@ -30,7 +31,7 @@ def register(parser):
default=pathlib.Path('.'))
-def run(args):
+def run(args): # pylint: disable=C0116
tmpl = get_template_dict()[args.template]()
md = tmpl.metadata.elicit()
tmpl.render(args.out, md)
diff --git a/src/cldfbench/commands/readme.py b/src/cldfbench/commands/readme.py
index 089ce45..925d73e 100644
--- a/src/cldfbench/commands/readme.py
+++ b/src/cldfbench/commands/readme.py
@@ -4,9 +4,9 @@
from cldfbench.cli_util import add_dataset_spec, with_datasets
-def register(parser):
+def register(parser): # pylint: disable=C0116
add_dataset_spec(parser, multiple=True)
-def run(args):
+def run(args): # pylint: disable=C0116
with_datasets(args, 'readme')
diff --git a/src/cldfbench/commands/run.py b/src/cldfbench/commands/run.py
index 45b1db7..d43306d 100644
--- a/src/cldfbench/commands/run.py
+++ b/src/cldfbench/commands/run.py
@@ -6,11 +6,11 @@
from cldfbench.cli_util import with_dataset, add_dataset_spec
-def register(parser):
+def register(parser): # pylint: disable=C0116
add_dataset_spec(parser)
parser.add_argument('cmd', help='command to run on the dataset')
parser.add_argument('args', nargs=argparse.REMAINDER)
-def run(args):
+def run(args): # pylint: disable=C0116
with_dataset(args, args.cmd)
diff --git a/src/cldfbench/commands/stub.py b/src/cldfbench/commands/stub.py
index 334eb23..904c14b 100644
--- a/src/cldfbench/commands/stub.py
+++ b/src/cldfbench/commands/stub.py
@@ -4,9 +4,9 @@
from cldfbench.cli_util import add_catalog_spec
-def register(parser):
+def register(parser): # pylint: disable=C0116
add_catalog_spec(parser, 'concepticon')
-def run(args): # pragma: no cover
- pass
+def run(args): # pragma: no cover # pylint: disable=C0116
+ print(args)
diff --git a/src/cldfbench/commands/zenodo.py b/src/cldfbench/commands/zenodo.py
index fb54082..9a7e96d 100644
--- a/src/cldfbench/commands/zenodo.py
+++ b/src/cldfbench/commands/zenodo.py
@@ -7,11 +7,10 @@
from clldutils.jsonlib import update
from clldutils.misc import nfilter
-from cldfbench.cli_util import add_dataset_spec, get_dataset
-from cldfbench.metadata import get_creators_and_contributors
+from cldfbench.cli_util import add_dataset_spec, get_dataset, set_creators_and_contributors
-def register(parser):
+def register(parser): # pylint: disable=C0116
add_dataset_spec(parser, multiple=True)
parser.add_argument(
'--communities',
@@ -20,17 +19,11 @@ def register(parser):
)
-def run(args):
+def run(args): # pylint: disable=C0116
dataset = get_dataset(args)
with update(dataset.dir / '.zenodo.json', indent=4, default=collections.OrderedDict()) as md:
modules = ['cldf:' + spec.module for spec in dataset.cldf_specs_dict.values()]
- contribs = dataset.dir / 'CONTRIBUTORS.md'
- creators, contributors = get_creators_and_contributors(
- contribs.read_text(encoding='utf8') if contribs.exists() else '', strict=False)
- if creators:
- md['creators'] = [contrib(p) for p in creators]
- if contributors:
- md["contributors"] = [contrib(p) for p in contributors]
+ set_creators_and_contributors(dataset, md)
communities = [r["identifier"] for r in md.get("communities", [])] + \
[c.strip() for c in nfilter(args.communities.split(','))]
if communities:
@@ -45,14 +38,8 @@ def run(args):
}
)
if dataset.metadata.citation:
- md['description'] = "Cite the source of the dataset as:
\n\n" \
- "\n{}
\n
".format(
- html.escape(dataset.metadata.citation))
+ md['description'] = \
+ f"Cite the source of the dataset as:
\n\n" \
+ f"\n{html.escape(dataset.metadata.citation)}
\n
"
if dataset.metadata.zenodo_license:
md['license'] = {'id': dataset.metadata.zenodo_license}
-
-
-def contrib(d):
- return {
- k: v for k, v in d.items()
- if k in {'name', 'affiliation', 'orcid', 'type'} and (v or k != 'orcid')}
diff --git a/src/cldfbench/datadir.py b/src/cldfbench/datadir.py
index ec79926..7caf31d 100644
--- a/src/cldfbench/datadir.py
+++ b/src/cldfbench/datadir.py
@@ -1,16 +1,20 @@
+"""
+Functionality to access structured data in the file system.
+"""
import gzip
import shutil
-import typing
+import logging
+from typing import Optional, Union, Literal
import pathlib
import zipfile
+import functools
import itertools
import contextlib
from xml.etree import ElementTree as et
import collections
+from collections.abc import Iterable
import unicodedata
-
-import requests
-import termcolor
+import urllib.request
try:
from odf.opendocument import load as load_odf
@@ -33,20 +37,24 @@
from clldutils import jsonlib
from pycldf.sources import Source
+from .util import colored
-__all__ = ['get_url', 'DataDir']
+__all__ = ['DataDir', 'urlopen']
+HTTP_REQUEST_TIMEOUT = 10
ODF_NS_TABLE = 'urn:oasis:names:tc:opendocument:xmlns:table:1.0'
ODF_NS_TEXT = 'urn:oasis:names:tc:opendocument:xmlns:text:1.0'
+PathType = Union[str, pathlib.Path]
+PathDictType = dict[str, pathlib.Path]
+LogType = Optional[logging.Logger]
def _real_len(seq, pred=bool):
for index in range(len(seq) - 1, -1, -1):
if pred(seq[index]):
return index + 1
- else:
- return 0
+ return 0
def _ods_value(cell):
@@ -77,8 +85,7 @@ def _ods_cells(row):
def _pad_list(li, length):
if len(li) >= length:
return li
- else:
- return [e for e in itertools.chain(li, itertools.repeat('', length - len(li)))]
+ return list(itertools.chain(li, itertools.repeat('', length - len(li))))
def _ods_to_list(table):
@@ -102,20 +109,28 @@ def _ods_to_list(table):
for cloned_row in itertools.repeat(row, number)]
-def get_url(url: str, log=None, **kw) -> requests.Response:
- res = requests.get(url, **kw)
- if log:
- level = log.info if res.status_code == 200 else log.warning
- level('HTTP {0} for {1}'.format(
- termcolor.colored(res.status_code, 'blue'), termcolor.colored(url, 'blue')))
- return res
+@contextlib.contextmanager
+def urlopen(url, timeout=HTTP_REQUEST_TIMEOUT):
+    """
+    Open URLs
+    - without raising an exception on HTTP errors,
+    - passing a specific User-Agent header,
+    - specifying a timeout.
+
+    Yields the response object; it is closed when the `with` block is left, also if the
+    block raises.
+    """
+    class NonRaisingHTTPErrorProcessor(urllib.request.HTTPErrorProcessor):
+        """Don't raise exceptions on HTTP errors."""
+        # NOTE(review): returning every response unprocessed presumably also bypasses
+        # urllib's redirect handling, which is dispatched from HTTPErrorProcessor --
+        # confirm that 3xx responses do not need to be followed.
+        http_response = https_response = lambda self, req, res: res  # pylint: disable=C3001
+
+    opener = urllib.request.build_opener(NonRaisingHTTPErrorProcessor)
+    opener.addheaders = [('User-agent', 'cldfbench/2.0.0')]
+    # Use the response as context manager, so the connection is released deterministically.
+    with opener.open(urllib.request.Request(url), timeout=timeout) as response:
+        yield response
class DataDir(type(pathlib.Path())):
"""
A `pathlib.Path` augmented with functionality to read common data formats.
"""
- def _path(self, fname: typing.Union[str, pathlib.Path]) -> pathlib.Path:
+ def _path(self, fname: PathType) -> pathlib.Path:
"""
Interpret strings without "/" as names of files in `self`.
@@ -126,12 +141,14 @@ def _path(self, fname: typing.Union[str, pathlib.Path]) -> pathlib.Path:
return self / fname
return pathlib.Path(fname)
- def read(self,
- fname: typing.Union[str, pathlib.Path],
- aname: str = None,
- normalize: str = None,
- suffix: str = None,
- encoding: str = 'utf8') -> str:
+ def read( # pylint: disable=R0913,R0917
+ self,
+ fname: PathType,
+ aname: str = None,
+ normalize: Optional[Literal['NFC', 'NFKC', 'NFD', 'NFKD']] = None,
+ suffix: str = None,
+ encoding: str = 'utf8',
+ ) -> str:
"""
Read text data from a file.
@@ -144,8 +161,8 @@ def read(self,
p = self._path(fname)
suffix = suffix or p.suffix
if suffix == '.zip':
- zip = zipfile.ZipFile(str(p))
- text = zip.read(aname or zip.namelist()[0]).decode(encoding)
+ with zipfile.ZipFile(str(p)) as zipf:
+ text = zipf.read(aname or zipf.namelist()[0]).decode(encoding)
elif suffix == '.gz':
with gzip.open(p) as fp:
text = fp.read().decode(encoding)
@@ -156,7 +173,7 @@ def read(self,
text = unicodedata.normalize(normalize, text)
return text
- def write(self, fname: typing.Union[str, pathlib.Path], text: str, encoding='utf8'):
+ def write(self, fname: PathType, text: str, encoding='utf8'):
"""
Write text data to a file.
@@ -165,53 +182,53 @@ def write(self, fname: typing.Union[str, pathlib.Path], text: str, encoding='utf
self._path(fname).write_text(text, encoding=encoding)
return fname
- def read_csv(self,
- fname: typing.Union[str, pathlib.Path],
- normalize=None, **kw) -> typing.List[typing.Union[dict, list]]:
+ def read_csv(
+ self,
+ fname: PathType,
+ normalize: Optional[Literal['NFC', 'NFKC', 'NFD', 'NFKD']] = None,
+ **kw,
+ ) -> list[Union[dict[str, str], list[str]]]:
"""
Read CSV data from a file.
"""
+ reader = dsv.reader(self._path(fname), **kw)
+
if not normalize:
- return list(dsv.reader(self._path(fname), **kw))
- if kw.get('dicts'):
- return [collections.OrderedDict(
- [(k, unicodedata.normalize(normalize, v)) for k, v in row.items()]
- ) for row in dsv.reader(self._path(fname), **kw)]
- else:
- return [[unicodedata.normalize(normalize, k) for k in row]
- for row in dsv.reader(self._path(fname), **kw)]
+ return list(reader)
- def write_csv(self,
- fname: typing.Union[str, pathlib.Path],
- rows: typing.Iterable[typing.List[str]], **kw):
+ norm = functools.partial(unicodedata.normalize, normalize)
+
+ if not kw.get('dicts'):
+ return [[norm(k) for k in row] for row in reader]
+
+ return [collections.OrderedDict([(k, norm(v)) for k, v in row.items()]) for row in reader]
+
+ def write_csv(self, fname: PathType, rows: Iterable[list[str]], **kw):
"""
Write CSV data to a file.
"""
with dsv.UnicodeWriter(self._path(fname), **kw) as writer:
writer.writerows(rows)
- def read_xml(self, fname: typing.Union[str, pathlib.Path], wrap=True) -> et.Element:
+ def read_xml(self, fname: PathType, wrap=True) -> et.Element:
"""
Reads and parses XML from a file.
"""
xml = xmlchars(self.read(fname))
if wrap:
- xml = '{0}'.format(xml)
+ xml = f'{xml}'
return et.fromstring(xml.encode('utf8'))
- def read_json(self,
- fname: typing.Union[str, pathlib.Path],
- **kw) -> typing.Union[str, list, dict]:
+ def read_json(self, fname: PathType, **_) -> Union[str, list, dict]:
+ """Read a JSON file."""
return jsonlib.load(self._path(fname))
- def read_bib(self,
- fname: typing.Union[str, pathlib.Path] = 'sources.bib') -> typing.List[Source]:
+ def read_bib(self, fname: PathType = 'sources.bib') -> list[Source]:
+ """Read a BibTeX file."""
bib = simplepybtex.database.parse_string(self.read(fname), bib_format='bibtex')
return [Source.from_entry(k, e) for k, e in bib.entries.items()]
- def ods2csv(self,
- fname: typing.Union[str, pathlib.Path],
- outdir: typing.Optional[pathlib.Path] = None) -> typing.Dict[str, pathlib.Path]:
+ def ods2csv(self, fname: PathType, outdir: Optional[pathlib.Path] = None) -> PathDictType:
"""
Dump the data from an OpenDocument Spreadsheet (suffix .ODS) file to CSV.
@@ -234,17 +251,13 @@ def ods2csv(self,
res = {}
for table in tables:
table_name = table.attributes[ODF_NS_TABLE, 'name']
- csv_path = outdir / '{}.{}.csv'.format(
- fname.stem,
- slug(table_name, lowercase=False))
+ csv_path = outdir / f'{fname.stem}.{slug(table_name, lowercase=False)}.csv'
with dsv.UnicodeWriter(csv_path) as writer:
writer.writerows(_ods_to_list(table))
res[table_name] = csv_path
return res
- def xls2csv(self,
- fname: typing.Union[str, pathlib.Path],
- outdir: typing.Optional[pathlib.Path] = None) -> typing.Dict[str, pathlib.Path]:
+ def xls2csv(self, fname: PathType, outdir: Optional[pathlib.Path] = None) -> PathDictType:
"""
Dump the data from an Excel XLS file to CSV.
@@ -263,7 +276,7 @@ def xls2csv(self,
wb = xlrd.open_workbook(str(fname))
except xlrd.biffh.XLRDError as e:
if 'xlsx' in str(e):
- raise ValueError('To read xlsx files, call xlsx2csv!')
+ raise ValueError('To read xlsx files, call xlsx2csv!') from e
raise # pragma: no cover
for sname in wb.sheet_names():
sheet = wb.sheet_by_name(sname)
@@ -275,9 +288,7 @@ def xls2csv(self,
res[sname] = path
return res
- def xlsx2csv(self,
- fname: typing.Union[str, pathlib.Path],
- outdir: typing.Optional[pathlib.Path] = None) -> typing.Dict[str, pathlib.Path]:
+ def xlsx2csv(self, fname: PathType, outdir: Optional[pathlib.Path] = None) -> PathDictType:
"""
Dump the data from an Excel XLSX file to CSV.
@@ -297,8 +308,8 @@ def _excel_value(x):
# Since Excel does not have an integer type, integers are rendered as "n.0",
# which in turn confuses type detection of tools like csvkit. Thus, we normalize
# numbers of the form "n.0" to "n".
- return '{0}'.format(int(x)) # pragma: no cover
- return '{0}'.format(x).strip()
+ return f'{int(x)}' # pragma: no cover
+ return f'{x}'.strip()
fname = self._path(fname)
res = {}
@@ -314,10 +325,7 @@ def _excel_value(x):
return res
@contextlib.contextmanager
- def temp_download(self,
- url: str,
- fname: typing.Union[str, pathlib.Path],
- log=None) -> pathlib.Path:
+ def temp_download(self, url: str, fname: PathType, log: LogType = None) -> pathlib.Path:
"""
Context manager to use when downloaded data needs to be manipulated before storage \
(e.g. to anonymize it).
@@ -337,22 +345,27 @@ def temp_download(self,
if p and p.exists():
p.unlink()
- def download(self,
- url: str,
- fname: typing.Union[str, pathlib.Path],
- log=None,
- skip_if_exists=False):
+    def download(
+            self,
+            url: str,
+            fname: PathType,
+            log: LogType = None,
+            skip_if_exists: bool = False,
+    ) -> pathlib.Path:
"""
Download data from a URL to the directory.
"""
p = self._path(fname)
if p.exists() and skip_if_exists:
return p
- res = get_url(url, log=log, stream=True)
- with p.open('wb') as fp:
- for chunk in res.iter_content(chunk_size=1024):
- if chunk: # filter out keep-alive new chunks
- fp.write(chunk)
+
+        with urlopen(url) as fp:
+            if log:
+                # The HTTP status is reported as info for 200 and as warning otherwise.
+                blue = functools.partial(colored, 'blue')
+                level = log.info if fp.status == 200 else log.warning
+                level(f'HTTP {blue(fp.status)} for {blue(url)}')
+            # Stream the response to disk in chunks, so large files are never held in
+            # memory as a whole.
+            with p.open('wb') as out:
+                shutil.copyfileobj(fp, out)
+
return p
def download_and_unpack(self, url: str, *paths: str, **kw):
diff --git a/src/cldfbench/dataset.py b/src/cldfbench/dataset.py
index 541a040..5f91f8d 100644
--- a/src/cldfbench/dataset.py
+++ b/src/cldfbench/dataset.py
@@ -1,21 +1,20 @@
+"""
+A cldfbench Dataset provides scaffolding to automatically create one or more CLDF Datasets.
+"""
import sys
-import typing
+from typing import Union, Optional
import inspect
import pathlib
import logging
import argparse
+import functools
import importlib
import subprocess
-from datetime import datetime
-try: # pragma: no cover
- from datetime import UTC
- now = lambda: datetime.now(UTC) # noqa: E731
-except ImportError: # pragma: no cover
- now = lambda: datetime.utcnow() # noqa: E731
+from collections.abc import Generator
import pycldf
from clldutils.path import sys_path
-from clldutils.misc import lazyproperty, nfilter
+from clldutils.misc import nfilter
from cldfcatalog import Repository
from cldfbench.cldf import CLDFSpec, CLDFWriter
@@ -23,13 +22,17 @@
from cldfbench.metadata import Metadata
from cldfbench.ci import build_status_badge
from cldfbench.util import get_entrypoints
+from cldfbench._compat import utcnow
__all__ = ['iter_datasets', 'get_dataset', 'get_datasets', 'Dataset', 'ENTRY_POINT']
ENTRY_POINT = 'cldfbench.dataset'
NOOP = -1
+PathType = Union[str, pathlib.Path]
+SpecDictKeyType = Union[str, None]
+SpecDictType = dict[SpecDictKeyType, CLDFSpec]
-class Dataset(object):
+class Dataset:
"""
A cldfbench dataset ties together
@@ -64,9 +67,9 @@ def __init__(self):
self.metadata.id = self.id
def __str__(self):
- return '{0.__class__.__name__} "{0.id}" at {1}'.format(self, self.dir.resolve())
+ return f'{self.__class__.__name__} "{self.id}" at {self.dir.resolve()}'
- @lazyproperty
+ @functools.cached_property
def cldf_dir(self) -> DataDir:
"""
Directory where CLDF data generated from the Dataset will be stored (unless specified
@@ -74,21 +77,21 @@ def cldf_dir(self) -> DataDir:
"""
return self.dir / 'cldf'
- @lazyproperty
+ @functools.cached_property
def raw_dir(self) -> DataDir:
"""
Directory where cldfbench expects the raw or source data.
"""
return self.dir / 'raw'
- @lazyproperty
+ @functools.cached_property
def etc_dir(self) -> DataDir:
"""
Directory where cldfbench expects additional configuration or metadata.
"""
return self.dir / 'etc'
- def cldf_specs(self) -> typing.Union[CLDFSpec, typing.Dict[str, CLDFSpec]]:
+ def cldf_specs(self) -> Union[CLDFSpec, SpecDictType]:
"""
A `Dataset` must declare all CLDF datasets that are derived from it.
@@ -99,7 +102,7 @@ def cldf_specs(self) -> typing.Union[CLDFSpec, typing.Dict[str, CLDFSpec]]:
return CLDFSpec(dir=self.cldf_dir)
@property
- def cldf_specs_dict(self) -> typing.Dict[typing.Union[str, None], CLDFSpec]:
+ def cldf_specs_dict(self) -> SpecDictType:
"""
Turn :meth:`cldf_specs` into a `dict` for simpler lookup.
@@ -116,13 +119,14 @@ def update_submodules(self):
Convenience method to be used in a `Dataset`'s `cmd_download` to update raw data curated
as git submodules.
"""
- subprocess.check_call(
- 'git -C {} submodule update --remote'.format(self.dir.resolve()), shell=True)
-
- def cldf_writer(self,
- args: argparse.Namespace,
- cldf_spec: typing.Optional[typing.Union[str, CLDFSpec]] = None,
- clean: bool = True) -> CLDFWriter:
+ subprocess.check_call(f'git -C {self.dir.resolve()} submodule update --remote', shell=True)
+
+ def cldf_writer(
+ self,
+ args: argparse.Namespace,
+ cldf_spec: Union[CLDFSpec, SpecDictKeyType] = None,
+ clean: bool = True,
+ ) -> CLDFWriter:
"""
:param args: Namespace passed in when initializing the `CLDFWriter` instance.
:param cldf_spec: Key of the relevant `CLDFSpec` in `Dataset.cldf_specs`
@@ -137,8 +141,10 @@ def cldf_writer(self,
cldf_spec = self.cldf_specs_dict[cldf_spec]
return cldf_spec.get_writer(args=args, dataset=self, clean=clean)
- def cldf_reader(self,
- cldf_spec: typing.Union[str, CLDFSpec, None] = None) -> pycldf.Dataset:
+ def cldf_reader(
+ self,
+ cldf_spec: Union[CLDFSpec, SpecDictKeyType] = None,
+ ) -> pycldf.Dataset:
"""
:param cldf_spec: Key of the relevant `CLDFSpec` in `Dataset.cldf_specs`.
:return: a `pycldf.Dataset` instance, for read-access to the CLDF data.
@@ -147,31 +153,31 @@ def cldf_reader(self,
cldf_spec = self.cldf_specs_dict[cldf_spec]
return cldf_spec.get_dataset()
- @lazyproperty
- def repo(self) -> typing.Union[Repository, None]:
+ @functools.cached_property
+ def repo(self) -> Optional[Repository]:
"""
The git repository cloned to the dataset's directory (or `None`).
"""
try:
return Repository(self.dir)
except ValueError: # pragma: no cover
- return
+ return None
def _cmd_download(self, args):
self.raw_dir.mkdir(exist_ok=True)
self.cmd_download(args)
(self.raw_dir / 'README.md').write_text(
- 'Raw data downloaded {0}'.format(now().isoformat()), encoding='utf8')
+ f'Raw data downloaded {utcnow().isoformat()}', encoding='utf8')
def cmd_download(self, args: argparse.Namespace):
"""
Implementations of this methods should populate the dataset's `raw_dir` with the source
data.
"""
- args.log.warning('cmd_{0} not implemented for dataset {1}'.format('download', self.id))
+ args.log.warning('cmd_download not implemented for dataset %s', self.id)
return NOOP
- def _cmd_readme(self, args):
+ def _cmd_readme(self, args: argparse.Namespace):
if self.metadata:
badge = build_status_badge(self)
md = self.cmd_readme(args)
@@ -184,22 +190,21 @@ def _cmd_readme(self, args):
lines.extend(['', badge])
md = '\n'.join(lines)
+ rel_cldf_dir = self.cldf_dir.resolve().relative_to(self.dir.resolve())
section = [
'\n\n## CLDF Datasets\n',
- 'The following CLDF datasets are available in [{0}]({0}):\n'.format(
- self.cldf_dir.resolve().relative_to(self.dir.resolve())
- )
+ f'The following CLDF datasets are available in [{rel_cldf_dir}]({rel_cldf_dir}):\n'
]
for ds in self.cldf_specs_dict.values():
if ds.metadata_path.exists():
- p = ds.metadata_path.resolve().relative_to(self.dir.resolve())
- section.append(
- '- CLDF [{0}](https://github.com/cldf/cldf/tree/master/modules/{0}) '
- 'at [{1}]({1})'.format(ds.module, p))
+ rel_p = ds.metadata_path.resolve().relative_to(self.dir.resolve())
+ module_link = (f'[{ds.module}](https://github.com/cldf/cldf/tree/master'
+ f'/modules/{ds.module})')
+ section.append(f'- CLDF {module_link} at [{rel_p}]({rel_p})')
self.dir.joinpath('README.md').write_text(md + '\n'.join(section), encoding='utf8')
- def cmd_readme(self, args: argparse.Namespace) -> str:
+ def cmd_readme(self, _: argparse.Namespace) -> str:
"""
Implementations of this method should create the content for the dataset's README.md
and return it as markdown formatted string.
@@ -228,25 +233,25 @@ def cmd_makecldf(self, args: argparse.Namespace):
:param args: An `argparse.Namespace` including attributes: \
- `writer`: :class:`CLDFWriter` instance
"""
- args.log.warning('cmd_{0} not implemented for dataset {1}'.format('makecldf', self.id))
+ args.log.warning('cmd_makecldf not implemented for dataset %s', self.id)
return NOOP
-def iter_datasets(ep: str = ENTRY_POINT) -> typing.Generator[Dataset, None, None]:
+def iter_datasets(ep: str = ENTRY_POINT) -> Generator[Dataset, None, None]:
"""
Yields `Dataset` instances registered for the specified entry point.
:param ep: Name of the entry point.
"""
- for ep in get_entrypoints(ep):
+ for p in get_entrypoints(ep):
try:
- cls = ep.load()
+ cls = p.load()
yield cls() # yield an initialized `Dataset` object.
except ImportError as e: # pragma: no cover
- logging.getLogger('cldfbench').warning('Error importing {0}: {1}'.format(ep.name, e))
+ logging.getLogger('cldfbench').warning('Error importing %s: %s', p.name, e)
-def get_dataset(spec, ep=ENTRY_POINT) -> Dataset:
+def get_dataset(spec, ep: str = ENTRY_POINT) -> Optional[Dataset]:
"""
Get an initialised `Dataset` instance.
@@ -264,9 +269,10 @@ def get_dataset(spec, ep=ENTRY_POINT) -> Dataset:
ds = dataset_from_module(spec)
if ds:
return ds
+ return None
-def get_datasets(spec, ep=ENTRY_POINT, glob: bool = False) -> typing.List[Dataset]:
+def get_datasets(spec, ep: str = ENTRY_POINT, glob: bool = False) -> list[Dataset]:
"""
:param spec: Either `'*'` to get all datasets for a specific entry point, or glob pattern \
matching dataset modules in the current directory (if `glob == True`), or a `str` as accepted \
@@ -279,7 +285,7 @@ def get_datasets(spec, ep=ENTRY_POINT, glob: bool = False) -> typing.List[Datase
return nfilter([get_dataset(spec, ep=ep)])
-def dataset_from_module(path) -> typing.Union[Dataset, None]:
+def dataset_from_module(path: PathType) -> Optional[Dataset]:
"""
load the first `Dataset` subclass found in the module which does not have any subclasses.
"""
@@ -294,3 +300,4 @@ def dataset_from_module(path) -> typing.Union[Dataset, None]:
for _, obj in inspect.getmembers(mod):
if inspect.isclass(obj) and issubclass(obj, Dataset) and not obj.__subclasses__():
return obj()
+ return None
diff --git a/src/cldfbench/metadata.py b/src/cldfbench/metadata.py
index 24a6679..d6b944e 100644
--- a/src/cldfbench/metadata.py
+++ b/src/cldfbench/metadata.py
@@ -2,11 +2,11 @@
Dataset metadata
"""
import json
-import collections
import pathlib
-import typing
+from typing import Optional
+import collections
+import dataclasses
-import attr
from clldutils import licenses
from clldutils.misc import nfilter
from clldutils.markup import iter_markdown_tables
@@ -342,8 +342,8 @@
}
-@attr.s
-class Metadata(object):
+@dataclasses.dataclass
+class Metadata:
"""
Dataset metadata is used as follows:
@@ -358,23 +358,23 @@ class Metadata(object):
- add more `attr.ib` s,
- register the subclass with the dataset by assigning it to `cldfbench.Dataset.metadata_cls`.
"""
- id = attr.ib(
+ id: Optional[str] = dataclasses.field(
default=None,
- metadata=dict(elicit=True, required=True))
- title = attr.ib(
+ metadata=dict(elicit=True, required=True)) # pylint: disable=R1735
+ title: Optional[str] = dataclasses.field(
default=None,
- metadata=dict(elicit=True, required=True))
- description = attr.ib(
+ metadata=dict(elicit=True, required=True)) # pylint: disable=R1735
+ description: Optional[str] = dataclasses.field(
default=None)
- license = attr.ib(
+ license: Optional[str] = dataclasses.field(
default=None,
- metadata=dict(elicit=True, required=True))
- url = attr.ib(
+ metadata=dict(elicit=True, required=True)) # pylint: disable=R1735
+ url: Optional[str] = dataclasses.field(
default=None,
- metadata=dict(elicit=True))
- citation = attr.ib(
+ metadata=dict(elicit=True)) # pylint: disable=R1735
+ citation: Optional[str] = dataclasses.field(
default=None,
- metadata=dict(elicit=True, required=True))
+ metadata=dict(elicit=True, required=True)) # pylint: disable=R1735
@classmethod
def elicit(cls) -> 'Metadata':
@@ -382,10 +382,10 @@ def elicit(cls) -> 'Metadata':
Factory method, called when creating a new dataset directory.
"""
kw = {}
- for field in attr.fields(cls):
+ for field in dataclasses.fields(cls):
if field.metadata.get('elicit', False):
- res = input('{0}: '.format(field.name))
- if (not res) and field.default is not attr.NOTHING:
+ res = input(f'{field.name}: ')
+ if (not res) and field.default is not dataclasses.MISSING:  # also falsy defaults
res = field.default
kw[field.name] = res
return cls(**kw)
@@ -397,25 +397,33 @@ def from_file(cls, fname: pathlib.Path) -> 'Metadata':
"""
with fname.open('r', encoding='utf-8') as fp:
try:
- return cls(**json.load(fp))
+ fields = {f.name for f in dataclasses.fields(cls)}
+ return cls(**{k: v for k, v in json.load(fp).items() if k in fields})
except json.decoder.JSONDecodeError as e: # pragma: no cover
- raise ValueError('Invalid JSON file: {}\n{}'.format(fname.resolve(), e))
+ raise ValueError(f'Invalid JSON file: {fname.resolve()}\n{e}') from e
def write(self, fname: pathlib.Path):
+ """Dump the metadata as JSON to disk."""
with fname.open('w', encoding='utf-8') as fp:
- return json.dump(attr.asdict(self), fp, indent=4)
+ return json.dump(dataclasses.asdict(self), fp, indent=4)
@property
- def known_license(self) -> typing.Union[None, licenses.License]:
+ def known_license(self) -> Optional[licenses.License]:
+ """
+ A known license - if one can be matched to self.license.
+ """
if self.license:
return licenses.find(self.license)
+ return None # pragma: no cover
@property
- def zenodo_license(self) -> str:
+ def zenodo_license(self) -> Optional[str]:
+ """A license ID suitable for inclusion in metadata for Zenodo."""
if self.known_license and self.known_license.id in LICENSES:
return self.known_license.id
+ return None # pragma: no cover
- def common_props(self) -> typing.Dict[str, object]:
+ def common_props(self) -> collections.OrderedDict[str, str]:
"""
The metadata as JSON-LD object suitable for inclusion in CLDF metadata.
"""
@@ -435,13 +443,14 @@ def common_props(self) -> typing.Dict[str, object]:
return res
def markdown(self) -> str:
+ """A human-readable version of the metadata formatted as Markdown."""
lines = [
- '# {0}\n'.format(self.title or 'Dataset {0}'.format(self.id)),
+ '# ' + (self.title or f'Dataset {self.id}') + '\n',
'## How to cite\n\nIf you use these data please cite',
]
if self.citation:
lines.append('- the original source')
- lines.extend([" > {}".format(line) for line in self.citation.split('\n')])
+ lines.extend([f" > {line}" for line in self.citation.split('\n')])
lines.extend([
"- the derived dataset using the DOI of the "
"[particular released version](../../releases/) you were using"
@@ -455,18 +464,25 @@ def markdown(self) -> str:
lines.append('\n## Description\n\n')
if self.description:
- lines.append('{0}\n'.format(self.description))
+ lines.append(f'{self.description}\n')
if self.license:
- lines.append('This dataset is licensed under a %s license\n' % self.license)
+ lines.append(f'This dataset is licensed under a {self.license} license\n')
if self.url:
- lines.append('Available online at %s\n' % self.url)
+ lines.append(f'Available online at {self.url}\n')
return '\n'.join(lines)
-def get_creators_and_contributors(text, strict=True) -> typing.Tuple[list, list]:
+TableRowsType = list[dict[str, str]]
+
+
+def get_creators_and_contributors(
+ text: str,
+ strict: bool = True,
+) -> tuple[TableRowsType, TableRowsType]:
+ """Read contributor info from a markdown formatted table."""
ctypes = {c.lower(): c for c in CONTRIBUTOR_TYPES}
creators, contributors = [], []
# Read first table in CONTRIBUTORS.md
diff --git a/src/cldfbench/scaffold.py b/src/cldfbench/scaffold.py
index 944dd88..c0c2660 100644
--- a/src/cldfbench/scaffold.py
+++ b/src/cldfbench/scaffold.py
@@ -13,35 +13,33 @@
import shutil
import pathlib
import warnings
+import dataclasses
+from collections.abc import Generator
-import attr
-
-import cldfbench
from cldfbench.metadata import Metadata
from cldfbench.util import get_entrypoints
__all__ = ['Template']
-def iter_scaffolds():
+def iter_scaffolds() -> Generator[tuple[str, type], None, None]:
+ """Yield registered cldfbench templates."""
yield 'cldfbench', Template
for ep in get_entrypoints('cldfbench.scaffold'):
try: # pragma: no cover
yield ep.name, ep.load()
- except Exception as e: # pragma: no cover
- warnings.warn(
- '{0} loading cldfbench.scaffold {1}: {2}'.format(
- e.__class__.__name__, ep.name, e))
+ except Exception as e: # pragma: no cover # pylint: disable=W0718
+ warnings.warn(f'{e.__class__.__name__} loading cldfbench.scaffold {ep.name}: {e}')
-class Template(object):
+class Template: # pylint: disable=R0903
"""A CLDF dataset suitable for curation in a GitHub repository"""
- prefix = cldfbench.__name__
- package = cldfbench.__name__
+ prefix = 'cldfbench'
+ package = 'cldfbench'
# To overwite individual template files, provide a secondary template directory which
# contains only the specialized template files.
- dirs = [pathlib.Path(cldfbench.__file__).parent / 'dataset_template']
+ dirs = [pathlib.Path(__file__).parent / 'dataset_template']
id_pattern = re.compile('[a-z_0-9]+$')
"""
@@ -54,20 +52,24 @@ class Template(object):
- assign the derived class to your template's `metadata` attribute.
E.g.
- >>> @attr.s
+ >>> @dataclasses.dataclass
... class CustomMetadata(Metadata):
- ... custom_var = attr.ib(default=None, metadata=dict(elicit=True))
+ ... custom_var: str = dataclasses.field(default=None, metadata=dict(elicit=True))
...
>>> class CustomTemplate(Template):
... metadata = CustomMetadata
"""
metadata = Metadata
- def render(self, outdir, metadata):
- # The cli will have used the class in `self.metadata` to elicit info from the user,
- # and pass `self.metadata(...)` as `metadata`
+ def render(self, outdir: pathlib.Path, metadata: Metadata):
+ """
+ .. note::
+
+ The cli will have used the class in `self.metadata` to elicit info from the user,
+ and pass `self.metadata(...)` as `metadata`
+ """
- ctx = attr.asdict(metadata)
+ ctx = dataclasses.asdict(metadata)
ctx.update(prefix=self.prefix, package=self.package)
if outdir.name != ctx['id']:
outdir = outdir / ctx['id']
diff --git a/src/cldfbench/util.py b/src/cldfbench/util.py
index 994eb48..03436a1 100644
--- a/src/cldfbench/util.py
+++ b/src/cldfbench/util.py
@@ -1,29 +1,52 @@
+"""
+Utilities.
+"""
import sys
import pathlib
+import platform
import subprocess
import importlib.metadata
-import platform
+from typing import Literal, Union
+from collections.abc import Iterable, Generator
+
+import termcolor
+
+from ._compat import entry_points_select
-def get_entrypoints(group):
- eps = importlib.metadata.entry_points()
- return eps.select(group=group) if hasattr(eps, 'select') else eps.get(group, [])
+def colored(color: Literal['red', 'blue'], text, **kw):
+ """Make termcolor.colored amenable to currying via functools.partial."""
+ return termcolor.colored(text, color, **kw)
-def iter_aligned(pairs, prefix=''):
+def get_entrypoints(group: str) -> Iterable[importlib.metadata.EntryPoint]:
+ """Get registered entry points for a group."""
+ return entry_points_select(importlib.metadata.entry_points(), group)
+
+
+def iter_aligned(
+ pairs: Iterable[Union[tuple[str, str], list[str]]],
+ prefix: str = '',
+ minspace: int = 1,
+) -> Generator[str, None, None]:
+ """
+ >>> print("\n".join(iter_aligned([('abc', '12'), ('x', '1234')], prefix='+')))
+ +abc 12
+ +x 1234
+ """
pairs = list(pairs) # make sure we can iterate twice over `pairs`
if pairs:
- maxlabel = max(len(p[0]) for p in pairs)
+ maxlabel = max(len(p[0]) for p in pairs) + minspace
for p in pairs:
- yield '{0}{1} {2}'.format(prefix, p[0].ljust(maxlabel), p[1] or '')
+ yield f"{prefix}{p[0].ljust(maxlabel)}{p[1] or ''}"
-def iter_requirements():
+def iter_requirements() -> Generator[str, None, None]:
"""
:return: generator of lines in pip's requirements.txt format, specifying packages which are \
imported in the current python process.
"""
- imported = set(m.split('.')[0].lower() for m in sys.modules.keys())
+ imported = set(m.split('.')[0].lower() for m in sys.modules)
pip = pathlib.Path(sys.executable).parent / 'pip'
if platform.system() == "Windows":
@@ -39,8 +62,8 @@ def iter_requirements():
try:
installed = subprocess.check_output([str(pip), 'freeze'])
- except subprocess.CalledProcessError: # pragma: no cover
- raise ValueError()
+ except subprocess.CalledProcessError as e: # pragma: no cover
+ raise ValueError() from e
for req in installed.decode('utf-8').split('\n'):
if '==' in req:
diff --git a/tests/fixtures/module_media_local.py b/tests/fixtures/module_media_local.py
new file mode 100644
index 0000000..cddf9cc
--- /dev/null
+++ b/tests/fixtures/module_media_local.py
@@ -0,0 +1,35 @@
+from cldfbench import Dataset, CLDFSpec
+
+
+class t_a:
+ name = 'origin'
+ url = 'https://github.com/lexibank/dataset.git'
+
+
+class t_b:
+ remotes = [t_a()]
+
+
+class t_c:
+ repo = t_b()
+ url = 'https://github.com/lexibank/dataset.git'
+
+ def json_ld(self):
+ pass # pragma: no cover
+
+
+class Thing(Dataset):
+ id = 'medialocal'
+ repo = t_c()
+
+ def cldf_specs(self): # pragma: no cover
+ return {None: Dataset.cldf_specs(self)}
+
+ def cmd_makecldf(self, args): # pragma: no cover
+ args.writer.cldf.add_component('MediaTable')
+ args.writer.objects['MediaTable'].append(
+ {'ID': '12345', 'Download_URL': 'Generic-metadata.json', 'Media_Type': 'application/json'}
+ )
+ args.writer.objects['MediaTable'].append(
+ {'ID': '12345', 'Download_URL': 'Generix-metadata.json', 'Media_Type': 'application/json'}
+ )
diff --git a/tests/test_catalogs.py b/tests/test_catalogs.py
index 91f7a25..212fee4 100644
--- a/tests/test_catalogs.py
+++ b/tests/test_catalogs.py
@@ -1,6 +1,9 @@
+import pytest
+
from cldfbench.catalogs import *
+@pytest.mark.with_catalog
def test_Glottolog(glottolog_dir):
cat = Glottolog(glottolog_dir)
assert cat.api.languoids(ids=['abcd1234'])
@@ -13,6 +16,7 @@ def test_Glottolog(glottolog_dir):
assert 'abcd1234' in cat.api.macroareas_by_glottocode
-def testConcepticon(concepticon_dir):
+@pytest.mark.with_catalog
+def test_Concepticon(concepticon_dir):
cat = Concepticon(concepticon_dir)
_ = cat.api.cached_glosses
diff --git a/tests/test_cldf.py b/tests/test_cldf.py
index fbb9939..f886f37 100644
--- a/tests/test_cldf.py
+++ b/tests/test_cldf.py
@@ -24,8 +24,6 @@ def test_cldf_spec(tmp_path):
def test_cldf(tmp_path):
- from cldfbench.cldf import WITH_ZIPPED
-
with pytest.raises(AttributeError):
_ = CLDFWriter().cldf
@@ -44,7 +42,7 @@ def test_cldf(tmp_path):
writer['ValueTable', 'value'].separator = '|'
writer.objects['ValueTable'].append(
dict(ID=1, Language_ID='l', Parameter_ID='p', Value=[1, 2]))
- assert (not WITH_ZIPPED) or tmp_path.joinpath('data.csv.zip').exists()
+ assert tmp_path.joinpath('data.csv.zip').exists()
ds = Dataset.from_metadata(tmp_path / 'StructureDataset-metadata.json')
values = list(ds['ValueTable'])
assert len(values) == 1
diff --git a/tests/test_cli.py b/tests/test_cli.py
index 1c4f2b4..770b607 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -29,10 +29,20 @@ def tmpds_media(fixtures_dir, tmp_path):
return tmp_path / 'module_media.py'
+@pytest.fixture
+def tmpds_media2(fixtures_dir, tmp_path):
+ for p in fixtures_dir.iterdir():
+ if p.is_file():
+ shutil.copy(p, tmp_path / p.name)
+ return tmp_path / 'module_media_local.py'
+
+
def _main(cmd, **kw):
+ kw.setdefault('log', logging.getLogger(__name__))
return cli.main(shlex.split('--no-config ' + cmd), **kw)
+@pytest.mark.with_catalog
def test_get_cldf_dataset(tmp_path, tmpds, glottolog_dir):
vals = tmp_path.joinpath('values.csv')
vals.write_text('ID,Language_ID,Parameter_ID,Value\n1,1,1,1', encoding='utf8')
@@ -48,6 +58,7 @@ def test_get_cldf_dataset(tmp_path, tmpds, glottolog_dir):
assert ds.module == 'StructureDataset'
+@pytest.mark.with_catalog
def test_cldfreadme(tmp_path, tmpds, glottolog_dir):
_main('makecldf ' + str(tmpds) + ' --with-zenodo --with-cldfreadme --glottolog ' +
str(glottolog_dir))
@@ -61,6 +72,7 @@ def test_help(capsys):
assert 'usage' in out
+@pytest.mark.with_catalog
def test_misc(tmp_path, mocker, glottolog_dir):
with pytest.raises(SystemExit):
_main('new --template=xyz')
@@ -100,6 +112,7 @@ def test_run(caplog, tmpds):
_main('run ' + str(tmpds) + ' raise')
+@pytest.mark.with_catalog
def test_readme(tmpds, tmp_path, glottolog_dir, mocker):
_main('readme ' + str(tmpds))
_main('makecldf ' + str(tmpds) + ' --glottolog ' + str(glottolog_dir))
@@ -139,6 +152,7 @@ def test_download(tmpds):
_main('download abc')
+@pytest.mark.with_catalog
def test_catinfo(capsys, glottolog_dir):
_main('catinfo --glottolog {0}'.format(glottolog_dir))
out, _ = capsys.readouterr()
@@ -172,12 +186,14 @@ def test_catalog_from_config(glottolog_dir, tmpds, mocker, tmp_path, fixtures_di
cli.main(['makecldf', str(tmpds)])
+@pytest.mark.with_catalog
def test_workflow(tmpds, glottolog_dir):
_main('makecldf ' + str(tmpds) + ' --glottolog ' + str(glottolog_dir))
assert _main('check ' + str(tmpds) + ' --with-validation', log=logging.getLogger(__name__)) == 1
_main('geojson ' + str(tmpds))
+@pytest.mark.with_catalog
def test_diff(tmpds, mocker, caplog, glottolog_dir, csvw3):
class Item:
def __init__(self, p):
@@ -223,6 +239,7 @@ def test_check(tmpds, tmp_path):
assert _main('check ' + str(tmpds), log=logging.getLogger(__name__)) == 0
+@pytest.mark.with_catalog
def test_media(tmpds_media, tmp_path, glottolog_dir, capsys, mocker):
releasedir = pathlib.Path('thing_{}'.format(MEDIA))
zipfile_name = pathlib.Path('{}.zip'.format(MEDIA))
@@ -248,15 +265,27 @@ def urlretrieve(*args):
assert 'application/pdf' not in capturedout
with pytest.raises(SystemExit):
- _main('media -m wav --create-release -p 10.5072/zenodo.710757 ' + str(tmpds_media))
- with pytest.raises(SystemExit):
- _main('media --create-release --update-zendo ' + str(tmpds_media))
+ _main('media -m wav -p 10.5072/zenodo.710757 ' + str(tmpds_media))
with pytest.raises(SystemExit):
- _main('media --create-release ' + str(tmpds_media))
+ _main('media ' + str(tmpds_media))
- _main('media -o ' + str(tmp_path) + ' -m wav --create-release -p 10.5281/zenodo.4350882 ' + str(tmpds_media))
+ _main('media -o ' + str(tmp_path) + ' -m wav -p 10.5281/zenodo.4350882 ' + str(tmpds_media))
assert (tmp_path / MEDIA / INDEX_CSV).exists()
assert (tmp_path / MEDIA / wav_name[:2] / wav_name).exists()
assert (tmp_path / releasedir / zipfile_name).exists()
assert (tmp_path / releasedir / 'README.md').exists()
assert (tmp_path / releasedir / ZENODO_FILE_NAME).exists()
+
+
+@pytest.mark.with_catalog
+def test_media2(tmpds_media2, tmp_path, glottolog_dir, capsys):
+ _main('makecldf ' + str(tmpds_media2) + ' --glottolog ' + str(glottolog_dir))
+
+ _main('media -l ' + str(tmpds_media2))
+ capturedout = capsys.readouterr().out
+ assert 'application/json' in capturedout
+
+ _main('media -o ' + str(tmp_path) + ' -p 10.5281/zenodo.4350882 ' + str(tmpds_media2))
+ assert (tmp_path / MEDIA / INDEX_CSV).exists()
+ assert 'local_path' in (tmp_path / MEDIA / INDEX_CSV).read_text(encoding='utf8')
+ assert (tmp_path / MEDIA / '12' / '12345.json').exists()
diff --git a/tests/test_datadir.py b/tests/test_datadir.py
index 8a31962..ce275ae 100644
--- a/tests/test_datadir.py
+++ b/tests/test_datadir.py
@@ -1,6 +1,9 @@
+import logging
import sys
import gzip
import shutil
+import contextlib
+import urllib.error
import pytest
@@ -15,9 +18,13 @@ def datadir(tmp_path, fixtures_dir):
return DataDir(tmp_path)
-def test_get_url(mocker):
- mocker.patch('cldfbench.datadir.requests', mocker.Mock(get=mocker.Mock()))
- get_url(None, log=mocker.Mock(warn=mocker.Mock()))
+@pytest.mark.with_internet
+def test_urlopen():
+ try:
+ with urlopen('https://httpbin.org/delay/2', timeout=0.01) as res:
+ assert res.status in (404, 201) # pragma: no cover
+ except urllib.error.URLError as e:
+ assert ('timed out' in str(e)) or ('failure in name resolution' in str(e))
def test_datadir(datadir):
@@ -81,14 +88,18 @@ def test_datadir_ods(datadir):
assert len(data3) == 4
-def test_datadir_download_and_unpack(datadir, mocker):
- mocker.patch(
- 'cldfbench.datadir.get_url',
- mocker.Mock(
- return_value=mocker.Mock(
- iter_content=mocker.Mock(
- return_value=[datadir.joinpath('test.zip').open('rb').read()]))))
- datadir.download_and_unpack(None)
+def test_datadir_download_and_unpack(datadir, mocker, caplog):
+ @contextlib.contextmanager
+ def mock_urlopen(*args, **kw):
+ yield mocker.Mock(status=201, read=lambda: datadir.joinpath('test.zip').open('rb').read())
+
+ mocker.patch('cldfbench.datadir.urlopen', mock_urlopen)
+ datadir.download_and_unpack('')
assert datadir.joinpath('setup.py').exists()
- datadir.download(None, 'fname')
- datadir.download(None, 'fname', skip_if_exists=True)
+ with caplog.at_level(logging.INFO):
+ datadir.download('x', 'fname', log=logging.getLogger(__name__))
+ assert len(caplog.records) == 1
+ assert 'x' in caplog.records[0].message
+ assert caplog.records[0].levelname == 'warning'.upper()
+
+ datadir.download('', 'fname', skip_if_exists=True)
diff --git a/tests/test_metadata.py b/tests/test_metadata.py
index e074340..4222f36 100644
--- a/tests/test_metadata.py
+++ b/tests/test_metadata.py
@@ -1,3 +1,5 @@
+from clldutils.jsonlib import update
+
from cldfbench.metadata import *
@@ -5,6 +7,8 @@ def test_Metadata_read_write(tmp_path):
fname = tmp_path / 'md.json'
md = Metadata()
md.write(fname)
+ with update(fname) as d:
+ d['key'] = 'value'
assert Metadata.from_file(fname) == md
diff --git a/tests/test_scaffold.py b/tests/test_scaffold.py
index c6bbee8..31e71fc 100644
--- a/tests/test_scaffold.py
+++ b/tests/test_scaffold.py
@@ -1,13 +1,13 @@
-import attr
+import dataclasses
from cldfbench.scaffold import Template, Metadata
def test_custom_template(tmp_path, mocker, fixtures_dir):
- @attr.s
+ @dataclasses.dataclass
class CustomMetadata(Metadata):
- id = attr.ib(default='abc', metadata=dict(elicit=True))
- custom_var = attr.ib(default='xyz', metadata=dict(elicit=True))
+ id: str = dataclasses.field(default='abc', metadata=dict(elicit=True))
+ custom_var: str = dataclasses.field(default='xyz', metadata=dict(elicit=True))
class Custom(Template):
package = 'pylexibank'