diff --git a/src/anthias_server/app/migrations/0007_repair_mojibake_asset_names.py b/src/anthias_server/app/migrations/0007_repair_mojibake_asset_names.py new file mode 100644 index 000000000..d316d0a78 --- /dev/null +++ b/src/anthias_server/app/migrations/0007_repair_mojibake_asset_names.py @@ -0,0 +1,66 @@ +"""One-time repair of double-encoded UTF-8 in existing ``Asset.name``. + +A misbehaving uploader that double-encodes a filename stores e.g. +``Formulários`` instead of ``Formulários`` (the UTF-8 bytes of ``á`` +read back as the Latin-1 chars ``Ã`` + ``¡``). Anthias never produces +this itself, but it stored whatever the request body carried, so +already-uploaded assets keep the garbled name in the UI and in the +viewer's ``Showing asset …`` log line. ``Asset.save`` now repairs new +writes; this migration fixes the rows that pre-date that guardrail. + +The repair logic is inlined rather than imported from +``anthias_server.app.models`` on purpose — migrations are frozen +snapshots of intent, and a future change to the model helper must not +retroactively alter what this one-time data fix did. + +Idempotent: a name is rewritten only when every character is in the +Latin-1 range *and* those bytes form a valid UTF-8 string that differs +from the input — a strong heuristic for double-encoded UTF-8, though not +a proof (a genuinely Latin-1 name whose bytes are also valid UTF-8, e.g. +``©`` → ``©``, is indistinguishable and gets rewritten too; such +collisions are vanishingly rare in real filenames). Correctly stored +names (``Formulários``, ``Café``, ``日本語``) raise on the encode or +decode step and are left untouched, so a re-run changes nothing. +""" + +from __future__ import annotations + +from django.db import migrations + + +def _repair_mojibake(text): # type: ignore[no-untyped-def] + if not text: + return text + try: + repaired = text.encode('latin-1').decode('utf-8') + except (UnicodeEncodeError, UnicodeDecodeError): + return text + return repaired if repaired != text else text + + +def _repair_names(apps, schema_editor): # type: ignore[no-untyped-def] + asset_model = apps.get_model('anthias_app', 'Asset') + # ``.only()`` + ``.iterator()`` streams rows in chunks instead of + # caching the whole table in memory; per-row ``save`` still works. + rows = ( + asset_model.objects.exclude(name__isnull=True) + .only('asset_id', 'name') + .iterator() + ) + for asset in rows: + repaired = _repair_mojibake(asset.name) + if repaired != asset.name: + asset.name = repaired + asset.save(update_fields=['name']) + + +class Migration(migrations.Migration): + dependencies = [ + ('anthias_app', '0006_asset_metadata'), + ] + + operations = [ + migrations.RunPython( + _repair_names, reverse_code=migrations.RunPython.noop + ), + ] diff --git a/src/anthias_server/app/models.py b/src/anthias_server/app/models.py index c5d215d98..e71978af3 100644 --- a/src/anthias_server/app/models.py +++ b/src/anthias_server/app/models.py @@ -39,6 +39,41 @@ def clamp_refresh_interval(value: Any) -> int: return max(0, min(interval, REFRESH_INTERVAL_S_MAX)) +def repair_mojibake(text: str | None) -> str | None: + """Undo the classic ``UTF-8 bytes decoded as Latin-1`` mojibake. + + A misbehaving uploader that double-encodes a filename turns + ``Formulários`` into ``Formulários`` (the UTF-8 bytes ``\\xc3\\xa1`` + of ``á`` read back as the two Latin-1 chars ``Ã`` + ``¡``). Anthias + itself never does this — Django's multipart parser and DRF both + decode as UTF-8 — but the corrupted text arrives already mangled in + the request body and we would otherwise store it verbatim, so the + operator sees garbled asset names in the UI and in the viewer's + ``Showing asset …`` log line. + + The repair is deliberately conservative and deterministic: it only + fires when *every* character is in the Latin-1 range (so + ``encode('latin-1')`` round-trips) **and** those bytes form a valid + UTF-8 string, which is then returned only if it actually differs from + the input. That is a strong heuristic for double-encoded UTF-8, but + not a proof: a name that is *genuinely* Latin-1 yet whose bytes also + happen to be valid UTF-8 (e.g. ``©`` → ``©``) is indistinguishable + from mojibake and gets rewritten too. Such collisions are vanishingly + rare in real asset filenames, and the alternative — leaving every + ``Formulários`` garbled — is worse, so we accept the trade-off. + Correctly-stored ``Formulários``, ``Café``, or ``日本語`` raise on the + encode or decode step and are returned untouched. Idempotent: + re-running on already-repaired text is a no-op. + """ + if not text: + return text + try: + repaired = text.encode('latin-1').decode('utf-8') + except (UnicodeEncodeError, UnicodeDecodeError): + return text + return repaired if repaired != text else text + + def generate_asset_id() -> str: return uuid.uuid4().hex @@ -84,6 +119,26 @@ class Meta: def __str__(self) -> str: return str(self.name) + def save(self, *args: Any, **kwargs: Any) -> None: + """Repair double-encoded UTF-8 in ``name`` before persisting. + + A single write-side chokepoint so every create/update path — + the web form, all four API versions, and the legacy Screenly + import — stores a clean name regardless of an upstream client + that double-encoded the filename. See ``repair_mojibake`` for + why this is safe (no-op on correctly-encoded text). + + Skipped when ``update_fields`` is passed without ``name`` (e.g. + the metadata-only and reachability saves): those writes don't + touch the column, so repairing ``self.name`` there would mutate + the in-memory instance without persisting it — diverging from + the stored row while leaving the DB value unrepaired. + """ + update_fields = kwargs.get('update_fields') + if update_fields is None or 'name' in update_fields: + self.name = repair_mojibake(self.name) + super().save(*args, **kwargs) + def get_play_days(self) -> list[int]: """Parse play_days into a sorted, deduped list of ints 1-7. diff --git a/tests/test_mojibake_repair.py b/tests/test_mojibake_repair.py new file mode 100644 index 000000000..4c6000a0d --- /dev/null +++ b/tests/test_mojibake_repair.py @@ -0,0 +1,115 @@ +"""Tests for the double-encoded-UTF-8 (mojibake) repair on asset names. + +Covers the pure helper, the write-side ``Asset.save`` guardrail (which +catches new uploads from any API version or the web form), and the +data-migration logic that fixes rows stored before the guardrail +existed. +""" + +import pytest + +from anthias_server.app.models import Asset, repair_mojibake + +# ``Formulários`` round-tripped through the classic ``UTF-8 bytes read +# as Latin-1`` corruption. Kept as the canonical mojibake fixture so the +# intent is obvious without sprinkling non-ASCII escapes through the +# assertions. +GARBLED = 'Formulários'.encode('utf-8').decode('latin-1') + + +@pytest.mark.parametrize( + 'given, expected', + [ + # Genuine mojibake — the one case we repair. + (GARBLED, 'Formulários'), + # Correctly-encoded text must survive untouched: a multi-byte + # accent, a name with several Latin-1 accents, and CJK each + # raise on the encode or decode step and short-circuit. + ('Formulários', 'Formulários'), + ('Café Über señor', 'Café Über señor'), + ('日本語', '日本語'), + # ASCII / empty / None are no-ops. + ('Plain Name 2', 'Plain Name 2'), + ('', ''), + (None, None), + # A lone Latin-1 lead byte is not valid UTF-8 once re-decoded, + # so it is left alone rather than mangled. + ('Ã', 'Ã'), + # Documented false positive: a genuinely Latin-1 ``©`` (U+00C2 + # U+00A9) has bytes that are also valid UTF-8 (``©``), so it is + # indistinguishable from mojibake and gets rewritten. Accepted + # trade-off — see ``repair_mojibake``'s docstring. + ('©', '©'), + ], +) +def test_repair_mojibake(given: str | None, expected: str | None) -> None: + assert repair_mojibake(given) == expected + + +def test_repair_mojibake_is_idempotent() -> None: + once = repair_mojibake(GARBLED) + assert repair_mojibake(once) == once == 'Formulários' + + +@pytest.mark.django_db +def test_save_repairs_mojibake_name() -> None: + asset = Asset.objects.create(name=GARBLED, mimetype='image') + asset.refresh_from_db() + assert asset.name == 'Formulários' + + +@pytest.mark.django_db +def test_save_leaves_clean_name_untouched() -> None: + asset = Asset.objects.create(name='Café Über señor', mimetype='image') + asset.refresh_from_db() + assert asset.name == 'Café Über señor' + + +@pytest.mark.django_db +def test_save_skips_repair_when_name_not_in_update_fields() -> None: + """A metadata-only save must not silently rewrite ``name``. + + With ``update_fields`` excluding ``name`` the column isn't written, + so repairing ``self.name`` would only diverge the in-memory instance + from the stored row. The repair is skipped and the stored value is + left as-is. + """ + asset = Asset.objects.create(name='placeholder', mimetype='image') + Asset.objects.filter(pk=asset.pk).update(name=GARBLED) + asset.refresh_from_db() + + asset.metadata = {'foo': 'bar'} + asset.save(update_fields=['metadata']) + + # In-memory name untouched, and the DB still holds the raw value. + assert asset.name == GARBLED + asset.refresh_from_db() + assert asset.name == GARBLED + + +@pytest.mark.django_db +def test_migration_repairs_existing_rows() -> None: + """The migration's repair pass fixes pre-existing garbled rows. + + ``Asset.save`` now cleans names on write, so to exercise the + migration's own logic against a *stored* mojibake row we write the + column directly with ``QuerySet.update`` (which bypasses ``save``). + """ + import importlib + + migration = importlib.import_module( + 'anthias_server.app.migrations.0007_repair_mojibake_asset_names' + ) + + asset = Asset.objects.create(name='placeholder', mimetype='image') + Asset.objects.filter(pk=asset.pk).update(name=GARBLED) + + class _Apps: + @staticmethod + def get_model(app_label: str, model_name: str) -> type[Asset]: + return Asset + + migration._repair_names(_Apps(), None) + + asset.refresh_from_db() + assert asset.name == 'Formulários'