From 7d67053a901df26e8c76ec8f8ac88bc47c1b9e04 Mon Sep 17 00:00:00 2001 From: Viktor Petersson Date: Mon, 1 Jun 2026 07:08:21 +0000 Subject: [PATCH 1/3] fix(assets): repair double-encoded UTF-8 in asset names MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - add `repair_mojibake` helper that safely undoes the classic `UTF-8 bytes decoded as Latin-1` corruption (e.g. `Formulários` → `Formulários`); no-op on correctly-encoded text - apply it in `Asset.save` so every write path (web form, all API versions, legacy import) stores a clean name - one-time data migration repairs names stored before the guardrail Co-Authored-By: Claude Opus 4.8 (1M context) --- .../0007_repair_mojibake_asset_names.py | 56 ++++++++++++ src/anthias_server/app/models.py | 45 ++++++++++ tests/test_mojibake_repair.py | 88 +++++++++++++++++++ 3 files changed, 189 insertions(+) create mode 100644 src/anthias_server/app/migrations/0007_repair_mojibake_asset_names.py create mode 100644 tests/test_mojibake_repair.py diff --git a/src/anthias_server/app/migrations/0007_repair_mojibake_asset_names.py b/src/anthias_server/app/migrations/0007_repair_mojibake_asset_names.py new file mode 100644 index 000000000..4dc3ef9a9 --- /dev/null +++ b/src/anthias_server/app/migrations/0007_repair_mojibake_asset_names.py @@ -0,0 +1,56 @@ +"""One-time repair of double-encoded UTF-8 in existing ``Asset.name``. + +A misbehaving uploader that double-encodes a filename stores e.g. +``Formulários`` instead of ``Formulários`` (the UTF-8 bytes of ``á`` +read back as the Latin-1 chars ``Ã`` + ``¡``). Anthias never produces +this itself, but it stored whatever the request body carried, so +already-uploaded assets keep the garbled name in the UI and in the +viewer's ``Showing asset …`` log line. ``Asset.save`` now repairs new +writes; this migration fixes the rows that pre-date that guardrail. + +The repair logic is inlined rather than imported from +``anthias_server.app.models`` on purpose — migrations are frozen +snapshots of intent, and a future change to the model helper must not +retroactively alter what this one-time data fix did. + +Idempotent and safe: a name is rewritten only when every character is +in the Latin-1 range *and* those bytes form a valid, different UTF-8 +string — the unambiguous signature of double-encoded UTF-8. Correctly +stored names (``Formulários``, ``Café``, ``日本語``) raise on one of the +two steps and are left untouched, so a re-run changes nothing. +""" + +from __future__ import annotations + +from django.db import migrations + + +def _repair_mojibake(text): # type: ignore[no-untyped-def] + if not text: + return text + try: + repaired = text.encode('latin-1').decode('utf-8') + except (UnicodeEncodeError, UnicodeDecodeError): + return text + return repaired + + +def _repair_names(apps, schema_editor): # type: ignore[no-untyped-def] + asset_model = apps.get_model('anthias_app', 'Asset') + for asset in asset_model.objects.exclude(name__isnull=True): + repaired = _repair_mojibake(asset.name) + if repaired != asset.name: + asset.name = repaired + asset.save(update_fields=['name']) + + +class Migration(migrations.Migration): + dependencies = [ + ('anthias_app', '0006_asset_metadata'), + ] + + operations = [ + migrations.RunPython( + _repair_names, reverse_code=migrations.RunPython.noop + ), + ] diff --git a/src/anthias_server/app/models.py b/src/anthias_server/app/models.py index c5d215d98..373c52c56 100644 --- a/src/anthias_server/app/models.py +++ b/src/anthias_server/app/models.py @@ -39,6 +39,36 @@ def clamp_refresh_interval(value: Any) -> int: return max(0, min(interval, REFRESH_INTERVAL_S_MAX)) +def repair_mojibake(text: str | None) -> str | None: + """Undo the classic ``UTF-8 bytes decoded as Latin-1`` mojibake. + + A misbehaving uploader that double-encodes a filename turns + ``Formulários`` into ``Formulários`` (the UTF-8 bytes ``\\xc3\\xa1`` + of ``á`` read back as the two Latin-1 chars ``Ã`` + ``¡``). Anthias + itself never does this — Django's multipart parser and DRF both + decode as UTF-8 — but the corrupted text arrives already mangled in + the request body and we would otherwise store it verbatim, so the + operator sees garbled asset names in the UI and in the viewer's + ``Showing asset …`` log line. + + The repair is deliberately conservative and deterministic: it only + fires when *every* character is in the Latin-1 range (so + ``encode('latin-1')`` round-trips) **and** those bytes form a valid, + *different* UTF-8 string. That combination is the unambiguous + signature of double-encoded UTF-8 — a correctly-stored + ``Formulários``, ``Café``, or ``日本語`` raises on one of the two + steps and is returned untouched, so this can't corrupt good data. + Idempotent: re-running on already-repaired text is a no-op. + """ + if not text: + return text + try: + repaired = text.encode('latin-1').decode('utf-8') + except (UnicodeEncodeError, UnicodeDecodeError): + return text + return repaired + + def generate_asset_id() -> str: return uuid.uuid4().hex @@ -84,6 +114,21 @@ class Meta: def __str__(self) -> str: return str(self.name) + def save(self, *args: Any, **kwargs: Any) -> None: + """Repair double-encoded UTF-8 in ``name`` before persisting. + + A single write-side chokepoint so every create/update path — + the web form, all four API versions, and the legacy Screenly + import — stores a clean name regardless of an upstream client + that double-encoded the filename. See ``repair_mojibake`` for + why this is safe (no-op on correctly-encoded text). The repair + is cheap and idempotent, so running it on every save (including + reachability/processing-flag updates that leave ``name`` + unchanged) costs nothing. + """ + self.name = repair_mojibake(self.name) + super().save(*args, **kwargs) + def get_play_days(self) -> list[int]: """Parse play_days into a sorted, deduped list of ints 1-7. diff --git a/tests/test_mojibake_repair.py b/tests/test_mojibake_repair.py new file mode 100644 index 000000000..50f02b92c --- /dev/null +++ b/tests/test_mojibake_repair.py @@ -0,0 +1,88 @@ +"""Tests for the double-encoded-UTF-8 (mojibake) repair on asset names. + +Covers the pure helper, the write-side ``Asset.save`` guardrail (which +catches new uploads from any API version or the web form), and the +data-migration logic that fixes rows stored before the guardrail +existed. +""" + +import pytest + +from anthias_server.app.models import Asset, repair_mojibake + +# ``Formulários`` round-tripped through the classic ``UTF-8 bytes read +# as Latin-1`` corruption. Kept as the canonical mojibake fixture so the +# intent is obvious without sprinkling non-ASCII escapes through the +# assertions. +GARBLED = 'Formulários'.encode('utf-8').decode('latin-1') + + +@pytest.mark.parametrize( + 'given, expected', + [ + # Genuine mojibake — the one case we repair. + (GARBLED, 'Formulários'), + # Correctly-encoded text must survive untouched: a multi-byte + # accent, a name with several Latin-1 accents, and CJK each + # raise on the encode or decode step and short-circuit. + ('Formulários', 'Formulários'), + ('Café Über señor', 'Café Über señor'), + ('日本語', '日本語'), + # ASCII / empty / None are no-ops. + ('Plain Name 2', 'Plain Name 2'), + ('', ''), + (None, None), + # A lone Latin-1 lead byte is not valid UTF-8 once re-decoded, + # so it is left alone rather than mangled. + ('Ã', 'Ã'), + ], +) +def test_repair_mojibake(given: str | None, expected: str | None) -> None: + assert repair_mojibake(given) == expected + + +def test_repair_mojibake_is_idempotent() -> None: + once = repair_mojibake(GARBLED) + assert repair_mojibake(once) == once == 'Formulários' + + +@pytest.mark.django_db +def test_save_repairs_mojibake_name() -> None: + asset = Asset.objects.create(name=GARBLED, mimetype='image') + asset.refresh_from_db() + assert asset.name == 'Formulários' + + +@pytest.mark.django_db +def test_save_leaves_clean_name_untouched() -> None: + asset = Asset.objects.create(name='Café Über señor', mimetype='image') + asset.refresh_from_db() + assert asset.name == 'Café Über señor' + + +@pytest.mark.django_db +def test_migration_repairs_existing_rows() -> None: + """The migration's repair pass fixes pre-existing garbled rows. + + ``Asset.save`` now cleans names on write, so to exercise the + migration's own logic against a *stored* mojibake row we write the + column directly with ``QuerySet.update`` (which bypasses ``save``). + """ + import importlib + + migration = importlib.import_module( + 'anthias_server.app.migrations.0007_repair_mojibake_asset_names' + ) + + asset = Asset.objects.create(name='placeholder', mimetype='image') + Asset.objects.filter(pk=asset.pk).update(name=GARBLED) + + class _Apps: + @staticmethod + def get_model(app_label: str, model_name: str) -> type[Asset]: + return Asset + + migration._repair_names(_Apps(), None) + + asset.refresh_from_db() + assert asset.name == 'Formulários' From 236a3b17a542cb6d0cd7ed05d5cdd474cb2fd80e Mon Sep 17 00:00:00 2001 From: Viktor Petersson Date: Mon, 1 Jun 2026 15:44:10 +0000 Subject: [PATCH 2/3] fix(assets): address Copilot review on mojibake repair MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Soften the safety claim in repair_mojibake / migration docstrings: the Latin-1 + valid-UTF-8 test is a strong heuristic, not a proof. A genuinely Latin-1 name whose bytes are also valid UTF-8 (e.g. `©` → `©`) is indistinguishable from mojibake and gets rewritten; document the accepted trade-off instead of overclaiming "can't corrupt good data". - Return the original string when the repair produces no change, to match the stated "only rewrites when different" contract. - Stream the migration's rows with .only('asset_id', 'name').iterator() instead of caching the whole assets table in memory. - Add a regression test pinning the documented `©` false positive. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../0007_repair_mojibake_asset_names.py | 24 +++++++++++++------ src/anthias_server/app/models.py | 19 +++++++++------ tests/test_mojibake_repair.py | 5 ++++ 3 files changed, 34 insertions(+), 14 deletions(-) diff --git a/src/anthias_server/app/migrations/0007_repair_mojibake_asset_names.py b/src/anthias_server/app/migrations/0007_repair_mojibake_asset_names.py index 4dc3ef9a9..d316d0a78 100644 --- a/src/anthias_server/app/migrations/0007_repair_mojibake_asset_names.py +++ b/src/anthias_server/app/migrations/0007_repair_mojibake_asset_names.py @@ -13,11 +13,14 @@ snapshots of intent, and a future change to the model helper must not retroactively alter what this one-time data fix did. -Idempotent and safe: a name is rewritten only when every character is -in the Latin-1 range *and* those bytes form a valid, different UTF-8 -string — the unambiguous signature of double-encoded UTF-8. Correctly -stored names (``Formulários``, ``Café``, ``日本語``) raise on one of the -two steps and are left untouched, so a re-run changes nothing. +Idempotent: a name is rewritten only when every character is in the +Latin-1 range *and* those bytes form a valid UTF-8 string that differs +from the input — a strong heuristic for double-encoded UTF-8, though not +a proof (a genuinely Latin-1 name whose bytes are also valid UTF-8, e.g. +``©`` → ``©``, is indistinguishable and gets rewritten too; such +collisions are vanishingly rare in real filenames). Correctly stored +names (``Formulários``, ``Café``, ``日本語``) raise on the encode or +decode step and are left untouched, so a re-run changes nothing. """ from __future__ import annotations @@ -32,12 +35,19 @@ def _repair_mojibake(text): # type: ignore[no-untyped-def] repaired = text.encode('latin-1').decode('utf-8') except (UnicodeEncodeError, UnicodeDecodeError): return text - return repaired + return repaired if repaired != text else text def _repair_names(apps, schema_editor): # type: ignore[no-untyped-def] asset_model = apps.get_model('anthias_app', 'Asset') - for asset in asset_model.objects.exclude(name__isnull=True): + # ``.only()`` + ``.iterator()`` streams rows in chunks instead of + # caching the whole table in memory; per-row ``save`` still works. + rows = ( + asset_model.objects.exclude(name__isnull=True) + .only('asset_id', 'name') + .iterator() + ) + for asset in rows: repaired = _repair_mojibake(asset.name) if repaired != asset.name: asset.name = repaired diff --git a/src/anthias_server/app/models.py b/src/anthias_server/app/models.py index 373c52c56..fd8c8f5a9 100644 --- a/src/anthias_server/app/models.py +++ b/src/anthias_server/app/models.py @@ -53,12 +53,17 @@ def repair_mojibake(text: str | None) -> str | None: The repair is deliberately conservative and deterministic: it only fires when *every* character is in the Latin-1 range (so - ``encode('latin-1')`` round-trips) **and** those bytes form a valid, - *different* UTF-8 string. That combination is the unambiguous - signature of double-encoded UTF-8 — a correctly-stored - ``Formulários``, ``Café``, or ``日本語`` raises on one of the two - steps and is returned untouched, so this can't corrupt good data. - Idempotent: re-running on already-repaired text is a no-op. + ``encode('latin-1')`` round-trips) **and** those bytes form a valid + UTF-8 string, which is then returned only if it actually differs from + the input. That is a strong heuristic for double-encoded UTF-8, but + not a proof: a name that is *genuinely* Latin-1 yet whose bytes also + happen to be valid UTF-8 (e.g. ``©`` → ``©``) is indistinguishable + from mojibake and gets rewritten too. Such collisions are vanishingly + rare in real asset filenames, and the alternative — leaving every + ``Formulários`` garbled — is worse, so we accept the trade-off. + Correctly-stored ``Formulários``, ``Café``, or ``日本語`` raise on the + encode or decode step and are returned untouched. Idempotent: + re-running on already-repaired text is a no-op. """ if not text: return text @@ -66,7 +71,7 @@ def repair_mojibake(text: str | None) -> str | None: repaired = text.encode('latin-1').decode('utf-8') except (UnicodeEncodeError, UnicodeDecodeError): return text - return repaired + return repaired if repaired != text else text def generate_asset_id() -> str: diff --git a/tests/test_mojibake_repair.py b/tests/test_mojibake_repair.py index 50f02b92c..0bbb5c3b0 100644 --- a/tests/test_mojibake_repair.py +++ b/tests/test_mojibake_repair.py @@ -35,6 +35,11 @@ # A lone Latin-1 lead byte is not valid UTF-8 once re-decoded, # so it is left alone rather than mangled. ('Ã', 'Ã'), + # Documented false positive: a genuinely Latin-1 ``©`` (U+00C2 + # U+00A9) has bytes that are also valid UTF-8 (``©``), so it is + # indistinguishable from mojibake and gets rewritten. Accepted + # trade-off — see ``repair_mojibake``'s docstring. + ('©', '©'), ], ) def test_repair_mojibake(given: str | None, expected: str | None) -> None: From e2e6b2ec4a960b31dcd602dd21498137b432a35a Mon Sep 17 00:00:00 2001 From: Viktor Petersson Date: Mon, 1 Jun 2026 15:57:02 +0000 Subject: [PATCH 3/3] fix(assets): skip name repair on update_fields saves without name MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Asset.save() repaired self.name unconditionally, so a metadata-only or reachability save (update_fields excluding 'name', e.g. v2 views' save(update_fields=['metadata'])) would mutate the in-memory name without writing the column — diverging the instance from the stored row and leaving any DB-side garble unrepaired. Repair only when name is actually persisted (update_fields is None or includes 'name'), and add a regression test for the metadata-only path. Co-Authored-By: Claude Opus 4.8 (1M context) --- src/anthias_server/app/models.py | 15 ++++++++++----- tests/test_mojibake_repair.py | 22 ++++++++++++++++++++++ 2 files changed, 32 insertions(+), 5 deletions(-) diff --git a/src/anthias_server/app/models.py b/src/anthias_server/app/models.py index fd8c8f5a9..e71978af3 100644 --- a/src/anthias_server/app/models.py +++ b/src/anthias_server/app/models.py @@ -126,12 +126,17 @@ def save(self, *args: Any, **kwargs: Any) -> None: the web form, all four API versions, and the legacy Screenly import — stores a clean name regardless of an upstream client that double-encoded the filename. See ``repair_mojibake`` for - why this is safe (no-op on correctly-encoded text). The repair - is cheap and idempotent, so running it on every save (including - reachability/processing-flag updates that leave ``name`` - unchanged) costs nothing. + why this is safe (no-op on correctly-encoded text). + + Skipped when ``update_fields`` is passed without ``name`` (e.g. + the metadata-only and reachability saves): those writes don't + touch the column, so repairing ``self.name`` there would mutate + the in-memory instance without persisting it — diverging from + the stored row while leaving the DB value unrepaired. """ - self.name = repair_mojibake(self.name) + update_fields = kwargs.get('update_fields') + if update_fields is None or 'name' in update_fields: + self.name = repair_mojibake(self.name) super().save(*args, **kwargs) def get_play_days(self) -> list[int]: diff --git a/tests/test_mojibake_repair.py b/tests/test_mojibake_repair.py index 0bbb5c3b0..4c6000a0d 100644 --- a/tests/test_mojibake_repair.py +++ b/tests/test_mojibake_repair.py @@ -65,6 +65,28 @@ def test_save_leaves_clean_name_untouched() -> None: assert asset.name == 'Café Über señor' +@pytest.mark.django_db +def test_save_skips_repair_when_name_not_in_update_fields() -> None: + """A metadata-only save must not silently rewrite ``name``. + + With ``update_fields`` excluding ``name`` the column isn't written, + so repairing ``self.name`` would only diverge the in-memory instance + from the stored row. The repair is skipped and the stored value is + left as-is. + """ + asset = Asset.objects.create(name='placeholder', mimetype='image') + Asset.objects.filter(pk=asset.pk).update(name=GARBLED) + asset.refresh_from_db() + + asset.metadata = {'foo': 'bar'} + asset.save(update_fields=['metadata']) + + # In-memory name untouched, and the DB still holds the raw value. + assert asset.name == GARBLED + asset.refresh_from_db() + assert asset.name == GARBLED + + @pytest.mark.django_db def test_migration_repairs_existing_rows() -> None: """The migration's repair pass fixes pre-existing garbled rows.