From 7d67053a901df26e8c76ec8f8ac88bc47c1b9e04 Mon Sep 17 00:00:00 2001
From: Viktor Petersson <vpetersson@screenly.io>
Date: Mon, 1 Jun 2026 07:08:21 +0000
Subject: [PATCH 1/3] fix(assets): repair double-encoded UTF-8 in asset names
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- add `repair_mojibake` helper that safely undoes the classic
  `UTF-8 bytes decoded as Latin-1` corruption (e.g. `FormulÃ¡rios`
  → `Formulários`); no-op on correctly-encoded text
- apply it in `Asset.save` so every write path (web form, all API
  versions, legacy import) stores a clean name
- one-time data migration repairs names stored before the guardrail

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../0007_repair_mojibake_asset_names.py       | 56 ++++++++++++
 src/anthias_server/app/models.py              | 45 ++++++++++
 tests/test_mojibake_repair.py                 | 88 +++++++++++++++++++
 3 files changed, 189 insertions(+)
 create mode 100644 src/anthias_server/app/migrations/0007_repair_mojibake_asset_names.py
 create mode 100644 tests/test_mojibake_repair.py

diff --git a/src/anthias_server/app/migrations/0007_repair_mojibake_asset_names.py b/src/anthias_server/app/migrations/0007_repair_mojibake_asset_names.py
new file mode 100644
index 000000000..4dc3ef9a9
--- /dev/null
+++ b/src/anthias_server/app/migrations/0007_repair_mojibake_asset_names.py
@@ -0,0 +1,56 @@
+"""One-time repair of double-encoded UTF-8 in existing ``Asset.name``.
+
+A misbehaving uploader that double-encodes a filename stores e.g.
+``FormulÃ¡rios`` instead of ``Formulários`` (the UTF-8 bytes of ``á``
+read back as the Latin-1 chars ``Ã`` + ``¡``). Anthias never produces
+this itself, but it stored whatever the request body carried, so
+already-uploaded assets keep the garbled name in the UI and in the
+viewer's ``Showing asset …`` log line. ``Asset.save`` now repairs new
+writes; this migration fixes the rows that pre-date that guardrail.
+
+The repair logic is inlined rather than imported from
+``anthias_server.app.models`` on purpose — migrations are frozen
+snapshots of intent, and a future change to the model helper must not
+retroactively alter what this one-time data fix did.
+
+Idempotent and safe: a name is rewritten only when every character is
+in the Latin-1 range *and* those bytes form a valid, different UTF-8
+string — the unambiguous signature of double-encoded UTF-8. Correctly
+stored names (``Formulários``, ``Café``, ``日本語``) raise on one of the
+two steps and are left untouched, so a re-run changes nothing.
+"""
+
+from __future__ import annotations
+
+from django.db import migrations
+
+
+def _repair_mojibake(text):  # type: ignore[no-untyped-def]
+    if not text:
+        return text
+    try:
+        repaired = text.encode('latin-1').decode('utf-8')
+    except (UnicodeEncodeError, UnicodeDecodeError):
+        return text
+    return repaired
+
+
+def _repair_names(apps, schema_editor):  # type: ignore[no-untyped-def]
+    asset_model = apps.get_model('anthias_app', 'Asset')
+    for asset in asset_model.objects.exclude(name__isnull=True):
+        repaired = _repair_mojibake(asset.name)
+        if repaired != asset.name:
+            asset.name = repaired
+            asset.save(update_fields=['name'])
+
+
+class Migration(migrations.Migration):
+    dependencies = [
+        ('anthias_app', '0006_asset_metadata'),
+    ]
+
+    operations = [
+        migrations.RunPython(
+            _repair_names, reverse_code=migrations.RunPython.noop
+        ),
+    ]
diff --git a/src/anthias_server/app/models.py b/src/anthias_server/app/models.py
index c5d215d98..373c52c56 100644
--- a/src/anthias_server/app/models.py
+++ b/src/anthias_server/app/models.py
@@ -39,6 +39,36 @@ def clamp_refresh_interval(value: Any) -> int:
     return max(0, min(interval, REFRESH_INTERVAL_S_MAX))
 
 
+def repair_mojibake(text: str | None) -> str | None:
+    """Undo the classic ``UTF-8 bytes decoded as Latin-1`` mojibake.
+
+    A misbehaving uploader that double-encodes a filename turns
+    ``Formulários`` into ``FormulÃ¡rios`` (the UTF-8 bytes ``\\xc3\\xa1``
+    of ``á`` read back as the two Latin-1 chars ``Ã`` + ``¡``). Anthias
+    itself never does this — Django's multipart parser and DRF both
+    decode as UTF-8 — but the corrupted text arrives already mangled in
+    the request body and we would otherwise store it verbatim, so the
+    operator sees garbled asset names in the UI and in the viewer's
+    ``Showing asset …`` log line.
+
+    The repair is deliberately conservative and deterministic: it only
+    fires when *every* character is in the Latin-1 range (so
+    ``encode('latin-1')`` round-trips) **and** those bytes form a valid,
+    *different* UTF-8 string. That combination is the unambiguous
+    signature of double-encoded UTF-8 — a correctly-stored
+    ``Formulários``, ``Café``, or ``日本語`` raises on one of the two
+    steps and is returned untouched, so this can't corrupt good data.
+    Idempotent: re-running on already-repaired text is a no-op.
+    """
+    if not text:
+        return text
+    try:
+        repaired = text.encode('latin-1').decode('utf-8')
+    except (UnicodeEncodeError, UnicodeDecodeError):
+        return text
+    return repaired
+
+
 def generate_asset_id() -> str:
     return uuid.uuid4().hex
 
@@ -84,6 +114,21 @@ class Meta:
     def __str__(self) -> str:
         return str(self.name)
 
+    def save(self, *args: Any, **kwargs: Any) -> None:
+        """Repair double-encoded UTF-8 in ``name`` before persisting.
+
+        A single write-side chokepoint so every create/update path —
+        the web form, all four API versions, and the legacy Screenly
+        import — stores a clean name regardless of an upstream client
+        that double-encoded the filename. See ``repair_mojibake`` for
+        why this is safe (no-op on correctly-encoded text). The repair
+        is cheap and idempotent, so running it on every save (including
+        reachability/processing-flag updates that leave ``name``
+        unchanged) costs nothing.
+        """
+        self.name = repair_mojibake(self.name)
+        super().save(*args, **kwargs)
+
     def get_play_days(self) -> list[int]:
         """Parse play_days into a sorted, deduped list of ints 1-7.
 
diff --git a/tests/test_mojibake_repair.py b/tests/test_mojibake_repair.py
new file mode 100644
index 000000000..50f02b92c
--- /dev/null
+++ b/tests/test_mojibake_repair.py
@@ -0,0 +1,88 @@
+"""Tests for the double-encoded-UTF-8 (mojibake) repair on asset names.
+
+Covers the pure helper, the write-side ``Asset.save`` guardrail (which
+catches new uploads from any API version or the web form), and the
+data-migration logic that fixes rows stored before the guardrail
+existed.
+"""
+
+import pytest
+
+from anthias_server.app.models import Asset, repair_mojibake
+
+# ``Formulários`` round-tripped through the classic ``UTF-8 bytes read
+# as Latin-1`` corruption. Kept as the canonical mojibake fixture so the
+# intent is obvious without sprinkling non-ASCII escapes through the
+# assertions.
+GARBLED = 'Formulários'.encode('utf-8').decode('latin-1')
+
+
+@pytest.mark.parametrize(
+    'given, expected',
+    [
+        # Genuine mojibake — the one case we repair.
+        (GARBLED, 'Formulários'),
+        # Correctly-encoded text must survive untouched: a multi-byte
+        # accent, a name with several Latin-1 accents, and CJK each
+        # raise on the encode or decode step and short-circuit.
+        ('Formulários', 'Formulários'),
+        ('Café Über señor', 'Café Über señor'),
+        ('日本語', '日本語'),
+        # ASCII / empty / None are no-ops.
+        ('Plain Name 2', 'Plain Name 2'),
+        ('', ''),
+        (None, None),
+        # A lone Latin-1 lead byte is not valid UTF-8 once re-decoded,
+        # so it is left alone rather than mangled.
+        ('Ã', 'Ã'),
+    ],
+)
+def test_repair_mojibake(given: str | None, expected: str | None) -> None:
+    assert repair_mojibake(given) == expected
+
+
+def test_repair_mojibake_is_idempotent() -> None:
+    once = repair_mojibake(GARBLED)
+    assert repair_mojibake(once) == once == 'Formulários'
+
+
+@pytest.mark.django_db
+def test_save_repairs_mojibake_name() -> None:
+    asset = Asset.objects.create(name=GARBLED, mimetype='image')
+    asset.refresh_from_db()
+    assert asset.name == 'Formulários'
+
+
+@pytest.mark.django_db
+def test_save_leaves_clean_name_untouched() -> None:
+    asset = Asset.objects.create(name='Café Über señor', mimetype='image')
+    asset.refresh_from_db()
+    assert asset.name == 'Café Über señor'
+
+
+@pytest.mark.django_db
+def test_migration_repairs_existing_rows() -> None:
+    """The migration's repair pass fixes pre-existing garbled rows.
+
+    ``Asset.save`` now cleans names on write, so to exercise the
+    migration's own logic against a *stored* mojibake row we write the
+    column directly with ``QuerySet.update`` (which bypasses ``save``).
+    """
+    import importlib
+
+    migration = importlib.import_module(
+        'anthias_server.app.migrations.0007_repair_mojibake_asset_names'
+    )
+
+    asset = Asset.objects.create(name='placeholder', mimetype='image')
+    Asset.objects.filter(pk=asset.pk).update(name=GARBLED)
+
+    class _Apps:
+        @staticmethod
+        def get_model(app_label: str, model_name: str) -> type[Asset]:
+            return Asset
+
+    migration._repair_names(_Apps(), None)
+
+    asset.refresh_from_db()
+    assert asset.name == 'Formulários'

From 236a3b17a542cb6d0cd7ed05d5cdd474cb2fd80e Mon Sep 17 00:00:00 2001
From: Viktor Petersson <vpetersson@screenly.io>
Date: Mon, 1 Jun 2026 15:44:10 +0000
Subject: [PATCH 2/3] fix(assets): address Copilot review on mojibake repair
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Soften the safety claim in repair_mojibake / migration docstrings: the
  Latin-1 + valid-UTF-8 test is a strong heuristic, not a proof. A
  genuinely Latin-1 name whose bytes are also valid UTF-8 (e.g. `Â©` →
  `©`) is indistinguishable from mojibake and gets rewritten; document
  the accepted trade-off instead of overclaiming "can't corrupt good
  data".
- Return the original string when the repair produces no change, to
  match the stated "only rewrites when different" contract.
- Stream the migration's rows with .only('asset_id', 'name').iterator()
  instead of caching the whole assets table in memory.
- Add a regression test pinning the documented `Â©` false positive.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../0007_repair_mojibake_asset_names.py       | 24 +++++++++++++------
 src/anthias_server/app/models.py              | 19 +++++++++------
 tests/test_mojibake_repair.py                 |  5 ++++
 3 files changed, 34 insertions(+), 14 deletions(-)

diff --git a/src/anthias_server/app/migrations/0007_repair_mojibake_asset_names.py b/src/anthias_server/app/migrations/0007_repair_mojibake_asset_names.py
index 4dc3ef9a9..d316d0a78 100644
--- a/src/anthias_server/app/migrations/0007_repair_mojibake_asset_names.py
+++ b/src/anthias_server/app/migrations/0007_repair_mojibake_asset_names.py
@@ -13,11 +13,14 @@
 snapshots of intent, and a future change to the model helper must not
 retroactively alter what this one-time data fix did.
 
-Idempotent and safe: a name is rewritten only when every character is
-in the Latin-1 range *and* those bytes form a valid, different UTF-8
-string — the unambiguous signature of double-encoded UTF-8. Correctly
-stored names (``Formulários``, ``Café``, ``日本語``) raise on one of the
-two steps and are left untouched, so a re-run changes nothing.
+Idempotent: a name is rewritten only when every character is in the
+Latin-1 range *and* those bytes form a valid UTF-8 string that differs
+from the input — a strong heuristic for double-encoded UTF-8, though not
+a proof (a genuinely Latin-1 name whose bytes are also valid UTF-8, e.g.
+``Â©`` → ``©``, is indistinguishable and gets rewritten too; such
+collisions are vanishingly rare in real filenames). Correctly stored
+names (``Formulários``, ``Café``, ``日本語``) raise on the encode or
+decode step and are left untouched, so a re-run changes nothing.
 """
 
 from __future__ import annotations
@@ -32,12 +35,19 @@ def _repair_mojibake(text):  # type: ignore[no-untyped-def]
         repaired = text.encode('latin-1').decode('utf-8')
     except (UnicodeEncodeError, UnicodeDecodeError):
         return text
-    return repaired
+    return repaired if repaired != text else text
 
 
 def _repair_names(apps, schema_editor):  # type: ignore[no-untyped-def]
     asset_model = apps.get_model('anthias_app', 'Asset')
-    for asset in asset_model.objects.exclude(name__isnull=True):
+    # ``.only()`` + ``.iterator()`` streams rows in chunks instead of
+    # caching the whole table in memory; per-row ``save`` still works.
+    rows = (
+        asset_model.objects.exclude(name__isnull=True)
+        .only('asset_id', 'name')
+        .iterator()
+    )
+    for asset in rows:
         repaired = _repair_mojibake(asset.name)
         if repaired != asset.name:
             asset.name = repaired
diff --git a/src/anthias_server/app/models.py b/src/anthias_server/app/models.py
index 373c52c56..fd8c8f5a9 100644
--- a/src/anthias_server/app/models.py
+++ b/src/anthias_server/app/models.py
@@ -53,12 +53,17 @@ def repair_mojibake(text: str | None) -> str | None:
 
     The repair is deliberately conservative and deterministic: it only
     fires when *every* character is in the Latin-1 range (so
-    ``encode('latin-1')`` round-trips) **and** those bytes form a valid,
-    *different* UTF-8 string. That combination is the unambiguous
-    signature of double-encoded UTF-8 — a correctly-stored
-    ``Formulários``, ``Café``, or ``日本語`` raises on one of the two
-    steps and is returned untouched, so this can't corrupt good data.
-    Idempotent: re-running on already-repaired text is a no-op.
+    ``encode('latin-1')`` round-trips) **and** those bytes form a valid
+    UTF-8 string, which is then returned only if it actually differs from
+    the input. That is a strong heuristic for double-encoded UTF-8, but
+    not a proof: a name that is *genuinely* Latin-1 yet whose bytes also
+    happen to be valid UTF-8 (e.g. ``Â©`` → ``©``) is indistinguishable
+    from mojibake and gets rewritten too. Such collisions are vanishingly
+    rare in real asset filenames, and the alternative — leaving every
+    ``FormulÃ¡rios`` garbled — is worse, so we accept the trade-off.
+    Correctly-stored ``Formulários``, ``Café``, or ``日本語`` raise on the
+    encode or decode step and are returned untouched. Idempotent:
+    re-running on already-repaired text is a no-op.
     """
     if not text:
         return text
@@ -66,7 +71,7 @@ def repair_mojibake(text: str | None) -> str | None:
         repaired = text.encode('latin-1').decode('utf-8')
     except (UnicodeEncodeError, UnicodeDecodeError):
         return text
-    return repaired
+    return repaired if repaired != text else text
 
 
 def generate_asset_id() -> str:
diff --git a/tests/test_mojibake_repair.py b/tests/test_mojibake_repair.py
index 50f02b92c..0bbb5c3b0 100644
--- a/tests/test_mojibake_repair.py
+++ b/tests/test_mojibake_repair.py
@@ -35,6 +35,11 @@
         # A lone Latin-1 lead byte is not valid UTF-8 once re-decoded,
         # so it is left alone rather than mangled.
         ('Ã', 'Ã'),
+        # Documented false positive: a genuinely Latin-1 ``Â©`` (U+00C2
+        # U+00A9) has bytes that are also valid UTF-8 (``©``), so it is
+        # indistinguishable from mojibake and gets rewritten. Accepted
+        # trade-off — see ``repair_mojibake``'s docstring.
+        ('Â©', '©'),
     ],
 )
 def test_repair_mojibake(given: str | None, expected: str | None) -> None:

From e2e6b2ec4a960b31dcd602dd21498137b432a35a Mon Sep 17 00:00:00 2001
From: Viktor Petersson <vpetersson@screenly.io>
Date: Mon, 1 Jun 2026 15:57:02 +0000
Subject: [PATCH 3/3] fix(assets): skip name repair on update_fields saves
 without name
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Asset.save() repaired self.name unconditionally, so a metadata-only or
reachability save (update_fields excluding 'name', e.g. v2 views'
save(update_fields=['metadata'])) would mutate the in-memory name
without writing the column — diverging the instance from the stored row
and leaving any DB-side garble unrepaired. Repair only when name is
actually persisted (update_fields is None or includes 'name'), and add a
regression test for the metadata-only path.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 src/anthias_server/app/models.py | 15 ++++++++++-----
 tests/test_mojibake_repair.py    | 22 ++++++++++++++++++++++
 2 files changed, 32 insertions(+), 5 deletions(-)

diff --git a/src/anthias_server/app/models.py b/src/anthias_server/app/models.py
index fd8c8f5a9..e71978af3 100644
--- a/src/anthias_server/app/models.py
+++ b/src/anthias_server/app/models.py
@@ -126,12 +126,17 @@ def save(self, *args: Any, **kwargs: Any) -> None:
         the web form, all four API versions, and the legacy Screenly
         import — stores a clean name regardless of an upstream client
         that double-encoded the filename. See ``repair_mojibake`` for
-        why this is safe (no-op on correctly-encoded text). The repair
-        is cheap and idempotent, so running it on every save (including
-        reachability/processing-flag updates that leave ``name``
-        unchanged) costs nothing.
+        why this is safe (no-op on correctly-encoded text).
+
+        Skipped when ``update_fields`` is passed without ``name`` (e.g.
+        the metadata-only and reachability saves): those writes don't
+        touch the column, so repairing ``self.name`` there would mutate
+        the in-memory instance without persisting it — diverging from
+        the stored row while leaving the DB value unrepaired.
         """
-        self.name = repair_mojibake(self.name)
+        update_fields = kwargs.get('update_fields')
+        if update_fields is None or 'name' in update_fields:
+            self.name = repair_mojibake(self.name)
         super().save(*args, **kwargs)
 
     def get_play_days(self) -> list[int]:
diff --git a/tests/test_mojibake_repair.py b/tests/test_mojibake_repair.py
index 0bbb5c3b0..4c6000a0d 100644
--- a/tests/test_mojibake_repair.py
+++ b/tests/test_mojibake_repair.py
@@ -65,6 +65,28 @@ def test_save_leaves_clean_name_untouched() -> None:
     assert asset.name == 'Café Über señor'
 
 
+@pytest.mark.django_db
+def test_save_skips_repair_when_name_not_in_update_fields() -> None:
+    """A metadata-only save must not silently rewrite ``name``.
+
+    With ``update_fields`` excluding ``name`` the column isn't written,
+    so repairing ``self.name`` would only diverge the in-memory instance
+    from the stored row. The repair is skipped and the stored value is
+    left as-is.
+    """
+    asset = Asset.objects.create(name='placeholder', mimetype='image')
+    Asset.objects.filter(pk=asset.pk).update(name=GARBLED)
+    asset.refresh_from_db()
+
+    asset.metadata = {'foo': 'bar'}
+    asset.save(update_fields=['metadata'])
+
+    # In-memory name untouched, and the DB still holds the raw value.
+    assert asset.name == GARBLED
+    asset.refresh_from_db()
+    assert asset.name == GARBLED
+
+
 @pytest.mark.django_db
 def test_migration_repairs_existing_rows() -> None:
     """The migration's repair pass fixes pre-existing garbled rows.