Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
"""One-time repair of double-encoded UTF-8 in existing ``Asset.name``.

A misbehaving uploader that double-encodes a filename stores e.g.
``FormulÃ¡rios`` instead of ``Formulários`` (the UTF-8 bytes of ``á``
read back as the Latin-1 chars ``Ã`` + ``¡``). Anthias never produces
this itself, but it stored whatever the request body carried, so
already-uploaded assets keep the garbled name in the UI and in the
viewer's ``Showing asset …`` log line. ``Asset.save`` now repairs new
writes; this migration fixes the rows that pre-date that guardrail.

The repair logic is inlined rather than imported from
``anthias_server.app.models`` on purpose — migrations are frozen
snapshots of intent, and a future change to the model helper must not
retroactively alter what this one-time data fix did.

Idempotent: a name is rewritten only when every character is in the
Latin-1 range *and* those bytes form a valid UTF-8 string that differs
from the input — a strong heuristic for double-encoded UTF-8, though not
a proof (a genuinely Latin-1 name whose bytes are also valid UTF-8, e.g.
``Â©`` → ``©``, is indistinguishable and gets rewritten too; such
collisions are vanishingly rare in real filenames). Correctly stored
names (``Formulários``, ``Café``, ``日本語``) raise on the encode or
decode step and are left untouched, so a re-run changes nothing.
"""

from __future__ import annotations

from django.db import migrations


def _repair_mojibake(text: str | None) -> str | None:
    """Undo ``UTF-8 bytes decoded as Latin-1`` mojibake in *text*.

    Inlined copy of the repair logic so that this one-time migration
    keeps doing exactly what it did when written, whatever happens to
    the model-side helper later.

    Returns *text* itself when it is empty or ``None``, when any
    character lies outside the Latin-1 range (``UnicodeEncodeError``),
    or when the Latin-1 bytes are not valid UTF-8
    (``UnicodeDecodeError``). Otherwise returns the re-decoded string;
    for pure-ASCII input that is simply an equal string.
    """
    if not text:
        return text
    try:
        # Reverse the corruption: Latin-1 recovers the original bytes,
        # which are then decoded as the UTF-8 they were meant to be.
        return text.encode('latin-1').decode('utf-8')
    except (UnicodeEncodeError, UnicodeDecodeError):
        # Not repairable as double-encoded UTF-8 — keep it verbatim.
        return text


def _repair_names(apps, schema_editor):  # type: ignore[no-untyped-def]
    """Forward step: rewrite stored asset names that look double-encoded.

    Looks up the ``Asset`` model through the migration's ``apps``
    registry, walks every row whose ``name`` is not NULL, and saves only
    the rows whose name actually changes under ``_repair_mojibake``.
    ``schema_editor`` is accepted to satisfy the ``RunPython`` callback
    signature and is not used.
    """
    Asset = apps.get_model('anthias_app', 'Asset')
    named_assets = Asset.objects.exclude(name__isnull=True)
    # Fetch just the two columns we need and stream the result set via
    # ``.iterator()`` rather than caching the whole table in memory;
    # a per-row ``save`` restricted to ``name`` still works on these.
    for row in named_assets.only('asset_id', 'name').iterator():
        fixed = _repair_mojibake(row.name)
        if fixed == row.name:
            # Already clean — avoid a pointless UPDATE.
            continue
        row.name = fixed
        row.save(update_fields=['name'])

Comment thread
vpetersson marked this conversation as resolved.

class Migration(migrations.Migration):
    """Data-only migration that repairs double-encoded ``Asset.name`` values."""

    dependencies = [
        ('anthias_app', '0006_asset_metadata'),
    ]

    operations = [
        # The reverse direction is an explicit no-op: unapplying this
        # migration leaves the (already repaired) names as they are.
        migrations.RunPython(
            _repair_names, reverse_code=migrations.RunPython.noop
        ),
    ]
55 changes: 55 additions & 0 deletions src/anthias_server/app/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,41 @@ def clamp_refresh_interval(value: Any) -> int:
return max(0, min(interval, REFRESH_INTERVAL_S_MAX))


def repair_mojibake(text: str | None) -> str | None:
    """Undo the classic ``UTF-8 bytes decoded as Latin-1`` mojibake.

    A misbehaving uploader that double-encodes a filename turns
    ``Formulários`` into ``FormulÃ¡rios`` (the UTF-8 bytes ``\\xc3\\xa1``
    of ``á`` read back as the two Latin-1 chars ``Ã`` + ``¡``). Anthias
    itself never does this — Django's multipart parser and DRF both
    decode as UTF-8 — but the corrupted text arrives already mangled in
    the request body and we would otherwise store it verbatim, so the
    operator sees garbled asset names in the UI and in the viewer's
    ``Showing asset …`` log line.

    The repair is deliberately conservative and deterministic: it only
    fires when *every* character is in the Latin-1 range (so
    ``encode('latin-1')`` succeeds) **and** those bytes form a valid
    UTF-8 string. That is a strong heuristic for double-encoded UTF-8,
    but not a proof: a name that is *genuinely* Latin-1 yet whose bytes
    also happen to be valid UTF-8 (e.g. ``Â©`` → ``©``) is
    indistinguishable from mojibake and gets rewritten too. Such
    collisions are vanishingly rare in real asset filenames, and the
    alternative — leaving every ``FormulÃ¡rios`` garbled — is worse, so
    we accept the trade-off.

    Correctly-stored ``Formulários``, ``Café``, or ``日本語`` raise on the
    encode or decode step and are returned untouched; empty strings and
    ``None`` are returned as-is; pure-ASCII text round-trips to an equal
    string. Idempotent: re-running on already-repaired text is a no-op.
    """
    if not text:
        return text
    try:
        # Latin-1 maps code points 0-255 straight back to the bytes the
        # uploader originally sent; decoding those as UTF-8 restores
        # the intended text.
        return text.encode('latin-1').decode('utf-8')
    except (UnicodeEncodeError, UnicodeDecodeError):
        # Not double-encoded UTF-8 — store exactly what we were given.
        return text


def generate_asset_id() -> str:
    """Return a fresh random identifier: the hex form of a new UUID4."""
    fresh_uuid = uuid.uuid4()
    return fresh_uuid.hex

Expand Down Expand Up @@ -84,6 +119,26 @@ class Meta:
def __str__(self) -> str:
    """Use the instance's ``name``, coerced to ``str``, as its text form."""
    label = self.name
    return str(label)

def save(self, *args: Any, **kwargs: Any) -> None:
    """Persist the instance, cleaning double-encoded UTF-8 out of ``name``.

    Acts as the single write-side chokepoint: whichever create/update
    path ends up here stores a clean name even when an upstream client
    double-encoded the filename. ``repair_mojibake`` leaves
    correctly-encoded text alone, so running it on every such save is
    safe.

    The repair only runs when ``name`` is actually going to be written,
    i.e. when no ``update_fields`` keyword was given or it includes
    ``'name'``. For saves limited to other columns, touching
    ``self.name`` would change the in-memory object without updating
    the row, leaving the two out of sync and the DB value still
    unrepaired.
    """
    fields = kwargs.get('update_fields')
    name_will_be_written = fields is None or 'name' in fields
    if name_will_be_written:
        self.name = repair_mojibake(self.name)
    super().save(*args, **kwargs)
Comment thread
vpetersson marked this conversation as resolved.

def get_play_days(self) -> list[int]:
"""Parse play_days into a sorted, deduped list of ints 1-7.

Expand Down
115 changes: 115 additions & 0 deletions tests/test_mojibake_repair.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
"""Tests for the double-encoded-UTF-8 (mojibake) repair on asset names.

Covers the pure helper, the write-side ``Asset.save`` guardrail (which
catches new uploads from any API version or the web form), and the
data-migration logic that fixes rows stored before the guardrail
existed.
"""

import pytest

from anthias_server.app.models import Asset, repair_mojibake

# Canonical mojibake fixture: the UTF-8 bytes of ``Formulários`` read
# back as Latin-1, i.e. the classic double-encoding corruption. Building
# it from the clean name keeps the intent obvious and avoids sprinkling
# non-ASCII escapes through the assertions.
GARBLED = str('Formulários'.encode('utf-8'), 'latin-1')


@pytest.mark.parametrize(
    'given, expected',
    [
        # Genuine mojibake — the one case we repair.
        (GARBLED, 'Formulários'),
        # Correctly-encoded text must survive untouched: a multi-byte
        # accent, a name with several Latin-1 accents, and CJK each
        # raise on the encode or decode step and short-circuit.
        ('Formulários', 'Formulários'),
        ('Café Über señor', 'Café Über señor'),
        ('日本語', '日本語'),
        # ASCII / empty / None are no-ops.
        ('Plain Name 2', 'Plain Name 2'),
        ('', ''),
        (None, None),
        # A lone Latin-1 lead byte is not valid UTF-8 once re-decoded,
        # so it is left alone rather than mangled.
        ('Ã', 'Ã'),
        # Documented false positive: a genuinely Latin-1 two-character
        # name U+00C2 U+00A9 has bytes that are also valid UTF-8 (the
        # single character U+00A9), so it is indistinguishable from
        # mojibake and gets rewritten. Written with escapes so the two
        # input code points cannot be silently collapsed into one.
        # Accepted trade-off — see ``repair_mojibake``'s docstring.
        ('\u00c2\u00a9', '\u00a9'),
    ],
)
def test_repair_mojibake(given: str | None, expected: str | None) -> None:
    """Check the pure helper against repairable and untouched inputs."""
    assert repair_mojibake(given) == expected


def test_repair_mojibake_is_idempotent() -> None:
    """A second repair pass over repaired text must change nothing."""
    first_pass = repair_mojibake(GARBLED)
    second_pass = repair_mojibake(first_pass)
    assert second_pass == first_pass
    assert first_pass == 'Formulários'


@pytest.mark.django_db
def test_save_repairs_mojibake_name() -> None:
    """Creating an asset with a garbled name stores the repaired name."""
    stored = Asset.objects.create(name=GARBLED, mimetype='image')
    # Re-read from the database so the assertion checks the persisted
    # value, not just the in-memory attribute.
    stored.refresh_from_db()
    assert stored.name == 'Formulários'


@pytest.mark.django_db
def test_save_leaves_clean_name_untouched() -> None:
    """A correctly-encoded accented name is persisted exactly as given."""
    clean_name = 'Café Über señor'
    stored = Asset.objects.create(name=clean_name, mimetype='image')
    stored.refresh_from_db()
    assert stored.name == 'Café Über señor'


@pytest.mark.django_db
def test_save_skips_repair_when_name_not_in_update_fields() -> None:
    """A save restricted to other columns must leave ``name`` alone.

    When ``update_fields`` omits ``name`` the column is not written, so
    a repair of ``self.name`` would only make the in-memory instance
    disagree with the stored row. The guardrail therefore skips the
    repair and the stored value stays as it was.
    """
    row = Asset.objects.create(name='placeholder', mimetype='image')
    # ``QuerySet.update`` bypasses ``save``, which lets us plant a
    # garbled name in the database.
    Asset.objects.filter(pk=row.pk).update(name=GARBLED)
    row.refresh_from_db()

    row.metadata = {'foo': 'bar'}
    row.save(update_fields=['metadata'])

    # The in-memory name is unchanged ...
    assert row.name == GARBLED
    # ... and so is the value held by the database.
    row.refresh_from_db()
    assert row.name == GARBLED


@pytest.mark.django_db
def test_migration_repairs_existing_rows() -> None:
    """The migration's repair pass fixes rows that are already garbled.

    ``Asset.save`` cleans names on write, so the garbled value is
    planted with ``QuerySet.update`` (which bypasses ``save``) to give
    the migration a genuinely *stored* mojibake row to work on.
    """
    import importlib

    # The module name starts with a digit, so it cannot be imported with
    # a regular ``import`` statement.
    repair_module = importlib.import_module(
        'anthias_server.app.migrations.0007_repair_mojibake_asset_names'
    )

    row = Asset.objects.create(name='placeholder', mimetype='image')
    Asset.objects.filter(pk=row.pk).update(name=GARBLED)

    class _StubApps:
        """Minimal stand-in for the registry: always hands back ``Asset``."""

        @staticmethod
        def get_model(app_label: str, model_name: str) -> type[Asset]:
            return Asset

    repair_module._repair_names(_StubApps(), None)

    row.refresh_from_db()
    assert row.name == 'Formulários'