Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
"""One-time repair of double-encoded UTF-8 in existing ``Asset.name``.

A misbehaving uploader that double-encodes a filename stores e.g.
``FormulÃ¡rios`` instead of ``Formulários`` (the UTF-8 bytes of ``á``
read back as the Latin-1 chars ``Ã`` + ``¡``). Anthias never produces
this itself, but it stored whatever the request body carried, so
already-uploaded assets keep the garbled name in the UI and in the
viewer's ``Showing asset …`` log line. ``Asset.save`` now repairs new
writes; this migration fixes the rows that pre-date that guardrail.

The repair logic is inlined rather than imported from
``anthias_server.app.models`` on purpose — migrations are frozen
snapshots of intent, and a future change to the model helper must not
retroactively alter what this one-time data fix did.

Idempotent and safe: a name is rewritten only when every character is
in the Latin-1 range *and* those bytes form a valid, different UTF-8
string — the unambiguous signature of double-encoded UTF-8. Correctly
stored names (``Formulários``, ``Café``, ``日本語``) raise on one of the
two steps and are left untouched, so a re-run changes nothing.
"""

from __future__ import annotations

from django.db import migrations


def _repair_mojibake(text): # type: ignore[no-untyped-def]
if not text:
return text
try:
repaired = text.encode('latin-1').decode('utf-8')
except (UnicodeEncodeError, UnicodeDecodeError):
return text
return repaired


def _repair_names(apps, schema_editor):  # type: ignore[no-untyped-def]
    """Rewrite every stored asset name that ``_repair_mojibake`` changes.

    ``apps`` is the registry handed to ``RunPython`` callables and is
    used to look up the ``Asset`` model of the ``anthias_app`` app.
    ``schema_editor`` is accepted to satisfy the ``RunPython`` signature
    and is not used.

    Rows whose ``name`` is NULL are excluded. A row is saved only when
    the repaired name differs from the stored one, and only the ``name``
    column is written.
    """
    asset_model = apps.get_model('anthias_app', 'Asset')
    named_assets = asset_model.objects.exclude(name__isnull=True)
    for asset in named_assets:
        fixed_name = _repair_mojibake(asset.name)
        if fixed_name == asset.name:
            # Already clean (or not repairable): leave the row alone.
            continue
        asset.name = fixed_name
        asset.save(update_fields=['name'])

Comment thread
vpetersson marked this conversation as resolved.

class Migration(migrations.Migration):
    """Data-only migration that repairs garbled ``Asset.name`` values."""

    # Runs after the previous migration of the same app.
    dependencies = [
        ('anthias_app', '0006_asset_metadata'),
    ]

    operations = [
        # Forward: repair the stored names. Reverse: a no-op, so the
        # migration can be unapplied; repaired names are not re-garbled.
        migrations.RunPython(
            _repair_names, reverse_code=migrations.RunPython.noop
        ),
    ]
45 changes: 45 additions & 0 deletions src/anthias_server/app/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,36 @@ def clamp_refresh_interval(value: Any) -> int:
return max(0, min(interval, REFRESH_INTERVAL_S_MAX))


def repair_mojibake(text: str | None) -> str | None:
"""Undo the classic ``UTF-8 bytes decoded as Latin-1`` mojibake.

A misbehaving uploader that double-encodes a filename turns
``Formulários`` into ``Formulários`` (the UTF-8 bytes ``\\xc3\\xa1``
of ``á`` read back as the two Latin-1 chars ``Ã`` + ``¡``). Anthias
itself never does this — Django's multipart parser and DRF both
decode as UTF-8 — but the corrupted text arrives already mangled in
the request body and we would otherwise store it verbatim, so the
operator sees garbled asset names in the UI and in the viewer's
``Showing asset …`` log line.

The repair is deliberately conservative and deterministic: it only
fires when *every* character is in the Latin-1 range (so
``encode('latin-1')`` round-trips) **and** those bytes form a valid,
*different* UTF-8 string. That combination is the unambiguous
signature of double-encoded UTF-8 — a correctly-stored
``Formulários``, ``Café``, or ``日本語`` raises on one of the two
steps and is returned untouched, so this can't corrupt good data.
Comment thread
vpetersson marked this conversation as resolved.
Outdated
Idempotent: re-running on already-repaired text is a no-op.
"""
if not text:
return text
try:
repaired = text.encode('latin-1').decode('utf-8')
except (UnicodeEncodeError, UnicodeDecodeError):
return text
return repaired
Comment thread
vpetersson marked this conversation as resolved.
Outdated


def generate_asset_id() -> str:
    """Return a new random identifier: the hex form of a version-4 UUID."""
    fresh_uuid = uuid.uuid4()
    return fresh_uuid.hex

Expand Down Expand Up @@ -84,6 +114,21 @@ class Meta:
def __str__(self) -> str:
return str(self.name)

def save(self, *args: Any, **kwargs: Any) -> None:
    """Persist the instance after cleaning double-encoded UTF-8 in ``name``.

    Every write that goes through ``save`` passes ``name`` through
    ``repair_mojibake`` first, so a client that double-encoded the
    filename cannot get a garbled name stored. The helper returns
    correctly-encoded text unchanged, so running it on every save,
    including saves that do not touch ``name``, is harmless.

    All positional and keyword arguments are forwarded to the parent
    ``save`` untouched.
    """
    cleaned_name = repair_mojibake(self.name)
    self.name = cleaned_name
    super().save(*args, **kwargs)
Comment thread
vpetersson marked this conversation as resolved.

def get_play_days(self) -> list[int]:
"""Parse play_days into a sorted, deduped list of ints 1-7.

Expand Down
88 changes: 88 additions & 0 deletions tests/test_mojibake_repair.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
"""Tests for the double-encoded-UTF-8 (mojibake) repair on asset names.

Covers the pure helper, the write-side ``Asset.save`` guardrail (which
catches new uploads from any API version or the web form), and the
data-migration logic that fixes rows stored before the guardrail
existed.
"""

import pytest

from anthias_server.app.models import Asset, repair_mojibake

# Canonical mojibake fixture: the UTF-8 bytes of ``Formulários`` read back
# as Latin-1, i.e. the corruption the repair is meant to undo. Building it
# here keeps non-ASCII escapes out of the assertions below.
GARBLED = bytes('Formulários', 'utf-8').decode('latin-1')


@pytest.mark.parametrize(
    'given, expected',
    [
        # Double-encoded input: the only shape that gets rewritten.
        (GARBLED, 'Formulários'),
        # Properly stored text comes back unchanged. A single accent,
        # several Latin-1 accents, and CJK each fail the encode or the
        # decode step inside the helper.
        ('Formulários', 'Formulários'),
        ('Café Über señor', 'Café Über señor'),
        ('日本語', '日本語'),
        # Nothing to repair in ASCII, empty, or missing names.
        ('Plain Name 2', 'Plain Name 2'),
        ('', ''),
        (None, None),
        # A UTF-8 lead byte with no continuation byte is invalid UTF-8
        # once re-decoded, so the text is kept rather than mangled.
        ('Ã', 'Ã'),
    ],
)
def test_repair_mojibake(given: str | None, expected: str | None) -> None:
    """``repair_mojibake`` fixes mojibake and leaves everything else alone."""
    actual = repair_mojibake(given)
    assert actual == expected


def test_repair_mojibake_is_idempotent() -> None:
    """A second pass over already-repaired text changes nothing."""
    first_pass = repair_mojibake(GARBLED)
    second_pass = repair_mojibake(first_pass)
    assert second_pass == first_pass == 'Formulários'


@pytest.mark.django_db
def test_save_repairs_mojibake_name() -> None:
    """Creating an asset with a garbled name stores the repaired name."""
    created = Asset.objects.create(name=GARBLED, mimetype='image')
    # Re-read from the database so the assertion checks what was stored.
    created.refresh_from_db()
    assert created.name == 'Formulários'


@pytest.mark.django_db
def test_save_leaves_clean_name_untouched() -> None:
    """A correctly-encoded accented name is stored exactly as given."""
    created = Asset.objects.create(name='Café Über señor', mimetype='image')
    # Re-read from the database so the assertion checks what was stored.
    created.refresh_from_db()
    assert created.name == 'Café Über señor'


@pytest.mark.django_db
def test_migration_repairs_existing_rows() -> None:
    """The migration's repair pass rewrites a garbled name already stored.

    ``Asset.save`` cleans names on write, so the garbled value is planted
    with ``QuerySet.update``, which bypasses ``save``. That leaves a
    genuinely *stored* mojibake row for the migration logic to fix.
    """
    import importlib

    class _StubApps:
        """Stand-in for the migration app registry; always yields ``Asset``."""

        @staticmethod
        def get_model(app_label: str, model_name: str) -> type[Asset]:
            return Asset

    row = Asset.objects.create(name='placeholder', mimetype='image')
    Asset.objects.filter(pk=row.pk).update(name=GARBLED)

    # The module name starts with a digit, so it cannot be imported with
    # a plain ``import`` statement.
    migration_module = importlib.import_module(
        'anthias_server.app.migrations.0007_repair_mojibake_asset_names'
    )
    migration_module._repair_names(_StubApps(), None)

    row.refresh_from_db()
    assert row.name == 'Formulários'