Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion .github/workflows/cpu-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,10 @@ jobs:
- name: Run tests
env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
run: pytest -v litgpt/ tests/ --timeout=180 --durations=100
# TEMP: scoped to test_tokenizer.py only to iterate on the fixture fallback in CI.
# `-rs` prints skip reasons; `-s` surfaces the [fixtures] resolution markers.
# Revert to `pytest -v litgpt/ tests/ --timeout=180 --durations=100` before merging.
run: pytest -sv -rs tests/test_tokenizer.py --timeout=180 --durations=100

- name: Show cache
run: uvx py-tree -d 1 .cache-HF
Expand Down
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -24,3 +24,6 @@ events.out.tfevents*
**/custom_finetuning_dataset.json
client.py
**/custom_texts/

# staging dir for tests/publish_fixtures.py
litgpt-ci-fixtures/
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ optional-dependencies.extra = [
]
optional-dependencies.test = [
"einops>=0.7",
"litmodels>=0.1.8", # CI-safe tokenizer/config fixtures for gated HF repos (no secrets needed)
"protobuf>=4.23.4",
"pytest>=8.1.1",
"pytest-benchmark>=5.1",
Expand Down
178 changes: 178 additions & 0 deletions tests/_fixtures.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,178 @@
# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file.
"""CI-safe resolution of tokenizer/config assets for parity tests.

Downloads from Hugging Face, falling back to a public Lightning Model Registry mirror for
gated repos when `HF_TOKEN` is unavailable (e.g. fork PRs), and skips when neither works.
"""

import os
import shutil
from pathlib import Path

import pytest
from huggingface_hub import snapshot_download
from huggingface_hub.errors import GatedRepoError
from transformers import AutoTokenizer

# Tokenizer/config files mirrored for CI. This must stay a superset of what litgpt's
# `Tokenizer` reads (tokenizer.json/model, tokenizer_config.json, generation_config.json)
# and contain enough for `AutoTokenizer.from_pretrained(local_dir)` to load the mirror.
# The tuple is passed as `allow_patterns` to `snapshot_download`, so only these file
# names are fetched from a repo; files a repo does not ship are simply absent.
TOKENIZER_FILES = (
    "tokenizer.json",
    "tokenizer.model",
    "tokenizer_config.json",
    "generation_config.json",
    "special_tokens_map.json",
    "added_tokens.json",
    "vocab.json",
    "merges.txt",
    "config.json",
)

# At least one of these is required to build a tokenizer. `_populate_from_hf` raises
# `ConnectionError` when a download yields neither of them.
_REQUIRED_TOKENIZER_FILES = ("tokenizer.json", "tokenizer.model")

# Components of the pinned registry name built by `fixture_name`:
# "<teamspace>/<slug>-tokenizer:<version>".
_FIXTURE_TEAMSPACE = "lightning-ai/oss-litgpt"
_FIXTURE_VERSION = "v1"

# Hugging Face repos that are public but gated behind license acceptance
# (verified against the unauthenticated HF model API on 2026-06-10). Keep this in
# sync by regenerating periodically; gating status changes over time.
# Every entry gets a registry mirror name in `HF_TO_LIGHTNING_TOKENIZER_FIXTURE`; a gated
# repo that is missing from this tuple has no mirror mapping and its test is skipped.
GATED_TOKENIZER_REPOS = (
    "stabilityai/stablecode-instruct-alpha-3b",
    "tiiuae/falcon-180B",
    "tiiuae/falcon-180B-chat",
    "meta-llama/Llama-2-7b-hf",
    "meta-llama/Llama-2-7b-chat-hf",
    "meta-llama/Llama-2-13b-hf",
    "meta-llama/Llama-2-13b-chat-hf",
    "meta-llama/Llama-2-70b-hf",
    "meta-llama/Llama-2-70b-chat-hf",
    "meta-llama/Meta-Llama-3-8B",
    "meta-llama/Meta-Llama-3-8B-Instruct",
    "meta-llama/Meta-Llama-3.1-8B",
    "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "meta-llama/Meta-Llama-3-70B",
    "meta-llama/Meta-Llama-3-70B-Instruct",
    "meta-llama/Meta-Llama-3.1-70B",
    "meta-llama/Meta-Llama-3.1-70B-Instruct",
    "meta-llama/Meta-Llama-3.1-405B",
    "meta-llama/Meta-Llama-3.1-405B-Instruct",
    "meta-llama/Llama-3.2-1B",
    "meta-llama/Llama-3.2-1B-Instruct",
    "meta-llama/Llama-3.2-3B",
    "meta-llama/Llama-3.2-3B-Instruct",
    "meta-llama/Llama-3.3-70B-Instruct",
    "google/gemma-2b",
    "google/gemma-7b",
    "google/gemma-2-2b",
    "google/gemma-2-9b",
    "google/gemma-2-27b",
    "google/gemma-2b-it",
    "google/gemma-7b-it",
    "google/gemma-2-2b-it",
    "google/gemma-2-9b-it",
    "google/gemma-2-27b-it",
    "google/gemma-3-1b-it",
    "google/gemma-3-4b-it",
    "google/gemma-3-12b-it",
    "google/gemma-3-27b-it",
    "google/codegemma-7b-it",
    "mistralai/Mistral-Large-Instruct-2407",
)


def fixture_slug(repo_id: str) -> str:
    """Turns a HF repo id into a lowercase, registry-safe name fragment.

    The id is lower-cased, every ``/`` becomes ``--`` and every ``.`` becomes ``-``.
    """
    # One translation pass; equivalent to the two chained replacements because the
    # "/" -> "--" substitution never introduces a ".".
    substitutions = str.maketrans({"/": "--", ".": "-"})
    lowered = repo_id.lower()
    return lowered.translate(substitutions)


def fixture_name(repo_id: str) -> str:
    """Builds the version-pinned Lightning Model Registry name of a repo's tokenizer mirror.

    The result has the form ``<teamspace>/<slug>-tokenizer:<version>``.
    """
    model = fixture_slug(repo_id) + "-tokenizer"
    return f"{_FIXTURE_TEAMSPACE}/{model}:{_FIXTURE_VERSION}"


# Explicit, version-pinned lookup from a gated HF repo id to the name of its Lightning
# Registry mirror. CI must never resolve floating/latest versions; bump
# `_FIXTURE_VERSION` whenever fixtures are re-uploaded.
HF_TO_LIGHTNING_TOKENIZER_FIXTURE = dict(zip(GATED_TOKENIZER_REPOS, map(fixture_name, GATED_TOKENIZER_REPOS)))


def _is_hf_auth_error(ex: Exception) -> bool:
    """Tells whether `ex` means HF refused the download for gating/authorization reasons.

    True for a `GatedRepoError`, or for any exception whose `response.status_code`
    is 401 or 403. Exceptions without a `response` attribute yield False.
    """
    if isinstance(ex, GatedRepoError):
        return True
    # Both lookups default to None so arbitrary exceptions are handled without raising.
    response = getattr(ex, "response", None)
    return getattr(response, "status_code", None) in (401, 403)


def _populate_from_hf(repo_id: str, model_dir: Path) -> None:
    """Downloads available tokenizer/config files from Hugging Face into `model_dir`.

    Args:
        repo_id: The Hugging Face repo id to download from.
        model_dir: Existing directory that receives the files listed in `TOKENIZER_FILES`.

    Raises:
        ConnectionError: If the download completed but produced neither `tokenizer.json`
            nor `tokenizer.model`.
        Exception: Whatever `snapshot_download` raises (e.g. `GatedRepoError`) is propagated
            unchanged so the caller can decide whether to fall back to the mirror.
    """
    # `snapshot_download` raises `GatedRepoError` for gated repos so the caller can fall back
    # to the mirror, unlike transformers' `cached_file` which wraps it in a bare `OSError`.
    snapshot_download(
        repo_id,
        local_dir=model_dir,
        allow_patterns=list(TOKENIZER_FILES),
        # CI exports HF_TOKEN from a secret, which is an empty string when the secret is
        # unavailable (e.g. fork PRs). Treat "" as "no token" so the request is anonymous
        # instead of carrying an explicit empty credential.
        token=os.getenv("HF_TOKEN") or None,
    )
    present = {p.name for p in model_dir.iterdir()}
    if present.isdisjoint(_REQUIRED_TOKENIZER_FILES):
        raise ConnectionError(f"Unable to download any tokenizer files from HF for {repo_id}")
    print(f"[fixtures] {repo_id}: resolved via Hugging Face", flush=True)


def _populate_from_lightning_registry(repo_id: str, model_dir: Path) -> None:
    """Fills `model_dir` from the registry mirror of a gated repo, skipping the test on failure.

    The calling test is skipped (never failed) when no mirror is mapped for `repo_id`,
    when `litmodels` cannot be imported, or when the download itself raises.

    Args:
        repo_id: The gated Hugging Face repo id whose mirror should be fetched.
        model_dir: Directory that is wiped and re-created before the download.
    """
    fixture = HF_TO_LIGHTNING_TOKENIZER_FIXTURE.get(repo_id)
    if fixture is None:
        pytest.skip(
            f"{repo_id} is gated on Hugging Face and HF_TOKEN is unavailable; "
            "no Lightning Model Registry fixture is mapped for it."
        )

    # `litmodels` is an optional test dependency, so import it only on this fallback path.
    try:
        from litmodels import download_model
    except ImportError:
        pytest.skip(f"{repo_id} is gated and `litmodels` is not installed for the registry fallback.")

    # Start from an empty directory so leftovers from the failed HF attempt cannot leak in.
    if model_dir.exists():
        shutil.rmtree(model_dir)
    model_dir.mkdir(parents=True, exist_ok=True)

    try:
        download_model(name=fixture, download_dir=str(model_dir))
        print(f"[fixtures] {repo_id}: resolved via Lightning Model Registry fallback ({fixture})", flush=True)
    except Exception as ex:
        # Only runs without HF_TOKEN (e.g. fork PRs) reach this point. A missing or
        # unreachable mirror must skip gracefully instead of failing the job; internal/main
        # runs have HF_TOKEN and never take this path.
        print(
            f"[fixtures] {repo_id}: failed to resolve from Lightning Model Registry fallback ({fixture}): {ex}",
            flush=True,
        )
        pytest.skip(f"Could not fetch Lightning Model Registry fixture '{fixture}' for {repo_id}: {ex}")


def prepare_reference_tokenizer(repo_id: str, model_dir: Path) -> AutoTokenizer:
    """Populates `model_dir` with tokenizer/config files and returns the reference HF tokenizer.

    Hugging Face is tried first. If that attempt fails with a gating/authorization error,
    the files are fetched from the Lightning Model Registry mirror instead and the
    reference tokenizer is loaded from the local directory.

    Args:
        repo_id: The Hugging Face repo id to resolve, e.g. `EleutherAI/pythia-14m`.
        model_dir: Directory to (re)create and populate with the resolved files.

    Returns:
        The reference tokenizer loaded via `AutoTokenizer.from_pretrained`, from the repo
        or from the registry mirror.

    Raises:
        Exception: Any Hugging Face failure that is not a gating/authorization error is
            re-raised unchanged.
    """
    model_dir = Path(model_dir)
    # Always start from an empty directory so retries never see stale files.
    if model_dir.exists():
        shutil.rmtree(model_dir)
    model_dir.mkdir(parents=True, exist_ok=True)

    # CI exports HF_TOKEN from a secret, which is an empty string when the secret is
    # unavailable (e.g. fork PRs). Normalise "" to None so no empty credential is sent.
    hf_token = os.getenv("HF_TOKEN") or None

    try:
        _populate_from_hf(repo_id, model_dir)
        return AutoTokenizer.from_pretrained(repo_id, token=hf_token)
    except Exception as ex:
        # Only gating/authorization failures may fall through to the mirror; anything
        # else (network outage, missing files, ...) is a real failure.
        if not _is_hf_auth_error(ex):
            raise

    # Gated repo without a usable HF_TOKEN: use the CI mirror instead.
    _populate_from_lightning_registry(repo_id, model_dir)
    return AutoTokenizer.from_pretrained(model_dir)
55 changes: 55 additions & 0 deletions tests/publish_fixtures.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file.
"""Maintainer-only: publishes gated HF tokenizer/config fixtures to the Lightning Model Registry."""

import argparse
from pathlib import Path

import litdata
from _fixtures import GATED_TOKENIZER_REPOS, TOKENIZER_FILES, fixture_name, fixture_slug
from huggingface_hub import snapshot_download


def publish_fixture(repo_id: str, output_dir: str) -> None:
    """Downloads `repo_id`'s tokenizer/config files and publishes them as a registry model.

    Args:
        repo_id: The Hugging Face repo id to mirror.
        output_dir: Staging directory; files are placed in `<output_dir>/<slug>-tokenizer`.

    Raises:
        FileNotFoundError: If the download produced neither `tokenizer.json` nor
            `tokenizer.model`, in which case nothing is uploaded.
    """
    # Imported here so the module can be loaded without `litmodels` installed.
    from _fixtures import _REQUIRED_TOKENIZER_FILES
    from litmodels import upload_model

    fixture_dir = Path(output_dir) / f"{fixture_slug(repo_id)}-tokenizer"
    snapshot_download(repo_id, local_dir=fixture_dir, allow_patterns=list(TOKENIZER_FILES))

    # Mirror the check done when tests download from HF: a fixture without any of the
    # required tokenizer files cannot be loaded, so refuse to publish it under a pinned name.
    if not any((fixture_dir / name).exists() for name in _REQUIRED_TOKENIZER_FILES):
        raise FileNotFoundError(
            f"No tokenizer file ({', '.join(_REQUIRED_TOKENIZER_FILES)}) was downloaded for {repo_id}; "
            f"refusing to publish {fixture_name(repo_id)}"
        )

    upload_model(name=fixture_name(repo_id), model=str(fixture_dir), progress_bar=False, verbose=0)


def main() -> None:
    """Parses the command line and publishes the selected fixtures in parallel.

    Exits through `parser.error` when a requested repo is not in `GATED_TOKENIZER_REPOS`.
    """
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument(
        "repos",
        nargs="*",
        default=list(GATED_TOKENIZER_REPOS),
        help="HF repo ids to publish (default: all gated repos in the fixture map).",
    )
    cli.add_argument(
        "--staging-dir", default="litgpt-ci-fixtures", help="Local directory to stage downloaded files before upload."
    )
    cli.add_argument(
        "--workers",
        type=int,
        default=4,
        help="How many repos to publish in parallel (downloads/uploads are I/O bound).",
    )
    options = cli.parse_args()

    # Only repos with a pinned mirror name may be published; reject everything else up front.
    unknown = []
    for repo in options.repos:
        if repo not in GATED_TOKENIZER_REPOS:
            unknown.append(repo)
    if unknown:
        cli.error(f"Not in the gated fixture map: {unknown}")

    litdata.map(
        fn=publish_fixture,
        inputs=list(options.repos),
        output_dir=options.staging_dir,
        num_workers=options.workers,
    )


# Run the publisher only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()
32 changes: 6 additions & 26 deletions tests/test_tokenizer.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,11 @@
# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file.
import os
import shutil
import warnings
from types import SimpleNamespace
from unittest import mock

import pytest
from _fixtures import prepare_reference_tokenizer
from tokenizers import Tokenizer as HFTokenizer
from tokenizers.models import BPE
from transformers import AutoTokenizer
from transformers.utils import cached_file

import litgpt.config as config_module
from litgpt import PromptStyle, Tokenizer
Expand All @@ -22,28 +18,12 @@ def test_tokenizer_against_hf(config, tmp_path):
config = config_module.Config(**config)

repo_id = f"{config.hf_config['org']}/{config.hf_config['name']}"
theirs = AutoTokenizer.from_pretrained(repo_id, token=os.getenv("HF_TOKEN"))

# create a checkpoint directory that points to the HF files
hf_files = {}
for filename in ("tokenizer.json", "generation_config.json", "tokenizer.model", "tokenizer_config.json"):
try: # download the HF tokenizer config
hf_file = cached_file(path_or_repo_id=repo_id, filename=filename)
hf_files[filename] = str(hf_file)
except Exception as ex:
warnings.warn(str(ex), RuntimeWarning)
if "tokenizer.json" not in hf_files and "tokenizer.model" not in hf_files:
raise ConnectionError("Unable to download any tokenizer files from HF")

# Create a clean, model-specific subdirectory for this test run.
# This avoids errors if previous runs or retries left files behind, ensuring the directory is always ready for fresh downloads and comparisons.
model_dir = tmp_path / config.hf_config["name"]
if model_dir.exists():
shutil.rmtree(model_dir)
os.makedirs(model_dir, exist_ok=True)

for filename, hf_file in hf_files.items():
shutil.copy(hf_file, model_dir / filename)
# Populate a clean, model-specific subdirectory with tokenizer/config files and get the
# reference HF tokenizer. Falls back to the Lightning Model Registry for gated repos
# without HF_TOKEN (e.g. fork PRs), and skips when no mirror exists for a gated repo.
model_dir = tmp_path / config.hf_config["name"]
theirs = prepare_reference_tokenizer(repo_id, model_dir)

ours = Tokenizer(model_dir)

Expand Down
Loading