Lightning-AI · bhimrazy · Jun 11, 2026 · Jun 15, 2026 · Jun 15, 2026 · Jun 15, 2026
@@ -120,7 +120,10 @@ jobs:
       - name: Run tests
         env:
           HF_TOKEN: ${{ secrets.HF_TOKEN }}
-        run: pytest -v litgpt/ tests/ --timeout=180 --durations=100
+        # TEMP: scoped to test_tokenizer.py only to iterate on the fixture fallback in CI.
+        # `-rs` prints skip reasons; `-s` surfaces the [fixtures] resolution markers.
+        # Revert to `pytest -v litgpt/ tests/ --timeout=180 --durations=100` before merging.
+        run: pytest -sv -rs tests/test_tokenizer.py --timeout=180 --durations=100
 
       - name: Show cache
         run: uvx py-tree -d 1 .cache-HF

@@ -24,3 +24,6 @@ events.out.tfevents*
 **/custom_finetuning_dataset.json
 client.py
 **/custom_texts/
+
+# staging dir for tests/publish_fixtures.py
+litgpt-ci-fixtures/
@@ -74,6 +74,7 @@ optional-dependencies.extra = [
 ]
 optional-dependencies.test = [
   "einops>=0.7",
+  "litmodels>=0.1.8",         # CI-safe tokenizer/config fixtures for gated HF repos (no secrets needed)
   "protobuf>=4.23.4",
   "pytest>=8.1.1",
   "pytest-benchmark>=5.1",

@@ -0,0 +1,178 @@
+# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file.
+"""CI-safe resolution of tokenizer/config assets for parity tests.
+
+Downloads from Hugging Face, falling back to a public Lightning Model Registry mirror for
+gated repos when `HF_TOKEN` is unavailable (e.g. fork PRs), and skips when neither works.
+"""
+
+import os
+import shutil
+from pathlib import Path
+
+import pytest
+from huggingface_hub import snapshot_download
+from huggingface_hub.errors import GatedRepoError
+from transformers import AutoTokenizer
+
+# Tokenizer/config files mirrored for CI. This must stay a superset of what litgpt's `Tokenizer`
+# reads (tokenizer.json or tokenizer.model, tokenizer_config.json, generation_config.json)
+# and contain enough for `AutoTokenizer.from_pretrained(local_dir)` to load the mirror.
+TOKENIZER_FILES = (
+    "tokenizer.json",
+    "tokenizer.model",
+    "tokenizer_config.json",
+    "generation_config.json",
+    "special_tokens_map.json",
+    "added_tokens.json",
+    "vocab.json",
+    "merges.txt",
+    "config.json",
+)
+
+# At least one of these must be present after a HF download; otherwise `_populate_from_hf` raises.
+_REQUIRED_TOKENIZER_FILES = ("tokenizer.json", "tokenizer.model")
+
+_FIXTURE_TEAMSPACE = "lightning-ai/oss-litgpt"  # registry namespace prefix used by `fixture_name`
+_FIXTURE_VERSION = "v1"  # version tag appended to every fixture name
+
+# Hugging Face repos that are public but gated behind license acceptance (verified against
+# the unauthenticated HF model API on 2026-06-10). Gating status changes over time, so regenerate
+# this periodically. Each entry becomes a key of `HF_TO_LIGHTNING_TOKENIZER_FIXTURE`.
+GATED_TOKENIZER_REPOS = (
+    "stabilityai/stablecode-instruct-alpha-3b",
+    "tiiuae/falcon-180B",
+    "tiiuae/falcon-180B-chat",
+    "meta-llama/Llama-2-7b-hf",
+    "meta-llama/Llama-2-7b-chat-hf",
+    "meta-llama/Llama-2-13b-hf",
+    "meta-llama/Llama-2-13b-chat-hf",
+    "meta-llama/Llama-2-70b-hf",
+    "meta-llama/Llama-2-70b-chat-hf",
+    "meta-llama/Meta-Llama-3-8B",
+    "meta-llama/Meta-Llama-3-8B-Instruct",
+    "meta-llama/Meta-Llama-3.1-8B",
+    "meta-llama/Meta-Llama-3.1-8B-Instruct",
+    "meta-llama/Meta-Llama-3-70B",
+    "meta-llama/Meta-Llama-3-70B-Instruct",
+    "meta-llama/Meta-Llama-3.1-70B",
+    "meta-llama/Meta-Llama-3.1-70B-Instruct",
+    "meta-llama/Meta-Llama-3.1-405B",
+    "meta-llama/Meta-Llama-3.1-405B-Instruct",
+    "meta-llama/Llama-3.2-1B",
+    "meta-llama/Llama-3.2-1B-Instruct",
+    "meta-llama/Llama-3.2-3B",
+    "meta-llama/Llama-3.2-3B-Instruct",
+    "meta-llama/Llama-3.3-70B-Instruct",
+    "google/gemma-2b",
+    "google/gemma-7b",
+    "google/gemma-2-2b",
+    "google/gemma-2-9b",
+    "google/gemma-2-27b",
+    "google/gemma-2b-it",
+    "google/gemma-7b-it",
+    "google/gemma-2-2b-it",
+    "google/gemma-2-9b-it",
+    "google/gemma-2-27b-it",
+    "google/gemma-3-1b-it",
+    "google/gemma-3-4b-it",
+    "google/gemma-3-12b-it",
+    "google/gemma-3-27b-it",
+    "google/codegemma-7b-it",
+    "mistralai/Mistral-Large-Instruct-2407",
+)
+
+
+def fixture_slug(repo_id: str) -> str:
+    """Maps a HF repo id to a registry-safe name component (lowercased, "/" -> "--", "." -> "-")."""
+    return repo_id.lower().replace("/", "--").replace(".", "-")
+
+
+def fixture_name(repo_id: str) -> str:
+    """Returns the pinned registry name `<teamspace>/<slug>-tokenizer:<version>` for a repo's tokenizer mirror."""
+    return f"{_FIXTURE_TEAMSPACE}/{fixture_slug(repo_id)}-tokenizer:{_FIXTURE_VERSION}"
+
+
+# Version-pinned map from each gated HF repo id to its Lightning Registry mirror name. Do not
+# use floating/latest versions in CI; bump `_FIXTURE_VERSION` when re-uploading fixtures.
+HF_TO_LIGHTNING_TOKENIZER_FIXTURE = {repo: fixture_name(repo) for repo in GATED_TOKENIZER_REPOS}
+
+
+def _is_hf_auth_error(ex: Exception) -> bool:
+    """Returns True if `ex` is a `GatedRepoError` or carries a `response` with HTTP status 401 or 403."""
+    if isinstance(ex, GatedRepoError):
+        return True
+    status = getattr(getattr(ex, "response", None), "status_code", None)  # None if `ex` has no `response`
+    return status in (401, 403)
+
+
+def _populate_from_hf(repo_id: str, model_dir: Path) -> None:
+    """Downloads `TOKENIZER_FILES` from HF into `model_dir`; raises `ConnectionError` if no tokenizer file arrives."""
+    # `snapshot_download` raises `GatedRepoError` for gated repos so the caller can fall back
+    # to the mirror, unlike transformers' `cached_file` which wraps it in a bare `OSError`.
+    snapshot_download(
+        repo_id,
+        local_dir=model_dir,
+        allow_patterns=list(TOKENIZER_FILES),
+        token=os.getenv("HF_TOKEN"),
+    )
+    present = {p.name for p in model_dir.iterdir()}  # top-level entry names only
+    if not any(name in present for name in _REQUIRED_TOKENIZER_FILES):
+        raise ConnectionError(f"Unable to download any tokenizer files from HF for {repo_id}")
+    print(f"[fixtures] {repo_id}: resolved via Hugging Face", flush=True)
+
+
+def _populate_from_lightning_registry(repo_id: str, model_dir: Path) -> None:
+    """Replaces `model_dir` with the registry mirror for a gated repo, or skips the test if it is unavailable."""
+    fixture = HF_TO_LIGHTNING_TOKENIZER_FIXTURE.get(repo_id)
+    if fixture is None:
+        pytest.skip(
+            f"{repo_id} is gated on Hugging Face and HF_TOKEN is unavailable; "
+            "no Lightning Model Registry fixture is mapped for it."
+        )
+    try:
+        from litmodels import download_model
+    except ImportError:
+        pytest.skip(f"{repo_id} is gated and `litmodels` is not installed for the registry fallback.")
+
+    if model_dir.exists():  # drop anything already in `model_dir` before fetching the mirror
+        shutil.rmtree(model_dir)
+    model_dir.mkdir(parents=True, exist_ok=True)
+    try:
+        download_model(name=fixture, download_dir=str(model_dir))
+        print(f"[fixtures] {repo_id}: resolved via Lightning Model Registry fallback ({fixture})", flush=True)
+    except Exception as ex:
+        # `prepare_reference_tokenizer` gets here only after HF refused the download as gated or
+        # unauthorized, i.e. without a usable HF_TOKEN (e.g. fork PRs). A missing or unreachable mirror
+        # should skip gracefully rather than fail the job; internal/main runs have HF_TOKEN and never get here.
+        print(
+            f"[fixtures] {repo_id}: failed to resolve from Lightning Model Registry fallback ({fixture}): {ex}",
+            flush=True,
+        )
+        pytest.skip(f"Could not fetch Lightning Model Registry fixture '{fixture}' for {repo_id}: {ex}")
+
+
+def prepare_reference_tokenizer(repo_id: str, model_dir: Path) -> AutoTokenizer:
+    """Populates `model_dir` with tokenizer/config files and returns the reference HF tokenizer.
+
+    Args:
+        repo_id: The Hugging Face repo id to resolve, e.g. `EleutherAI/pythia-14m`.
+        model_dir: Directory to (re)create and populate with the resolved files.
+
+    Returns:
+        The reference tokenizer: loaded from `repo_id` on HF, or from `model_dir` after the mirror fallback.
+    """
+    model_dir = Path(model_dir)
+    if model_dir.exists():
+        shutil.rmtree(model_dir)
+    model_dir.mkdir(parents=True, exist_ok=True)
+
+    try:
+        _populate_from_hf(repo_id, model_dir)
+        return AutoTokenizer.from_pretrained(repo_id, token=os.getenv("HF_TOKEN"))
+    except Exception as ex:
+        if not _is_hf_auth_error(ex):
+            raise  # not a gating/auth refusal: surface the real failure instead of falling back
+
+    # Gated repo without a usable HF_TOKEN: use the CI mirror instead.
+    _populate_from_lightning_registry(repo_id, model_dir)
+    return AutoTokenizer.from_pretrained(model_dir)
@@ -0,0 +1,55 @@
+# Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file.
+"""Maintainer-only: publishes gated HF tokenizer/config fixtures to the Lightning Model Registry."""
+
+import argparse
+from pathlib import Path
+
+import litdata
+from _fixtures import GATED_TOKENIZER_REPOS, TOKENIZER_FILES, fixture_name, fixture_slug
+from huggingface_hub import snapshot_download
+
+
+def publish_fixture(repo_id: str, output_dir: str) -> None:
+    """Downloads `repo_id`'s tokenizer/config files under `output_dir` and uploads them as a pinned registry model."""
+    from litmodels import upload_model
+
+    fixture_dir = Path(output_dir) / f"{fixture_slug(repo_id)}-tokenizer"  # one staging subdirectory per repo
+    snapshot_download(repo_id, local_dir=fixture_dir, allow_patterns=list(TOKENIZER_FILES))
+    upload_model(name=fixture_name(repo_id), model=str(fixture_dir), progress_bar=False, verbose=0)
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(description=__doc__)  # module docstring doubles as the CLI description
+    parser.add_argument(
+        "repos",
+        nargs="*",
+        default=list(GATED_TOKENIZER_REPOS),
+        help="HF repo ids to publish (default: all gated repos in the fixture map).",
+    )
+    parser.add_argument(
+        "--staging-dir",
+        default="litgpt-ci-fixtures",
+        help="Local directory to stage downloaded files before upload.",
+    )
+    parser.add_argument(
+        "--workers",
+        type=int,
+        default=4,
+        help="How many repos to publish in parallel (downloads/uploads are I/O bound).",
+    )
+    args = parser.parse_args()
+
+    unknown = [r for r in args.repos if r not in GATED_TOKENIZER_REPOS]  # reject repos outside the fixture map
+    if unknown:
+        parser.error(f"Not in the gated fixture map: {unknown}")
+
+    litdata.map(  # presumably calls publish_fixture(repo, output_dir) per input — confirm against litdata docs
+        fn=publish_fixture,
+        inputs=list(args.repos),
+        output_dir=args.staging_dir,
+        num_workers=args.workers,
+    )
+
+
+if __name__ == "__main__":
+    main()
@@ -1,15 +1,11 @@
 # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file.
-import os
-import shutil
-import warnings
 from types import SimpleNamespace
 from unittest import mock
 
 import pytest
+from _fixtures import prepare_reference_tokenizer
 from tokenizers import Tokenizer as HFTokenizer
 from tokenizers.models import BPE
-from transformers import AutoTokenizer
-from transformers.utils import cached_file
 
 import litgpt.config as config_module
 from litgpt import PromptStyle, Tokenizer
@@ -22,28 +18,12 @@ def test_tokenizer_against_hf(config, tmp_path):
     config = config_module.Config(**config)
 
     repo_id = f"{config.hf_config['org']}/{config.hf_config['name']}"
-    theirs = AutoTokenizer.from_pretrained(repo_id, token=os.getenv("HF_TOKEN"))
-
-    # create a checkpoint directory that points to the HF files
-    hf_files = {}
-    for filename in ("tokenizer.json", "generation_config.json", "tokenizer.model", "tokenizer_config.json"):
-        try:  # download the HF tokenizer config
-            hf_file = cached_file(path_or_repo_id=repo_id, filename=filename)
-            hf_files[filename] = str(hf_file)
-        except Exception as ex:
-            warnings.warn(str(ex), RuntimeWarning)
-    if "tokenizer.json" not in hf_files and "tokenizer.model" not in hf_files:
-        raise ConnectionError("Unable to download any tokenizer files from HF")
-
-    # Create a clean, model-specific subdirectory for this test run.
-    # This avoids errors if previous runs or retries left files behind, ensuring the directory is always ready for fresh downloads and comparisons.
-    model_dir = tmp_path / config.hf_config["name"]
-    if model_dir.exists():
-        shutil.rmtree(model_dir)
-    os.makedirs(model_dir, exist_ok=True)
 
-    for filename, hf_file in hf_files.items():
-        shutil.copy(hf_file, model_dir / filename)
+    # Populate a clean, model-specific subdirectory with tokenizer/config files and get the
+    # reference HF tokenizer. Falls back to the Lightning Model Registry for gated repos
+    # without HF_TOKEN (e.g. fork PRs), and skips when no mirror exists for a gated repo.
+    model_dir = tmp_path / config.hf_config["name"]
+    theirs = prepare_reference_tokenizer(repo_id, model_dir)
 
     ours = Tokenizer(model_dir)