exo-explore · team-wcv · May 7, 2026 · May 10, 2026
diff --git a/.gitignore b/.gitignore
@@ -40,4 +40,3 @@ bench/**/*.json
 tmp/models
 /build/exo
 /.claude/skills
-/.claude
diff --git a/.mlx_typings/mlx_lm/models/cache.pyi b/.mlx_typings/mlx_lm/models/cache.pyi
@@ -148,18 +148,21 @@ class QuantizedKVCache(_BaseCache):
         ...
 
 class KVCache(_BaseCache):
-    step = ...
+    step: int
+    keys: mx.array | None
+    values: mx.array | None
+    _idx: int
     def __init__(self) -> None: ...
-    def update_and_fetch(self, keys, values):  # -> tuple[array | Any, array | Any]:
-        ...
+    def update_and_fetch(
+        self, keys: mx.array, values: mx.array
+    ) -> tuple[mx.array, mx.array]: ...
     @property
     def state(
         self,
     ) -> tuple[mx.array | None, mx.array | None]: ...
     @state.setter
-    def state(self, v) -> None: ...
-    def is_trimmable(self):  # -> Literal[True]:
-        ...
+    def state(self, v: tuple[mx.array | None, mx.array | None]) -> None: ...
+    def is_trimmable(self) -> bool: ...
     def trim(self, n: int) -> int: ...
     def to_quantized(
         self, group_size: int = ..., bits: int = ...
@@ -169,20 +172,19 @@ class KVCache(_BaseCache):
     ) -> mx.array | Literal["causal"] | None: ...
 
 class RotatingKVCache(_BaseCache):
-    step = ...
+    step: int
     keys: mx.array | None
     values: mx.array | None
     keep: int
     max_size: int
     _idx: int
-    def __init__(self, max_size, keep=...) -> None: ...
+    def __init__(self, max_size: int, keep: int = ...) -> None: ...
     def _trim(
         self, trim_size: int, v: mx.array, append: mx.array | None = ...
     ) -> mx.array: ...
     def update_and_fetch(
-        self, keys, values
-    ):  # -> tuple[array | Any, array | Any] | tuple[array | Any, array | Any | None]:
-        ...
+        self, keys: mx.array, values: mx.array
+    ) -> tuple[mx.array, mx.array]: ...
     @property
     def state(
         self,

diff --git a/.mlx_typings/mlx_lm/models/gemma4_text.pyi b/.mlx_typings/mlx_lm/models/gemma4_text.pyi
@@ -10,37 +10,37 @@ from .switch_layers import SwitchGLU
 
 @dataclass
 class ModelArgs(BaseModelArgs):
-    model_type: str
-    hidden_size: int
-    num_hidden_layers: int
-    intermediate_size: int
-    num_attention_heads: int
-    head_dim: int
-    global_head_dim: int
-    global_partial_rotary_factor: float
-    rms_norm_eps: float
-    vocab_size: int
-    vocab_size_per_layer_input: int
-    num_key_value_heads: int
-    num_global_key_value_heads: Optional[int]
-    num_kv_shared_layers: int
-    pad_token_id: int
-    hidden_size_per_layer_input: int
-    rope_traditional: bool
-    partial_rotary_factor: float
-    rope_parameters: Optional[Dict[str, Any]]
-    sliding_window: int
-    sliding_window_pattern: int
-    max_position_embeddings: int
-    attention_k_eq_v: bool
-    final_logit_softcapping: float
-    use_double_wide_mlp: bool
-    enable_moe_block: bool
-    num_experts: Optional[int]
-    top_k_experts: Optional[int]
-    moe_intermediate_size: Optional[int]
-    layer_types: Optional[List[str]]
-    tie_word_embeddings: bool
+    model_type: str = ...
+    hidden_size: int = ...
+    num_hidden_layers: int = ...
+    intermediate_size: int = ...
+    num_attention_heads: int = ...
+    head_dim: int = ...
+    global_head_dim: int = ...
+    global_partial_rotary_factor: float = ...
+    rms_norm_eps: float = ...
+    vocab_size: int = ...
+    vocab_size_per_layer_input: int = ...
+    num_key_value_heads: int = ...
+    num_global_key_value_heads: Optional[int] = ...
+    num_kv_shared_layers: int = ...
+    pad_token_id: int = ...
+    hidden_size_per_layer_input: int = ...
+    rope_traditional: bool = ...
+    partial_rotary_factor: float = ...
+    rope_parameters: Optional[Dict[str, Any]] = ...
+    sliding_window: int = ...
+    sliding_window_pattern: int = ...
+    max_position_embeddings: int = ...
+    attention_k_eq_v: bool = ...
+    final_logit_softcapping: float = ...
+    use_double_wide_mlp: bool = ...
+    enable_moe_block: bool = ...
+    num_experts: Optional[int] = ...
+    top_k_experts: Optional[int] = ...
+    moe_intermediate_size: Optional[int] = ...
+    layer_types: Optional[List[str]] = ...
+    tie_word_embeddings: bool = ...
 
     def __post_init__(self) -> None: ...
 

diff --git a/Cargo.lock b/Cargo.lock
diff --git a/bench/eval_tool_calls.py b/bench/eval_tool_calls.py
@@ -15,8 +15,9 @@
 from typing import Any, Literal
 
 import httpx
-from exo_tools.client import ExoClient, ExoHttpError
-from exo_tools.harness import (
+from harness import (
+    ExoClient,
+    ExoHttpError,
     add_common_instance_args,
     capture_cluster_snapshot,
     instance_id_from_instance,

diff --git a/bench/exo_bench.py b/bench/exo_bench.py
@@ -30,8 +30,9 @@
 from statistics import mean
 from typing import Any
 
-from exo_tools.client import ExoClient, ExoHttpError
-from exo_tools.harness import (
+from harness import (
+    ExoClient,
+    ExoHttpError,
     add_common_instance_args,
     capture_cluster_snapshot,
     find_existing_instance,

diff --git a/bench/exo_eval.py b/bench/exo_eval.py
@@ -42,8 +42,9 @@
 from typing import Any
 
 import httpx
-from exo_tools.client import ExoClient, ExoHttpError
-from exo_tools.harness import (
+from harness import (
+    ExoClient,
+    ExoHttpError,
     add_common_instance_args,
     capture_cluster_snapshot,
     find_existing_instance,