From cac59e3caa1f7adcde43ef07204588846e700ed9 Mon Sep 17 00:00:00 2001 From: Muddyblack Date: Wed, 20 May 2026 16:46:57 +0200 Subject: [PATCH 1/8] init multiserver plugin --- docs/plugins.md | 1 + docs/plugins/multiserver.md | 268 ++++++ netsim/extra/multiserver/defaults.yml | 47 + netsim/extra/multiserver/plugin.py | 842 ++++++++++++++++++ tests/topology/expected/multiserver-auto.yml | 358 ++++++++ .../expected/multiserver-explicit.yml | 382 ++++++++ tests/topology/input/multiserver-auto.yml | 45 + tests/topology/input/multiserver-explicit.yml | 54 ++ 8 files changed, 1997 insertions(+) create mode 100644 docs/plugins/multiserver.md create mode 100644 netsim/extra/multiserver/defaults.yml create mode 100644 netsim/extra/multiserver/plugin.py create mode 100644 tests/topology/expected/multiserver-auto.yml create mode 100644 tests/topology/expected/multiserver-explicit.yml create mode 100644 tests/topology/input/multiserver-auto.yml create mode 100644 tests/topology/input/multiserver-explicit.yml diff --git a/docs/plugins.md b/docs/plugins.md index ac3e94ae64..5c3e917dac 100644 --- a/docs/plugins.md +++ b/docs/plugins.md @@ -19,6 +19,7 @@ plugins/kind.md plugins/mlag.vtep.md plugins/multilab.md + plugins/multiserver.md plugins/node.clone.md plugins/ospf.areas.md plugins/vrrp.version.md diff --git a/docs/plugins/multiserver.md b/docs/plugins/multiserver.md new file mode 100644 index 0000000000..8503ae16f0 --- /dev/null +++ b/docs/plugins/multiserver.md @@ -0,0 +1,268 @@ +(plugin-multiserver)= +# Splitting Topologies Across Multiple Servers + +The *multiserver* plugin distributes a single *netlab* topology across multiple physical servers. It assigns nodes to servers, classifies links as local or cross-server, and generates a self-contained containerlab configuration directory for each server with VXLAN-based interconnects. + +```eval_rst +.. 
contents:: Table of Contents + :depth: 2 + :local: + :backlinks: none +``` + +```{warning} +* The *multiserver* plugin requires the **containerlab** provider on all servers. +* Containerlab version >= `0.46` is required for native VXLAN link endpoint support. +* All physical servers must have direct IP reachability (e.g. over a management network or dedicated interconnect). +``` + +## Using the Plugin + +* Add `plugin: [ multiserver ]` to lab topology. +* Define target servers in the **multiserver.servers** list. +* Choose an assignment mode (`explicit` or `auto`) with **multiserver.assignment**. + +The plugin runs during `netlab create` and generates self-contained per-server directories (e.g. `server-1/`, `server-2/`) with tailored `clab.yml` files, node configs, and VXLAN scripts ready for deployment. + +## Configuring Plugin Parameters + +The plugin is configured with the **multiserver** topology-level dictionary that has these parameters: + +| Parameter | Type | Meaning | +|-----------|------|---------| +| **assignment** | string | How to assign nodes to servers: `explicit` (default) or `auto` | +| **servers** | list | List of target physical servers | +| **vxlan** | dictionary | Global settings for VXLAN tunnels | +| **replicate** | list | Nodes or groups that must be duplicated on all servers | +| **output_dir** | string | Template for per-server directory names (default: `server-{server_id}`) | + +(multiserver-servers)= +### Server Parameters + +Each entry in the **multiserver.servers** list supports these parameters: + +| Parameter | Type | Meaning | +|-----------|------|---------| +| **id** | integer | Unique identifier for the server (e.g. 
`1`, `2`) | +| **host** | string | IP address or hostname of the remote server | +| **groups** | list | *netlab* groups whose members are assigned to this server | +| **members** | list | Individual node names assigned to this server | +| **vxlan_dev** | string | Physical interface to bind VXLAN tunnels to on this server | + +(multiserver-vxlan)= +### VXLAN Parameters + +Global VXLAN settings are specified in the **multiserver.vxlan** dictionary: + +| Parameter | Type | Meaning | +|-----------|------|---------| +| **vni_base** | integer | Starting VNI for cross-server links (default: `10000`) | +| **dstport** | integer | UDP destination port for VXLAN traffic (default: `4789`) | +| **dev** | string | Default physical interface to bind VXLAN tunnels (default: `ens33`) | + +(multiserver-assignment)= +## Assignment Modes + +### Explicit Assignment (Default) + +In `explicit` mode, every node must be mapped to a server using the **groups** or **members** attributes of a [server entry](multiserver-servers). Any unassigned node (excluding [replicated nodes](multiserver-replicate)) results in an error. + +```yaml +plugin: [ multiserver ] + +multiserver: + assignment: explicit + servers: + - id: 1 + host: 192.168.168.128 + groups: [ core ] + members: [ edge-node ] + - id: 2 + host: 192.168.168.129 + groups: [ spines, leaves ] +``` + +### Automatic Assignment + +In `auto` mode, nodes that are not explicitly pinned to a server are distributed automatically using a greedy balancing algorithm: + +1. Nodes belonging to a *netlab* group are kept together — the entire group is placed on the server that currently has the fewest nodes. Larger groups are placed first for better balance. +2. Remaining ungrouped nodes are assigned one at a time to the least-loaded server. + +Nodes already pinned via **groups** or **members** attributes count toward server load, so the algorithm balances around any explicit assignments. 
+ +```yaml +plugin: [ multiserver ] + +multiserver: + assignment: auto + servers: + - id: 1 + host: 192.168.168.128 + - id: 2 + host: 192.168.168.129 +``` + +```{tip} +You can pin specific nodes or groups to a server in `auto` mode using **groups** and **members** attributes. Only unassigned nodes are auto-distributed. +``` + +#### Group Granularity + +Because auto mode keeps entire groups together on a single server, the granularity of your groups directly affects how evenly nodes are distributed. Define groups at the smallest unit you want to keep on one server. + +For example, consider a topology with two sites, each containing five nodes: + +```yaml +# BAD: one large group — all 10 nodes land on one server +groups: + sites: + members: [ site1-r1, site1-r2, site1-r3, site1-r4, site1-r5, + site2-r1, site2-r2, site2-r3, site2-r4, site2-r5 ] +``` + +```yaml +# GOOD: per-site groups — one site per server +groups: + site1: + members: [ site1-r1, site1-r2, site1-r3, site1-r4, site1-r5 ] + site2: + members: [ site2-r1, site2-r2, site2-r3, site2-r4, site2-r5 ] + sites: + members: [ site1-r1, site1-r2, site1-r3, site1-r4, site1-r5, + site2-r1, site2-r2, site2-r3, site2-r4, site2-r5 ] +``` + +In the second example the parent `sites` group can still be used for Ansible targeting or shared configuration — it does not affect placement because the child groups (`site1`, `site2`) claim their members first during assignment. + +```{note} +Groups are processed in definition order. Child groups defined **before** a parent group will claim their members first, making the parent group a no-op for assignment. Always define fine-grained groups before aggregate groups in your topology. +``` + +(multiserver-replicate)= +### Replicated Nodes + +Nodes listed in **multiserver.replicate** are instantiated on every server. This is useful for infrastructure services that need local access on each physical host — for example, monitoring collectors, route reflectors, or DNS resolvers. 
+ +Replicated nodes are ignored when a link is classified as local or cross-server: a link between a replicated node and nodes on a single server is always local and never crosses the VXLAN overlay, while a multi-access link that also attaches nodes on different servers is still a cross-server link. + +```yaml +multiserver: + assignment: auto + servers: + - id: 1 + host: 192.168.168.128 + - id: 2 + host: 192.168.168.129 + replicate: [ prometheus, grafana ] +``` + +## Complete Example + +A minimal two-server topology with explicit assignment: + +```yaml +plugin: [ multiserver ] + +provider: clab + +groups: + spines: + members: [ s1, s2 ] + leaves: + members: [ l1, l2 ] + +nodes: + s1: + device: srlinux + s2: + device: srlinux + l1: + device: srlinux + l2: + device: srlinux + +links: + - s1-l1 + - s1-l2 + - s2-l1 + - s2-l2 + +multiserver: + assignment: explicit + servers: + - id: 1 + host: 192.168.168.128 + groups: [ spines ] + - id: 2 + host: 192.168.168.129 + groups: [ leaves ] + vxlan: + vni_base: 10000 + dev: ens33 +``` + +This places spines on server 1 and leaves on server 2. All four links cross servers and are provisioned as containerlab native VXLAN endpoints. + +## Behind the Scenes + +When the plugin processes the topology, it classifies links into three categories: + +* **Local links** connecting nodes on the same server remain as regular containerlab veth pairs or bridges. +* **Cross-server point-to-point links** are provisioned via containerlab's native VXLAN link endpoints (`type: vxlan` in `clab.yml`). +* **Cross-server multi-access links** use a local Linux bridge on each server, interconnected via host-level VXLAN tunnels configured by generated setup scripts. 
+ +Each per-server directory is self-contained and includes: + +* A tailored `clab.yml` with only the relevant nodes and cross-server VXLAN interfaces +* A filtered `netlab.snapshot.pickle` for use with `netlab up --snapshot` +* Copies of `node_files/`, `host_vars/`, and Ansible config for only the nodes on that server +* `vxlan-setup.sh` and `vxlan-teardown.sh` scripts (when multi-access VXLAN tunnels are needed) + +(multiserver-deployment)= +## Deployment Workflow + +**Step 1: Generate configurations** on your workstation: + +```bash +netlab create topology.yml +``` + +The plugin automatically copies all required files into each server directory — no extra bundling step is needed. + +**Step 2: Copy server directories to remote hosts** (e.g. via rsync): + +```bash +rsync -avz server-1/ user@192.168.168.128:~/lab/server-1/ +rsync -avz server-2/ user@192.168.168.129:~/lab/server-2/ +``` + +**Step 3: Deploy on each server** by running the following on each remote host: + +```bash +sudo netlab up --snapshot -vv +sudo ./vxlan-setup.sh # only if multi-access VXLAN tunnels are present +``` + +```{important} +**Why is `--snapshot` required on remote servers?** + +You must run `sudo netlab up --snapshot` on remote servers to load the topology from the pre-generated snapshot (`netlab.snapshot.pickle`) instead of the original `topology.yml`. + +Running with `topology.yml` directly on remote servers will fail because: +1. **Consistency**: Netlab dynamically allocates IP addresses, interface IDs, and VXLAN VNIs. Independent creation runs on different hosts would result in mismatched allocations. +2. **Recursion**: Running `netlab create` on `topology.yml` on the remote hosts would execute the `multiserver` plugin again, causing it to split the topology recursively and generate nested server subdirectories. 
+``` + +**Teardown** in reverse order: + +```bash +sudo ./vxlan-teardown.sh +sudo clab destroy -t clab.yml +``` + +## Limitations + +* Only the **containerlab** provider is supported. Libvirt and virtualbox topologies cannot be split across servers. +* Cross-server VXLAN tunnels use a flat VNI space starting at **vni_base**. The maximum VNI value is 16777215 (24-bit). Topologies with more than ~16 million cross-server links will fail validation. +* All physical servers must have direct IP reachability — the plugin does not support NAT traversal or relay hosts between servers. diff --git a/netsim/extra/multiserver/defaults.yml b/netsim/extra/multiserver/defaults.yml new file mode 100644 index 0000000000..bd68aaa3b4 --- /dev/null +++ b/netsim/extra/multiserver/defaults.yml @@ -0,0 +1,47 @@ +# multiserver plugin: split a topology across multiple physical servers +# +# Requires containerlab >= 0.46 (VXLAN link support). +# +# Cross-server links become: +# - P2P links -> containerlab native VXLAN endpoints (self-contained in clab.yml) +# - Bridge links -> local bridge + host VXLAN tunnel (via generated setup script) +# +# Assignment modes: +# - explicit: user must assign every node via servers[].groups or .members +# - auto: unassigned nodes distributed across servers, keeping netlab groups together +# +--- +attributes: + global: + multiserver: + servers: + type: list + _subtype: + id: + type: int + _required: True + host: + type: str + _required: True + groups: list + members: list + vxlan_dev: str + vxlan: + vni_base: int + dstport: int + dev: str + assignment: + type: str + valid_values: [explicit, auto] + replicate: list + output_dir: + type: str + +multiserver: + vxlan: + vni_base: 10000 + dstport: 4789 + dev: ens33 + assignment: explicit + replicate: [] + output_dir: "server-{server_id}" diff --git a/netsim/extra/multiserver/plugin.py b/netsim/extra/multiserver/plugin.py new file mode 100644 index 0000000000..888e39d96c --- /dev/null +++ 
b/netsim/extra/multiserver/plugin.py @@ -0,0 +1,842 @@ +""" +multiserver plugin — split a netlab topology across multiple physical servers. + +Generates per-server containerlab topology files with cross-server VXLAN links. +Requires containerlab >= 0.46 for native VXLAN link support. + +Cross-server links: + + * P2P links (2 endpoints) → containerlab native VXLAN (type: vxlan in clab.yml) + * Multi-access links (3+ endpoints, bridge) → local bridge + host-level VXLAN tunnel + created by a generated vxlan-setup.sh script + +Server assignment modes: + + * explicit (default) — user assigns nodes via groups/members, unassigned nodes cause + an error. Best when you need precise control over placement. + * auto — unassigned nodes are placed on the least-loaded server (greedy balancing). Use this for + automatic splitting: just define the servers and let the plugin balance the nodes. + +Group granularity (auto mode): + + Auto mode keeps entire netlab groups together on one server. Define groups at + the smallest unit you want to keep on a single server. Parent/aggregate groups + are fine — child groups defined first will claim their members before the parent + is reached. See docs/plugins/multiserver.md for details and examples. 
+ +Explicit assignment example: + + plugin: [ multiserver ] + + multiserver: + servers: + - id: 1 + host: 192.168.168.128 + groups: [ hubs ] + members: [ extra-node ] + - id: 2 + host: 10.0.0.67 + groups: [ spines, leaves ] + assignment: explicit + +Automatic splitting example (no groups/members needed): + + plugin: [ multiserver ] + + multiserver: + servers: + - id: 1 + host: 192.168.168.128 + - id: 2 + host: 10.0.0.67 + assignment: auto + replicate: [ prometheus, grafana ] +""" + +import os +import pickle +import shutil +from pathlib import Path + +import yaml +from box import Box +from packaging import version as _pv + +from netsim.data import append_to_list +from netsim.utils import log + +_execute_after = ["fabric", "node.clone"] + +# --------------------------------------------------------------------------- +# Hook: init — validate config + register output hook +# --------------------------------------------------------------------------- + + +def init(topology: Box) -> None: + ms = topology.get("multiserver", None) + if not ms: + return + + # Merge plugin defaults with user config (user values take priority) + defaults = topology.defaults.get("multiserver", Box({})) + topology.multiserver = defaults + ms + + ms = topology.multiserver + servers = ms.get("servers", []) + + # Currently only containerlab is supported — generating per-server Vagrantfiles + # for libvirt/virtualbox would require reimplementing the Vagrant Ruby DSL + provider = topology.get("provider", "") or topology.defaults.get("provider", "") + if provider and provider != "clab": + log.error( + f'multiserver plugin currently supports only the "clab" provider, not "{provider}"', + log.IncorrectValue, + "multiserver", + more_hints=["libvirt and virtualbox support may be added in a future release"], + ) + return + + # Cross-server P2P links use containerlab native VXLAN endpoints (type: vxlan), + # available since containerlab 0.46. 
netlab already requires >= 0.75 so this + # should always pass, but check explicitly in case the requirement is relaxed. + clab_min = "0.46.0" + clab_ver = str(topology.defaults.providers.clab.get("version", "0.0.0")) + if _pv.Version(clab_ver) < _pv.Version(clab_min): + log.error( + f"multiserver plugin requires containerlab >= {clab_min} for VXLAN links (netlab targets {clab_ver})", + log.IncorrectValue, + "multiserver", + ) + return + + if not servers: + log.error('multiserver plugin requires a "servers" list', log.MissingValue, "multiserver") + return + + if len(servers) < 2: + log.error("multiserver plugin requires at least 2 servers", log.IncorrectValue, "multiserver") + return + + seen_ids: set = set() + for idx, s in enumerate(servers): + if "id" not in s: + log.error(f'Server entry #{idx + 1} missing required "id" field', log.MissingValue, "multiserver") + continue + if "host" not in s: + log.error(f'Server {s.id} missing required "host" field', log.MissingValue, "multiserver") + continue + if s.id in seen_ids: + log.error(f"Duplicate server id {s.id}", log.IncorrectValue, "multiserver") + seen_ids.add(s.id) + + log.exit_on_error() + + # Register the output hook so netlab create calls our output() function + append_to_list(topology.defaults.netlab.create, "plugin", "multiserver") + + +# --------------------------------------------------------------------------- +# Hook: post_transform — resolve server assignments, classify links +# --------------------------------------------------------------------------- + + +def post_transform(topology: Box) -> None: + ms = topology.get("multiserver", None) + if not ms: + return + + servers = ms.servers + server_map = {s.id: s for s in servers} + assignment: dict = {} # node_name -> server_id + # --- Resolve replicated nodes (present on every server) --- + replicated: set = set() + for entry in ms.get("replicate", []): + if entry in topology.nodes: + replicated.add(entry) + elif entry in topology.get("groups", {}): + 
for member in topology.groups[entry].get("members", []): + replicated.add(member) + else: + log.error(f'multiserver.replicate: "{entry}" is not a node or group', log.IncorrectValue, "multiserver") + + # --- Resolve assignments from server groups + members --- + for server in servers: + for gname in server.get("groups", []): + grp = topology.get("groups", {}).get(gname, None) + if grp is None: + log.error(f'Server {server.id} references unknown group "{gname}"', log.IncorrectValue, "multiserver") + continue + for member in grp.get("members", []): + if member in assignment and assignment[member] != server.id: + log.error( + f"Node {member} assigned to both server {assignment[member]} and {server.id}", + log.IncorrectValue, + "multiserver", + ) + assignment[member] = server.id + + for member in server.get("members", []): + if member not in topology.nodes: + log.error(f'Server {server.id} references unknown node "{member}"', log.IncorrectValue, "multiserver") + continue + if member in assignment and assignment[member] != server.id: + log.error( + f"Node {member} assigned to both server {assignment[member]} and {server.id}", + log.IncorrectValue, + "multiserver", + ) + assignment[member] = server.id + + # --- Handle unassigned nodes (replicated nodes are exempt) --- + unassigned = set(n for n in topology.nodes if n not in assignment and n not in replicated) + + mode = ms.get("assignment", "explicit") + if unassigned: + if mode == "explicit": + log.error( + f"Nodes not assigned to any server: {', '.join(sorted(unassigned))}", + log.MissingValue, + "multiserver", + more_hints=[ + "Assign nodes via multiserver.servers[].groups or .members", + "Or set multiserver.assignment: auto for round-robin distribution", + ], + ) + else: + sorted_sids = sorted(server_map.keys()) + + # Distribute by netlab group to keep related nodes on the same server. + # Groups are assigned round-robin by size (largest first) for balance. + # Ungrouped nodes are distributed individually at the end. 
+ group_buckets: list = [] # [(group_name, [members])] + claimed = set() + for gname, gdata in topology.get("groups", {}).items(): + members = [m for m in gdata.get("members", []) if m in unassigned and m not in claimed] + if members: + group_buckets.append((gname, members)) + claimed.update(members) + + # Sort groups largest-first for better balance + group_buckets.sort(key=lambda g: -len(g[1])) + + # Track node counts per server for balanced distribution + counts = {sid: sum(1 for s in assignment.values() if s == sid) for sid in sorted_sids} + + for gname, members in group_buckets: + # Assign entire group to the server with the fewest nodes + target = min(sorted_sids, key=lambda s: counts[s]) + for m in members: + assignment[m] = target + counts[target] += len(members) + + # Remaining ungrouped nodes: round-robin to least-loaded server + ungrouped = sorted(unassigned - claimed) + for name in ungrouped: + target = min(sorted_sids, key=lambda s: counts[s]) + assignment[name] = target + counts[target] += 1 + + log.exit_on_error() + + # --- Classify links: local vs cross-server --- + vni_base = ms.vxlan.get("vni_base", 10000) + vni = vni_base + cross_count = 0 + + for link in topology.links: + link_servers = set() + for intf in link.get("interfaces", []): + if intf.node in replicated: + continue + sid = assignment.get(intf.node) + if sid is not None: + link_servers.add(sid) + + if len(link_servers) > 1: + link._ms = Box({"cross": True, "vni": vni, "servers": sorted(link_servers)}) + vni += 1 + cross_count += 1 + else: + link._ms = Box( + { + "cross": False, + "servers": sorted(link_servers), + } + ) + + if vni > 16777215: + log.error(f"VXLAN VNI overflow: {vni} exceeds 24-bit maximum (16777215)", log.IncorrectValue, "multiserver") + + log.exit_on_error() + + # Store state for output hook + topology._multiserver = Box( + { + "assignment": assignment, + "server_map": server_map, + "replicated": sorted(replicated), + } + ) + + # Summary — show which groups and nodes 
landed on each server + for server in servers: + sid = server.id + server_nodes = sorted(n for n, s in assignment.items() if s == sid) + + # Figure out which netlab groups are fully on this server + server_groups = [] + for gname, gdata in topology.get("groups", {}).items(): + members = gdata.get("members", []) + if not members: + continue + on_this = [m for m in members if assignment.get(m) == sid] + if on_this and len(on_this) == len([m for m in members if m in assignment]): + server_groups.append(gname) + + n = len(server_nodes) + log.info(f"Server {sid} ({server.host}): {n} nodes", module="multiserver") + if server_groups: + preview = server_groups[:8] + suffix = f" ... +{len(server_groups) - 8} more" if len(server_groups) > 8 else "" + log.info(f" groups: {', '.join(preview)}{suffix}", module="multiserver") + if n <= 20: + log.info(f" nodes: {', '.join(server_nodes)}", module="multiserver") + else: + preview = server_nodes[:6] + log.info(f" nodes: {', '.join(preview)} ... +{n - 6} more", module="multiserver") + + if replicated: + log.info(f"Replicated on all servers: {', '.join(sorted(replicated))}", module="multiserver") + if cross_count: + log.info(f"{cross_count} cross-server links (VNI {vni_base}–{vni - 1})", module="multiserver") + + +# --------------------------------------------------------------------------- +# Hook: output — generate per-server clab.yml + VXLAN scripts +# --------------------------------------------------------------------------- + + +def output(topology: Box) -> None: + ms = topology.get("multiserver", None) + ms_data = topology.get("_multiserver", None) + if not ms or not ms_data: + return + + assignment = ms_data.assignment + server_map = ms_data.server_map + vxlan_cfg = ms.vxlan + out_tpl = ms.get("output_dir", "server-{server_id}") + + replicated = set(ms_data.get("replicated", [])) + server_folders = [] + + for server in ms.servers: + sid = server.id + local_nodes = {n for n, s in assignment.items() if s == sid} | replicated + 
if not local_nodes: + continue + + out_dir = out_tpl.format(name=topology.name, server_id=sid) + server_folders.append((out_dir, local_nodes)) + + if Path(out_dir).exists(): + shutil.rmtree(out_dir) + Path(out_dir).mkdir(parents=True, exist_ok=True) + + clab_dict, vxlan_tunnels = _build_server_clab(topology, local_nodes, sid, server_map, vxlan_cfg) + + # Write clab.yml + with open(Path(out_dir) / "clab.yml", "w") as f: + yaml.dump(clab_dict, f, default_flow_style=False, sort_keys=False, indent=2) + + # Write filtered snapshot so 'netlab up --snapshot' works per-server + _write_server_snapshot(topology, local_nodes, out_dir) + + # Generate VXLAN setup/teardown scripts for bridge tunnels + if vxlan_tunnels: + dev = server.get("vxlan_dev", "") or vxlan_cfg.get("dev", "") + if not dev: + log.error( + f"Server {sid} has multi-access cross-server links but no VXLAN device is configured", + log.MissingValue, + "multiserver", + more_hints=["Set multiserver.vxlan.dev or multiserver.servers[].vxlan_dev"], + ) + continue + _write_vxlan_scripts(out_dir, vxlan_tunnels, dev) + + link_count = len(clab_dict.get("topology", {}).get("links", [])) + vx_count = len(vxlan_tunnels) + parts = [f"{len(local_nodes)} nodes", f"{link_count} links"] + if vx_count: + parts.append(f"{vx_count} VXLAN tunnels") + log.info(f"Server {sid}: {out_dir}/ — {', '.join(parts)}", module="multiserver") + + # Register atexit handler to copy node_files, host_vars, etc. into each server + # folder after netlab writes all output files. + if server_folders: + import atexit + atexit.register(_distribute_files_atexit, os.getcwd(), server_folders) + + +def _distribute_files_atexit(lab_folder: str, server_folders: list) -> None: + """Distribute generated files (node_files, host_vars, ansible.cfg, hosts.yml) + to each server folder. Registered via atexit so it runs AFTER netlab has + written all output files. 
+ """ + lab_path = Path(lab_folder) + nf_dir = lab_path / "node_files" + hv_dir = lab_path / "host_vars" + + for sf, local_nodes in server_folders: + sf_path = Path(sf) + if not sf_path.is_dir(): + continue + + # node_files: per-node dirs + shared files (names starting with -) + if nf_dir.is_dir(): + dst_nf = sf_path / "node_files" + dst_nf.mkdir(exist_ok=True) + for item in nf_dir.iterdir(): + if item.name in local_nodes or item.name.startswith("-"): + dst = dst_nf / item.name + if not dst.exists(): + try: + if item.is_dir(): + shutil.copytree(item, dst) + else: + shutil.copy2(item, dst) + except Exception: + pass + + # host_vars: per-node only + if hv_dir.is_dir(): + dst_hv = sf_path / "host_vars" + dst_hv.mkdir(exist_ok=True) + for item in hv_dir.iterdir(): + if item.name in local_nodes: + dst = dst_hv / item.name + if not dst.exists(): + try: + if item.is_dir(): + shutil.copytree(item, dst) + else: + shutil.copy2(item, dst) + except Exception: + pass + + # Copy all other subdirectories (e.g. group_vars, templates, monitoring) + # excluding server folders, node_files, host_vars, and python/git metadata. 
+ server_names = {Path(sf).name for sf, _ in server_folders} + for item in lab_path.iterdir(): + if item.is_dir(): + if item.name in server_names or item.name in ("node_files", "host_vars", "__pycache__", ".git"): + continue + + # Optimization: only copy grafana directory if this server hosts the grafana node + if item.name == "grafana" and "grafana" not in local_nodes: + continue + + dst_dir = sf_path / item.name + if not dst_dir.exists(): + try: + shutil.copytree(item, dst_dir) + except Exception: + pass + + # Ansible inventory and config + for fname in ("ansible.cfg", "hosts.yml"): + src = lab_path / fname + dst = sf_path / fname + if src.exists() and not dst.exists(): + try: + shutil.copy2(src, dst) + except Exception: + pass + + +# =========================================================================== +# Internal helpers +# =========================================================================== + + +def _to_plain(obj: object) -> object: + """Convert Box/BoxList to plain dict/list for clean YAML serialization.""" + if isinstance(obj, Box): + return {k: _to_plain(v) for k, v in obj.items()} + if isinstance(obj, list): + return [_to_plain(v) for v in obj] + return obj + + +def _intf_clab_name(intf: Box) -> str: + """Containerlab interface name for a node interface.""" + return intf.get("clab", {}).get("name", "") or intf.get("ifname", "") + + +def _build_clab_node(nname: str, ndata: Box, topology: Box) -> dict: + """Reconstruct a clab.yml node entry from the transformed topology data.""" + entry: dict = {} + clab = ndata.get("clab", Box({})) + + # Management IPs + nm = clab.get("network-mode", "") + if nm != "none": + if ndata.get("mgmt", {}).get("ipv4"): + entry["mgmt-ipv4"] = str(ndata.mgmt.ipv4) + if ndata.get("mgmt", {}).get("ipv6"): + entry["mgmt-ipv6"] = str(ndata.mgmt.ipv6) + + kind = clab.get("kind", "") or ndata.get("device", "") + entry["kind"] = kind + if kind == "linux" and "restart-policy" not in clab: + entry["restart-policy"] = "no" + + # 
Pass through standard clab node attributes + special = set(topology.defaults.providers.clab.get("node_config_special", [])) + for attr in topology.defaults.providers.clab.get("attributes", {}).get("node", {}).get("_keys", []): + if attr in clab and attr not in special: + entry[attr] = _to_plain(clab[attr]) + + # srl-agents goes under extras: (matches clab.j2 template) + if "srl-agents" in clab: + entry["extras"] = {"srl-agents": _to_plain(clab["srl-agents"])} + + entry["image"] = str(clab.get("image", "") or ndata.get("box", "")) + entry["runtime"] = str(clab.get("runtime", "") or topology.defaults.providers.clab.get("runtime", "docker")) + + # Groups + if "groups" in topology: + groups = [g for g in topology.groups if nname in topology.groups[g].get("members", [])] + if groups: + entry["group"] = ",".join(groups) + + # Binds — keep paths as-is (relative to the server directory). + # The distribute script copies node_files/ into each server dir, + # so paths like node_files/r1/... work when running from there. 
+ if "binds" in clab: + entry["binds"] = [] + for b in clab.binds: + bind_str = f"{b.source}:{b.target}" + if "mode" in b: + bind_str += f":{b.mode}" + entry["binds"].append(bind_str) + + # Startup config + if "startup-config" in clab: + entry["startup-config"] = str(clab["startup-config"]) + + return entry + + +def _build_server_clab(topology: Box, local_nodes: set, sid: int, server_map: dict, vxlan_cfg: Box) -> tuple: + """Build the clab.yml dict and VXLAN tunnel list for one server.""" + dstport = vxlan_cfg.get("dstport", 4789) + multilab_id = topology.defaults.get("multilab", {}).get("id", 0) + assignment = topology._multiserver.assignment + + clab: dict = { + "name": topology.name, + "prefix": str(topology.defaults.providers.clab.get("lab_prefix", "") or ""), + "mgmt": { + "network": str(topology.addressing.mgmt.get("_network", "") or "netlab_mgmt"), + "ipv4-subnet": str(topology.addressing.mgmt.get("ipv4", "172.20.20.0/24")), + }, + "topology": { + "nodes": {}, + "links": [], + }, + } + + mgmt_bridge = topology.addressing.mgmt.get("_bridge", "") + if mgmt_bridge: + clab["mgmt"]["bridge"] = str(mgmt_bridge) + if topology.defaults.addressing.mgmt.get("ipv6"): + clab["mgmt"]["ipv6-subnet"] = str(topology.defaults.addressing.mgmt.ipv6) + + # --- Nodes --- + for nname, ndata in topology.nodes.items(): + if ndata.get("unmanaged", False): + continue + if nname in local_nodes: + clab["topology"]["nodes"][nname] = _build_clab_node(nname, ndata, topology) + + # --- Links --- + bridges_needed: set = set() + vxlan_tunnels: list = [] + + for link in topology.links: + local_intfs = [i for i in link.get("interfaces", []) if i.node in local_nodes] + if not local_intfs: + continue + + is_cross = link.get("_ms", {}).get("cross", False) + node_count = link.get("node_count", len(link.get("interfaces", []))) + + # ---- Uplink (macvlan) ---- + if link.get("clab", {}).get("uplink", False): + for intf in local_intfs: + clab_name = _intf_clab_name(intf) + 
clab["topology"]["links"].append({"endpoints": [f"{intf.node}:{clab_name}", f"macvlan:{link.clab.uplink}"]}) + continue + + # ---- Fully local link ---- + if not is_cross: + _render_local_link(clab, link, local_intfs, node_count, bridges_needed, multilab_id, topology) + continue + + # ---- Cross-server P2P (clab native VXLAN) ---- + if node_count == 2: + _render_p2p_vxlan(clab, link, sid, server_map, local_intfs, assignment, dstport) + continue + + # ---- Cross-server multi-access (bridge + host VXLAN) ---- + _render_bridge_vxlan( + clab, + link, + sid, + server_map, + local_intfs, + assignment, + bridges_needed, + vxlan_tunnels, + dstport, + multilab_id, + topology, + ) + + # --- Bridge nodes --- + bridge_type = str(topology.defaults.providers.clab.get("bridge_type", "bridge")) + for brname in sorted(bridges_needed): + clab["topology"]["nodes"][brname] = {"kind": bridge_type} + + if not clab["topology"]["links"]: + del clab["topology"]["links"] + + return clab, vxlan_tunnels + + +def _render_local_link( + clab: dict, link: Box, local_intfs: list, node_count: int, bridges_needed: set, multilab_id: int, topology: Box +) -> None: + """Render a fully-local link (all endpoints on the same server).""" + + # Stub link + if node_count == 1 and local_intfs: + intf = local_intfs[0] + clab["topology"]["links"].append( + { + "type": "dummy", + "endpoint": {"node": intf.node, "interface": _intf_clab_name(intf)}, + } + ) + return + + # P2P link + if node_count == 2: + endpoints = [f"{i.node}:{_intf_clab_name(i)}" for i in local_intfs] + if len(endpoints) == 2: + clab["topology"]["links"].append({"endpoints": endpoints}) + return + + # Multi-access link (bridge) + if node_count > 2 and link.get("bridge"): + bridge = link.bridge + if not link.get("clab", {}).get("external_bridge", False): + bridges_needed.add(bridge) + for intf in local_intfs: + ndata = topology.nodes[intf.node] + bridge_intf = f"bni{multilab_id}n{ndata.id}i{intf.ifindex}" + clab["topology"]["links"].append( + { 
+ "endpoints": [ + f"{intf.node}:{_intf_clab_name(intf)}", + f"{bridge}:{bridge_intf}", + ] + } + ) + + +def _render_p2p_vxlan( + clab: dict, link: Box, local_sid: int, server_map: dict, local_intfs: list, assignment: dict, dstport: int +) -> None: + """Render a P2P cross-server link as a containerlab native VXLAN endpoint.""" + if not local_intfs: + return + + vni = link._ms.vni + local_intf = local_intfs[0] + + # Find the remote server + remote_sid = None + for intf in link.get("interfaces", []): + s = assignment.get(intf.node) + if s is not None and s != local_sid: + remote_sid = s + break + + if remote_sid is None: + return + + clab_name = _intf_clab_name(local_intf) + clab["topology"]["links"].append( + { + "endpoints": [ + f"{local_intf.node}:{clab_name}", + f"host:vx{vni}", + ], + "type": "vxlan", + "remote": str(server_map[remote_sid].host), + "vni": vni, + "udp-port": dstport, + } + ) + + +def _render_bridge_vxlan( + clab: dict, + link: Box, + local_sid: int, + server_map: dict, + local_intfs: list, + assignment: dict, + bridges_needed: set, + vxlan_tunnels: list, + dstport: int, + multilab_id: int, + topology: Box, +) -> None: + """Render a multi-access cross-server link: local bridge + host VXLAN tunnels.""" + vni = link._ms.vni + bridge = link.get("bridge", f"br{link.linkindex}") + + if not link.get("clab", {}).get("external_bridge", False): + bridges_needed.add(bridge) + + # Local node-to-bridge connections + for intf in local_intfs: + ndata = topology.nodes[intf.node] + bridge_intf = f"bni{multilab_id}n{ndata.id}i{intf.ifindex}" + clab["topology"]["links"].append( + { + "endpoints": [ + f"{intf.node}:{_intf_clab_name(intf)}", + f"{bridge}:{bridge_intf}", + ] + } + ) + + # VXLAN tunnels to each remote server that has endpoints on this link + remote_sids: set = set() + for intf in link.get("interfaces", []): + s = assignment.get(intf.node) + if s is not None and s != local_sid: + remote_sids.add(s) + + for rsid in sorted(remote_sids): + 
vxlan_tunnels.append( + { + "bridge": bridge, + "vni": vni, + "remote": str(server_map[rsid].host), + "dstport": dstport, + "remote_id": rsid, + } + ) + + +# --------------------------------------------------------------------------- +# File operations +# --------------------------------------------------------------------------- + + +def _write_server_snapshot(topology: Box, local_nodes: set, out_dir: str) -> None: + """Write a filtered netlab snapshot containing only this server's nodes. + + This allows 'netlab up --snapshot' to work correctly from a per-server + directory — only local nodes will be targeted for configuration deployment. + + Note: make_paths_absolute() must be called on the copy before pickling so + that the computed f_files / f_tasks / f_dirs keys are present in the + snapshot. The main netlab snapshot (outputs/pickle.py) is written *after* + create.py calls make_paths_absolute(), so it already contains those keys. + Plugin output() hooks run *before* that call, so we have to do it ourselves. + """ + from netsim import __version__ + from netsim.augment.config import make_paths_absolute + from netsim.augment.topology import cleanup_topology + + topo_copy = Box(topology, box_dots=True) + + # Filter nodes to only those on this server + topo_copy.nodes = Box({n: v for n, v in topo_copy.nodes.items() if n in local_nodes}, box_dots=True) + + # Filter links to only those with at least one local endpoint + topo_copy.links = [l for l in topo_copy.links if any(i.node in local_nodes for i in l.get("interfaces", []))] + + # Expand paths (add f_files / f_tasks / f_dirs computed keys). + # create.py calls make_paths_absolute() AFTER plugin output() hooks, so the + # main snapshot has these keys but our per-server copies don't yet. + # netlab initial relies on topology.defaults.paths.t_files.f_files, so we + # must add them before pickling. 
+ make_paths_absolute(topo_copy.defaults.paths) + + # Remove prefix generators and serialize + cleaned = cleanup_topology(topo_copy) + topodict = cleaned.to_dict() + topodict["_netlab_version"] = __version__ + + with open(Path(out_dir) / "netlab.snapshot.pickle", "wb") as f: + pickle.dump(topodict, f) + + +def _write_vxlan_scripts(out_dir: str, tunnels: list, dev: str) -> None: + """Generate bash scripts to create/destroy host-level VXLAN tunnels.""" + + setup = [ + "#!/bin/bash", + "# VXLAN tunnel setup — generated by netlab multiserver plugin", + "# Run AFTER: sudo clab deploy -t clab.yml", + "#", + "# Creates host-level VXLAN tunnels and attaches them to containerlab bridges.", + "# These tunnels carry multi-access (bridged) cross-server traffic.", + "set -e", + "", + ] + + teardown = [ + "#!/bin/bash", + "# VXLAN tunnel teardown — generated by netlab multiserver plugin", + "# Run BEFORE: sudo clab destroy -t clab.yml", + "set -e", + "", + ] + + seen: set = set() + for t in tunnels: + vx_name = f"vxlan{t['vni']}" + key = (vx_name, t["remote"]) + if key in seen: + continue + seen.add(key) + + setup.extend( + [ + f"# VNI {t['vni']} -> {t['remote']} (server {t['remote_id']}) via bridge {t['bridge']}", + f"ip link add {vx_name} type vxlan id {t['vni']} remote {t['remote']} dev {dev} dstport {t['dstport']}", + f"ip link set {vx_name} master {t['bridge']}", + f"ip link set {vx_name} up", + f'echo " {vx_name} -> {t["bridge"]} (remote {t["remote"]})"', + "", + ] + ) + + teardown.append(f'ip link del {vx_name} 2>/dev/null && echo " deleted {vx_name}" || true') + + setup.append('echo "VXLAN setup complete."') + teardown.extend(["", 'echo "VXLAN teardown complete."']) + + for name, lines in [("vxlan-setup.sh", setup), ("vxlan-teardown.sh", teardown)]: + path = Path(out_dir) / name + path.write_text("\n".join(lines) + "\n") + os.chmod(path, 0o755) diff --git a/tests/topology/expected/multiserver-auto.yml b/tests/topology/expected/multiserver-auto.yml new file mode 100644 
index 0000000000..747ec73d67 --- /dev/null +++ b/tests/topology/expected/multiserver-auto.yml @@ -0,0 +1,358 @@ +--- +_multiserver: + assignment: + g1_n1: 1 + g1_n2: 1 + g2_n1: 2 + g2_n2: 2 + replicated: + - mon_srv + server_map: + 1: + host: 192.168.128.1 + id: 1 + 2: + host: 192.168.128.2 + id: 2 +groups: + g1: + members: + - g1_n1 + - g1_n2 + g2: + members: + - g2_n1 + - g2_n2 +input: +- topology/input/multiserver-auto.yml +- package:topology-defaults.yml +links: +- _linkname: links[1] + _ms: + cross: false + servers: + - 1 + interfaces: + - ifindex: 1 + ifname: Ethernet1 + ipv4: 10.1.0.1/30 + node: g1_n1 + - ifindex: 1 + ifname: Ethernet1 + ipv4: 10.1.0.2/30 + node: g1_n2 + linkindex: 1 + node_count: 2 + prefix: + ipv4: 10.1.0.0/30 + type: p2p +- _linkname: links[2] + _ms: + cross: false + servers: + - 2 + interfaces: + - ifindex: 1 + ifname: Ethernet1 + ipv4: 10.1.0.5/30 + node: g2_n1 + - ifindex: 1 + ifname: Ethernet1 + ipv4: 10.1.0.6/30 + node: g2_n2 + linkindex: 2 + node_count: 2 + prefix: + ipv4: 10.1.0.4/30 + type: p2p +- _linkname: links[3] + _ms: + cross: false + servers: + - 1 + interfaces: + - ifindex: 1 + ifname: Ethernet1 + ipv4: 10.1.0.10/30 + node: mon_srv + - ifindex: 2 + ifname: Ethernet2 + ipv4: 10.1.0.9/30 + node: g1_n1 + linkindex: 3 + node_count: 2 + prefix: + ipv4: 10.1.0.8/30 + type: p2p +- _linkname: links[4] + _ms: + cross: false + servers: + - 2 + interfaces: + - ifindex: 2 + ifname: Ethernet2 + ipv4: 10.1.0.14/30 + node: mon_srv + - ifindex: 2 + ifname: Ethernet2 + ipv4: 10.1.0.13/30 + node: g2_n1 + linkindex: 4 + node_count: 2 + prefix: + ipv4: 10.1.0.12/30 + type: p2p +multiserver: + assignment: auto + output_dir: server-{server_id} + replicate: + - mon_srv + servers: + - host: 192.168.128.1 + id: 1 + - host: 192.168.128.2 + id: 2 + vxlan: + dev: ens33 + dstport: 4789 + vni_base: 10000 +name: input +nodes: + g1_n1: + af: + ipv4: true + box: ceos:4.34.2F + clab: + env: + CLAB_MGMT_VRF: management + INTFTYPE: et + kind: ceos + device: 
eos + hostname: clab-input-g1_n1 + id: 1 + interfaces: + - clab: + name: et1 + ifindex: 1 + ifname: Ethernet1 + ipv4: 10.1.0.1/30 + linkindex: 1 + mac_address: caf0.0001.0001 + name: g1_n1 -> g1_n2 + neighbors: + - ifname: Ethernet1 + ipv4: 10.1.0.2/30 + node: g1_n2 + type: p2p + - clab: + name: et2 + ifindex: 2 + ifname: Ethernet2 + ipv4: 10.1.0.9/30 + linkindex: 3 + mac_address: caf0.0001.0002 + name: g1_n1 -> mon_srv + neighbors: + - ifname: Ethernet1 + ipv4: 10.1.0.10/30 + node: mon_srv + type: p2p + loopback: + ifindex: 0 + ifname: Loopback0 + ipv4: 10.0.0.1/32 + neighbors: [] + type: loopback + virtual_interface: true + mgmt: + ifname: Management0 + ipv4: 192.168.121.101 + mac: ca:fe:00:01:00:00 + name: g1_n1 + role: router + g1_n2: + af: + ipv4: true + box: ceos:4.34.2F + clab: + env: + CLAB_MGMT_VRF: management + INTFTYPE: et + kind: ceos + device: eos + hostname: clab-input-g1_n2 + id: 2 + interfaces: + - clab: + name: et1 + ifindex: 1 + ifname: Ethernet1 + ipv4: 10.1.0.2/30 + linkindex: 1 + mac_address: caf0.0002.0001 + name: g1_n2 -> g1_n1 + neighbors: + - ifname: Ethernet1 + ipv4: 10.1.0.1/30 + node: g1_n1 + type: p2p + loopback: + ifindex: 0 + ifname: Loopback0 + ipv4: 10.0.0.2/32 + neighbors: [] + type: loopback + virtual_interface: true + mgmt: + ifname: Management0 + ipv4: 192.168.121.102 + mac: ca:fe:00:02:00:00 + name: g1_n2 + role: router + g2_n1: + af: + ipv4: true + box: ceos:4.34.2F + clab: + env: + CLAB_MGMT_VRF: management + INTFTYPE: et + kind: ceos + device: eos + hostname: clab-input-g2_n1 + id: 3 + interfaces: + - clab: + name: et1 + ifindex: 1 + ifname: Ethernet1 + ipv4: 10.1.0.5/30 + linkindex: 2 + mac_address: caf0.0003.0001 + name: g2_n1 -> g2_n2 + neighbors: + - ifname: Ethernet1 + ipv4: 10.1.0.6/30 + node: g2_n2 + type: p2p + - clab: + name: et2 + ifindex: 2 + ifname: Ethernet2 + ipv4: 10.1.0.13/30 + linkindex: 4 + mac_address: caf0.0003.0002 + name: g2_n1 -> mon_srv + neighbors: + - ifname: Ethernet2 + ipv4: 10.1.0.14/30 + node: 
mon_srv + type: p2p + loopback: + ifindex: 0 + ifname: Loopback0 + ipv4: 10.0.0.3/32 + neighbors: [] + type: loopback + virtual_interface: true + mgmt: + ifname: Management0 + ipv4: 192.168.121.103 + mac: ca:fe:00:03:00:00 + name: g2_n1 + role: router + g2_n2: + af: + ipv4: true + box: ceos:4.34.2F + clab: + env: + CLAB_MGMT_VRF: management + INTFTYPE: et + kind: ceos + device: eos + hostname: clab-input-g2_n2 + id: 4 + interfaces: + - clab: + name: et1 + ifindex: 1 + ifname: Ethernet1 + ipv4: 10.1.0.6/30 + linkindex: 2 + mac_address: caf0.0004.0001 + name: g2_n2 -> g2_n1 + neighbors: + - ifname: Ethernet1 + ipv4: 10.1.0.5/30 + node: g2_n1 + type: p2p + loopback: + ifindex: 0 + ifname: Loopback0 + ipv4: 10.0.0.4/32 + neighbors: [] + type: loopback + virtual_interface: true + mgmt: + ifname: Management0 + ipv4: 192.168.121.104 + mac: ca:fe:00:04:00:00 + name: g2_n2 + role: router + mon_srv: + af: + ipv4: true + box: ceos:4.34.2F + clab: + env: + CLAB_MGMT_VRF: management + INTFTYPE: et + kind: ceos + device: eos + hostname: clab-input-mon_srv + id: 5 + interfaces: + - clab: + name: et1 + ifindex: 1 + ifname: Ethernet1 + ipv4: 10.1.0.10/30 + linkindex: 3 + mac_address: caf0.0005.0001 + name: mon_srv -> g1_n1 + neighbors: + - ifname: Ethernet2 + ipv4: 10.1.0.9/30 + node: g1_n1 + type: p2p + - clab: + name: et2 + ifindex: 2 + ifname: Ethernet2 + ipv4: 10.1.0.14/30 + linkindex: 4 + mac_address: caf0.0005.0002 + name: mon_srv -> g2_n1 + neighbors: + - ifname: Ethernet2 + ipv4: 10.1.0.13/30 + node: g2_n1 + type: p2p + loopback: + ifindex: 0 + ifname: Loopback0 + ipv4: 10.0.0.5/32 + neighbors: [] + type: loopback + virtual_interface: true + mgmt: + ifname: Management0 + ipv4: 192.168.121.105 + mac: ca:fe:00:05:00:00 + name: mon_srv + role: router +plugin: +- multiserver +provider: clab diff --git a/tests/topology/expected/multiserver-explicit.yml b/tests/topology/expected/multiserver-explicit.yml new file mode 100644 index 0000000000..f81b0f51f4 --- /dev/null +++ 
b/tests/topology/expected/multiserver-explicit.yml @@ -0,0 +1,382 @@ +--- +_multiserver: + assignment: + s1_g1: 1 + s1_member: 1 + s2_g1: 2 + s2_member: 2 + replicated: [] + server_map: + 1: + groups: + - server1_nodes + host: 192.168.128.1 + id: 1 + members: + - s1_member + vxlan_dev: eth1 + 2: + groups: + - server2_nodes + host: 192.168.128.2 + id: 2 + members: + - s2_member + vxlan_dev: eth1 +groups: + server1_nodes: + members: + - s1_g1 + server2_nodes: + members: + - s2_g1 +input: +- topology/input/multiserver-explicit.yml +- package:topology-defaults.yml +links: +- _linkname: links[1] + _ms: + cross: false + servers: + - 1 + interfaces: + - ifindex: 1 + ifname: Ethernet1 + ipv4: 10.1.0.1/30 + node: s1_g1 + - ifindex: 1 + ifname: Ethernet1 + ipv4: 10.1.0.2/30 + node: s1_member + linkindex: 1 + node_count: 2 + prefix: + ipv4: 10.1.0.0/30 + type: p2p +- _linkname: links[2] + _ms: + cross: false + servers: + - 2 + interfaces: + - ifindex: 1 + ifname: Ethernet1 + ipv4: 10.1.0.5/30 + node: s2_g1 + - ifindex: 1 + ifname: Ethernet1 + ipv4: 10.1.0.6/30 + node: s2_member + linkindex: 2 + node_count: 2 + prefix: + ipv4: 10.1.0.4/30 + type: p2p +- _linkname: links[3] + _ms: + cross: true + servers: + - 1 + - 2 + vni: 20000 + interfaces: + - ifindex: 2 + ifname: Ethernet2 + ipv4: 10.1.0.9/30 + node: s1_g1 + - ifindex: 2 + ifname: Ethernet2 + ipv4: 10.1.0.10/30 + node: s2_g1 + linkindex: 3 + node_count: 2 + prefix: + ipv4: 10.1.0.8/30 + type: p2p +- _linkname: links[4] + _ms: + cross: true + servers: + - 1 + - 2 + vni: 20001 + bridge: input_4 + interfaces: + - ifindex: 2 + ifname: Ethernet2 + ipv4: 172.16.0.2/24 + node: s1_member + - ifindex: 2 + ifname: Ethernet2 + ipv4: 172.16.0.4/24 + node: s2_member + - ifindex: 3 + ifname: Ethernet3 + ipv4: 172.16.0.1/24 + node: s1_g1 + linkindex: 4 + node_count: 3 + prefix: + ipv4: 172.16.0.0/24 + type: lan +multiserver: + assignment: explicit + output_dir: server-{server_id} + replicate: [] + servers: + - groups: + - server1_nodes + 
host: 192.168.128.1 + id: 1 + members: + - s1_member + vxlan_dev: eth1 + - groups: + - server2_nodes + host: 192.168.128.2 + id: 2 + members: + - s2_member + vxlan_dev: eth1 + vxlan: + dev: eth1 + dstport: 4789 + vni_base: 20000 +name: input +nodes: + s1_g1: + af: + ipv4: true + box: ceos:4.34.2F + clab: + env: + CLAB_MGMT_VRF: management + INTFTYPE: et + kind: ceos + device: eos + hostname: clab-input-s1_g1 + id: 1 + interfaces: + - clab: + name: et1 + ifindex: 1 + ifname: Ethernet1 + ipv4: 10.1.0.1/30 + linkindex: 1 + mac_address: caf0.0001.0001 + name: s1_g1 -> s1_member + neighbors: + - ifname: Ethernet1 + ipv4: 10.1.0.2/30 + node: s1_member + type: p2p + - clab: + name: et2 + ifindex: 2 + ifname: Ethernet2 + ipv4: 10.1.0.9/30 + linkindex: 3 + mac_address: caf0.0001.0002 + name: s1_g1 -> s2_g1 + neighbors: + - ifname: Ethernet2 + ipv4: 10.1.0.10/30 + node: s2_g1 + type: p2p + - bridge: input_4 + clab: + name: et3 + ifindex: 3 + ifname: Ethernet3 + ipv4: 172.16.0.1/24 + linkindex: 4 + mac_address: caf0.0001.0003 + name: s1_g1 -> [s1_member,s2_member] + neighbors: + - ifname: Ethernet2 + ipv4: 172.16.0.2/24 + node: s1_member + - ifname: Ethernet2 + ipv4: 172.16.0.4/24 + node: s2_member + type: lan + loopback: + ifindex: 0 + ifname: Loopback0 + ipv4: 10.0.0.1/32 + neighbors: [] + type: loopback + virtual_interface: true + mgmt: + ifname: Management0 + ipv4: 192.168.121.101 + mac: ca:fe:00:01:00:00 + name: s1_g1 + role: router + s1_member: + af: + ipv4: true + box: ceos:4.34.2F + clab: + env: + CLAB_MGMT_VRF: management + INTFTYPE: et + kind: ceos + device: eos + hostname: clab-input-s1_member + id: 2 + interfaces: + - clab: + name: et1 + ifindex: 1 + ifname: Ethernet1 + ipv4: 10.1.0.2/30 + linkindex: 1 + mac_address: caf0.0002.0001 + name: s1_member -> s1_g1 + neighbors: + - ifname: Ethernet1 + ipv4: 10.1.0.1/30 + node: s1_g1 + type: p2p + - bridge: input_4 + clab: + name: et2 + ifindex: 2 + ifname: Ethernet2 + ipv4: 172.16.0.2/24 + linkindex: 4 + mac_address: 
caf0.0002.0002 + name: s1_member -> [s2_member,s1_g1] + neighbors: + - ifname: Ethernet2 + ipv4: 172.16.0.4/24 + node: s2_member + - ifname: Ethernet3 + ipv4: 172.16.0.1/24 + node: s1_g1 + type: lan + loopback: + ifindex: 0 + ifname: Loopback0 + ipv4: 10.0.0.2/32 + neighbors: [] + type: loopback + virtual_interface: true + mgmt: + ifname: Management0 + ipv4: 192.168.121.102 + mac: ca:fe:00:02:00:00 + name: s1_member + role: router + s2_g1: + af: + ipv4: true + box: ceos:4.34.2F + clab: + env: + CLAB_MGMT_VRF: management + INTFTYPE: et + kind: ceos + device: eos + hostname: clab-input-s2_g1 + id: 3 + interfaces: + - clab: + name: et1 + ifindex: 1 + ifname: Ethernet1 + ipv4: 10.1.0.5/30 + linkindex: 2 + mac_address: caf0.0003.0001 + name: s2_g1 -> s2_member + neighbors: + - ifname: Ethernet1 + ipv4: 10.1.0.6/30 + node: s2_member + type: p2p + - clab: + name: et2 + ifindex: 2 + ifname: Ethernet2 + ipv4: 10.1.0.10/30 + linkindex: 3 + mac_address: caf0.0003.0002 + name: s2_g1 -> s1_g1 + neighbors: + - ifname: Ethernet2 + ipv4: 10.1.0.9/30 + node: s1_g1 + type: p2p + loopback: + ifindex: 0 + ifname: Loopback0 + ipv4: 10.0.0.3/32 + neighbors: [] + type: loopback + virtual_interface: true + mgmt: + ifname: Management0 + ipv4: 192.168.121.103 + mac: ca:fe:00:03:00:00 + name: s2_g1 + role: router + s2_member: + af: + ipv4: true + box: ceos:4.34.2F + clab: + env: + CLAB_MGMT_VRF: management + INTFTYPE: et + kind: ceos + device: eos + hostname: clab-input-s2_member + id: 4 + interfaces: + - clab: + name: et1 + ifindex: 1 + ifname: Ethernet1 + ipv4: 10.1.0.6/30 + linkindex: 2 + mac_address: caf0.0004.0001 + name: s2_member -> s2_g1 + neighbors: + - ifname: Ethernet1 + ipv4: 10.1.0.5/30 + node: s2_g1 + type: p2p + - bridge: input_4 + clab: + name: et2 + ifindex: 2 + ifname: Ethernet2 + ipv4: 172.16.0.4/24 + linkindex: 4 + mac_address: caf0.0004.0002 + name: s2_member -> [s1_member,s1_g1] + neighbors: + - ifname: Ethernet2 + ipv4: 172.16.0.2/24 + node: s1_member + - ifname: 
Ethernet3 + ipv4: 172.16.0.1/24 + node: s1_g1 + type: lan + loopback: + ifindex: 0 + ifname: Loopback0 + ipv4: 10.0.0.4/32 + neighbors: [] + type: loopback + virtual_interface: true + mgmt: + ifname: Management0 + ipv4: 192.168.121.104 + mac: ca:fe:00:04:00:00 + name: s2_member + role: router +plugin: +- multiserver +provider: clab diff --git a/tests/topology/input/multiserver-auto.yml b/tests/topology/input/multiserver-auto.yml new file mode 100644 index 0000000000..a4d8739117 --- /dev/null +++ b/tests/topology/input/multiserver-auto.yml @@ -0,0 +1,45 @@ +--- +# Test automatic assignment and replication in multiserver plugin +# +provider: clab +plugin: [ multiserver ] + +multiserver: + servers: + - id: 1 + host: 192.168.128.1 + - id: 2 + host: 192.168.128.2 + assignment: auto + replicate: [ mon_srv ] + +groups: + g1: + members: [ g1_n1, g1_n2 ] + g2: + members: [ g2_n1, g2_n2 ] + +nodes: + g1_n1: + device: eos + g1_n2: + device: eos + g2_n1: + device: eos + g2_n2: + device: eos + mon_srv: + device: eos + +links: + # Internal links within group 1 +- g1_n1: + g1_n2: + # Internal links within group 2 +- g2_n1: + g2_n2: + # Connected to replicated node +- mon_srv: + g1_n1: +- mon_srv: + g2_n1: diff --git a/tests/topology/input/multiserver-explicit.yml b/tests/topology/input/multiserver-explicit.yml new file mode 100644 index 0000000000..b50830025c --- /dev/null +++ b/tests/topology/input/multiserver-explicit.yml @@ -0,0 +1,54 @@ +--- +# Test explicit assignment in multiserver plugin +# +provider: clab +plugin: [ multiserver ] + +multiserver: + servers: + - id: 1 + host: 192.168.128.1 + groups: [ server1_nodes ] + members: [ s1_member ] + vxlan_dev: eth1 + - id: 2 + host: 192.168.128.2 + groups: [ server2_nodes ] + members: [ s2_member ] + vxlan_dev: eth1 + assignment: explicit + vxlan: + vni_base: 20000 + dstport: 4789 + dev: eth1 + +groups: + server1_nodes: + members: [ s1_g1 ] + server2_nodes: + members: [ s2_g1 ] + +nodes: + s1_g1: + device: eos + s1_member: + 
device: eos + s2_g1: + device: eos + s2_member: + device: eos + +links: + # Local link on server 1 +- s1_g1: + s1_member: + # Local link on server 2 +- s2_g1: + s2_member: + # Cross-server P2P link +- s1_g1: + s2_g1: + # Cross-server multi-access bridge link (3+ endpoints across servers) +- s1_member: + s2_member: + s1_g1: From c0c2a1ad33247171a877fe29f88f3bade2a9efe5 Mon Sep 17 00:00:00 2001 From: Muddyblack Date: Wed, 20 May 2026 18:20:35 +0200 Subject: [PATCH 2/8] explain interface overriding --- docs/plugins/multiserver.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/docs/plugins/multiserver.md b/docs/plugins/multiserver.md index 8503ae16f0..445262701f 100644 --- a/docs/plugins/multiserver.md +++ b/docs/plugins/multiserver.md @@ -60,6 +60,8 @@ Global VXLAN settings are specified in the **multiserver.vxlan** dictionary: | **dstport** | integer | UDP destination port for VXLAN traffic (default: `4789`) | | **dev** | string | Default physical interface to bind VXLAN tunnels (default: `ens33`) | +By default, VXLAN tunnels bind to the global default interface specified in **multiserver.vxlan.dev** (which falls back to `ens33` if not configured). If your physical servers use different interface names, you can override this interface per-server using the **vxlan_dev** parameter under each server in the **multiserver.servers** list. + (multiserver-assignment)= ## Assignment Modes @@ -194,12 +196,14 @@ multiserver: - id: 1 host: 192.168.168.128 groups: [ spines ] + vxlan_dev: ens33 # Override per-server (optional) - id: 2 host: 192.168.168.129 groups: [ leaves ] + vxlan_dev: eth0 # Override per-server (optional) vxlan: vni_base: 10000 - dev: ens33 + dev: ens33 # Global default interface ``` This places spines on server 1 and leaves on server 2. All four links cross servers and are provisioned as containerlab native VXLAN endpoints. 
From cdb71d53f43e29e73af3272b4dca38ff1e0ac39b Mon Sep 17 00:00:00 2001 From: Muddyblack Date: Wed, 20 May 2026 19:14:04 +0200 Subject: [PATCH 3/8] divide and conquer --- netsim/extra/multiserver/plugin.py | 482 ++++++++++++----------------- 1 file changed, 196 insertions(+), 286 deletions(-) diff --git a/netsim/extra/multiserver/plugin.py b/netsim/extra/multiserver/plugin.py index 888e39d96c..e246f87662 100644 --- a/netsim/extra/multiserver/plugin.py +++ b/netsim/extra/multiserver/plugin.py @@ -2,55 +2,7 @@ multiserver plugin — split a netlab topology across multiple physical servers. Generates per-server containerlab topology files with cross-server VXLAN links. -Requires containerlab >= 0.46 for native VXLAN link support. - -Cross-server links: - - * P2P links (2 endpoints) → containerlab native VXLAN (type: vxlan in clab.yml) - * Multi-access links (3+ endpoints, bridge) → local bridge + host-level VXLAN tunnel - created by a generated vxlan-setup.sh script - -Server assignment modes: - - * explicit (default) — user assigns nodes via groups/members, unassigned nodes cause - an error. Best when you need precise control over placement. - * auto — unassigned nodes are distributed round-robin across servers. Use this for - automatic splitting: just define the servers and let the plugin balance the nodes. - -Group granularity (auto mode): - - Auto mode keeps entire netlab groups together on one server. Define groups at - the smallest unit you want to keep on a single server. Parent/aggregate groups - are fine — child groups defined first will claim their members before the parent - is reached. See docs/plugins/multiserver.md for details and examples. 
- -Explicit assignment example: - - plugin: [ multiserver ] - - multiserver: - servers: - - id: 1 - host: 192.168.168.128 - groups: [ hubs ] - members: [ extra-node ] - - id: 2 - host: 10.0.0.67 - groups: [ spines, leaves ] - assignment: explicit - -Automatic splitting example (no groups/members needed): - - plugin: [ multiserver ] - - multiserver: - servers: - - id: 1 - host: 192.168.168.128 - - id: 2 - host: 10.0.0.67 - assignment: auto - replicate: [ prometheus, grafana ] +See docs/plugins/multiserver.md for usage, examples, and configuration reference. """ import os @@ -60,13 +12,13 @@ import yaml from box import Box -from packaging import version as _pv from netsim.data import append_to_list from netsim.utils import log _execute_after = ["fabric", "node.clone"] + # --------------------------------------------------------------------------- # Hook: init — validate config + register output hook # --------------------------------------------------------------------------- @@ -80,12 +32,9 @@ def init(topology: Box) -> None: # Merge plugin defaults with user config (user values take priority) defaults = topology.defaults.get("multiserver", Box({})) topology.multiserver = defaults + ms - ms = topology.multiserver servers = ms.get("servers", []) - # Currently only containerlab is supported — generating per-server Vagrantfiles - # for libvirt/virtualbox would require reimplementing the Vagrant Ruby DSL provider = topology.get("provider", "") or topology.defaults.get("provider", "") if provider and provider != "clab": log.error( @@ -96,19 +45,6 @@ def init(topology: Box) -> None: ) return - # Cross-server P2P links use containerlab native VXLAN endpoints (type: vxlan), - # available since containerlab 0.46. netlab already requires >= 0.75 so this - # should always pass, but check explicitly in case the requirement is relaxed. 
- clab_min = "0.46.0" - clab_ver = str(topology.defaults.providers.clab.get("version", "0.0.0")) - if _pv.Version(clab_ver) < _pv.Version(clab_min): - log.error( - f"multiserver plugin requires containerlab >= {clab_min} for VXLAN links (netlab targets {clab_ver})", - log.IncorrectValue, - "multiserver", - ) - return - if not servers: log.error('multiserver plugin requires a "servers" list', log.MissingValue, "multiserver") return @@ -117,18 +53,7 @@ def init(topology: Box) -> None: log.error("multiserver plugin requires at least 2 servers", log.IncorrectValue, "multiserver") return - seen_ids: set = set() - for idx, s in enumerate(servers): - if "id" not in s: - log.error(f'Server entry #{idx + 1} missing required "id" field', log.MissingValue, "multiserver") - continue - if "host" not in s: - log.error(f'Server {s.id} missing required "host" field', log.MissingValue, "multiserver") - continue - if s.id in seen_ids: - log.error(f"Duplicate server id {s.id}", log.IncorrectValue, "multiserver") - seen_ids.add(s.id) - + _validate_servers(servers) log.exit_on_error() # Register the output hook so netlab create calls our output() function @@ -145,54 +70,13 @@ def post_transform(topology: Box) -> None: if not ms: return - servers = ms.servers - server_map = {s.id: s for s in servers} - assignment: dict = {} # node_name -> server_id - # --- Resolve replicated nodes (present on every server) --- - replicated: set = set() - for entry in ms.get("replicate", []): - if entry in topology.nodes: - replicated.add(entry) - elif entry in topology.get("groups", {}): - for member in topology.groups[entry].get("members", []): - replicated.add(member) - else: - log.error(f'multiserver.replicate: "{entry}" is not a node or group', log.IncorrectValue, "multiserver") - - # --- Resolve assignments from server groups + members --- - for server in servers: - for gname in server.get("groups", []): - grp = topology.get("groups", {}).get(gname, None) - if grp is None: - log.error(f'Server 
{server.id} references unknown group "{gname}"', log.IncorrectValue, "multiserver") - continue - for member in grp.get("members", []): - if member in assignment and assignment[member] != server.id: - log.error( - f"Node {member} assigned to both server {assignment[member]} and {server.id}", - log.IncorrectValue, - "multiserver", - ) - assignment[member] = server.id - - for member in server.get("members", []): - if member not in topology.nodes: - log.error(f'Server {server.id} references unknown node "{member}"', log.IncorrectValue, "multiserver") - continue - if member in assignment and assignment[member] != server.id: - log.error( - f"Node {member} assigned to both server {assignment[member]} and {server.id}", - log.IncorrectValue, - "multiserver", - ) - assignment[member] = server.id - - # --- Handle unassigned nodes (replicated nodes are exempt) --- - unassigned = set(n for n in topology.nodes if n not in assignment and n not in replicated) + server_map = {s.id: s for s in ms.servers} + replicated = _resolve_replicated(ms, topology) + assignment = _resolve_assignments(ms.servers, topology) - mode = ms.get("assignment", "explicit") + unassigned = {n for n in topology.nodes if n not in assignment and n not in replicated} if unassigned: - if mode == "explicit": + if ms.get("assignment", "explicit") == "explicit": log.error( f"Nodes not assigned to any server: {', '.join(sorted(unassigned))}", log.MissingValue, @@ -203,112 +87,21 @@ def post_transform(topology: Box) -> None: ], ) else: - sorted_sids = sorted(server_map.keys()) - - # Distribute by netlab group to keep related nodes on the same server. - # Groups are assigned round-robin by size (largest first) for balance. - # Ungrouped nodes are distributed individually at the end. 
- group_buckets: list = [] # [(group_name, [members])] - claimed = set() - for gname, gdata in topology.get("groups", {}).items(): - members = [m for m in gdata.get("members", []) if m in unassigned and m not in claimed] - if members: - group_buckets.append((gname, members)) - claimed.update(members) - - # Sort groups largest-first for better balance - group_buckets.sort(key=lambda g: -len(g[1])) - - # Track node counts per server for balanced distribution - counts = {sid: sum(1 for s in assignment.values() if s == sid) for sid in sorted_sids} - - for gname, members in group_buckets: - # Assign entire group to the server with the fewest nodes - target = min(sorted_sids, key=lambda s: counts[s]) - for m in members: - assignment[m] = target - counts[target] += len(members) - - # Remaining ungrouped nodes: round-robin to least-loaded server - ungrouped = sorted(unassigned - claimed) - for name in ungrouped: - target = min(sorted_sids, key=lambda s: counts[s]) - assignment[name] = target - counts[target] += 1 + _auto_distribute(unassigned, server_map, assignment, topology) log.exit_on_error() - # --- Classify links: local vs cross-server --- vni_base = ms.vxlan.get("vni_base", 10000) - vni = vni_base - cross_count = 0 - - for link in topology.links: - link_servers = set() - for intf in link.get("interfaces", []): - if intf.node in replicated: - continue - sid = assignment.get(intf.node) - if sid is not None: - link_servers.add(sid) - - if len(link_servers) > 1: - link._ms = Box({"cross": True, "vni": vni, "servers": sorted(link_servers)}) - vni += 1 - cross_count += 1 - else: - link._ms = Box( - { - "cross": False, - "servers": sorted(link_servers), - } - ) - - if vni > 16777215: - log.error(f"VXLAN VNI overflow: {vni} exceeds 24-bit maximum (16777215)", log.IncorrectValue, "multiserver") - + cross_count = _classify_links(topology, assignment, replicated, vni_base) log.exit_on_error() - # Store state for output hook - topology._multiserver = Box( - { - "assignment": 
assignment, - "server_map": server_map, - "replicated": sorted(replicated), - } - ) - - # Summary — show which groups and nodes landed on each server - for server in servers: - sid = server.id - server_nodes = sorted(n for n, s in assignment.items() if s == sid) + topology._multiserver = Box({ + "assignment": assignment, + "server_map": server_map, + "replicated": sorted(replicated), + }) - # Figure out which netlab groups are fully on this server - server_groups = [] - for gname, gdata in topology.get("groups", {}).items(): - members = gdata.get("members", []) - if not members: - continue - on_this = [m for m in members if assignment.get(m) == sid] - if on_this and len(on_this) == len([m for m in members if m in assignment]): - server_groups.append(gname) - - n = len(server_nodes) - log.info(f"Server {sid} ({server.host}): {n} nodes", module="multiserver") - if server_groups: - preview = server_groups[:8] - suffix = f" ... +{len(server_groups) - 8} more" if len(server_groups) > 8 else "" - log.info(f" groups: {', '.join(preview)}{suffix}", module="multiserver") - if n <= 20: - log.info(f" nodes: {', '.join(server_nodes)}", module="multiserver") - else: - preview = server_nodes[:6] - log.info(f" nodes: {', '.join(preview)} ... +{n - 6} more", module="multiserver") - - if replicated: - log.info(f"Replicated on all servers: {', '.join(sorted(replicated))}", module="multiserver") - if cross_count: - log.info(f"{cross_count} cross-server links (VNI {vni_base}–{vni - 1})", module="multiserver") + _log_assignment_summary(ms, assignment, replicated, topology, vni_base, cross_count) # --------------------------------------------------------------------------- @@ -380,13 +173,11 @@ def output(topology: Box) -> None: def _distribute_files_atexit(lab_folder: str, server_folders: list) -> None: - """Distribute generated files (node_files, host_vars, ansible.cfg, hosts.yml) - to each server folder. Registered via atexit so it runs AFTER netlab has - written all output files. 
- """ + """Distribute generated files""" lab_path = Path(lab_folder) nf_dir = lab_path / "node_files" hv_dir = lab_path / "host_vars" + server_names = {Path(sf).name for sf, _ in server_folders} for sf, local_nodes in server_folders: sf_path = Path(sf) @@ -399,15 +190,7 @@ def _distribute_files_atexit(lab_folder: str, server_folders: list) -> None: dst_nf.mkdir(exist_ok=True) for item in nf_dir.iterdir(): if item.name in local_nodes or item.name.startswith("-"): - dst = dst_nf / item.name - if not dst.exists(): - try: - if item.is_dir(): - shutil.copytree(item, dst) - else: - shutil.copy2(item, dst) - except Exception: - pass + _copy_if_missing(item, dst_nf / item.name) # host_vars: per-node only if hv_dir.is_dir(): @@ -415,48 +198,189 @@ def _distribute_files_atexit(lab_folder: str, server_folders: list) -> None: dst_hv.mkdir(exist_ok=True) for item in hv_dir.iterdir(): if item.name in local_nodes: - dst = dst_hv / item.name - if not dst.exists(): - try: - if item.is_dir(): - shutil.copytree(item, dst) - else: - shutil.copy2(item, dst) - except Exception: - pass + _copy_if_missing(item, dst_hv / item.name) # Copy all other subdirectories (e.g. group_vars, templates, monitoring) # excluding server folders, node_files, host_vars, and python/git metadata. 
- server_names = {Path(sf).name for sf, _ in server_folders} + skip = server_names | {"node_files", "host_vars", "__pycache__", ".git"} for item in lab_path.iterdir(): - if item.is_dir(): - if item.name in server_names or item.name in ("node_files", "host_vars", "__pycache__", ".git"): - continue - - # Optimization: only copy grafana directory if this server hosts the grafana node - if item.name == "grafana" and "grafana" not in local_nodes: - continue - - dst_dir = sf_path / item.name - if not dst_dir.exists(): - try: - shutil.copytree(item, dst_dir) - except Exception: - pass + if not item.is_dir() or item.name in skip: + continue + # Only copy grafana directory if this server hosts the grafana node + if item.name == "grafana" and "grafana" not in local_nodes: + continue + _copy_if_missing(item, sf_path / item.name) # Ansible inventory and config for fname in ("ansible.cfg", "hosts.yml"): src = lab_path / fname - dst = sf_path / fname - if src.exists() and not dst.exists(): - try: - shutil.copy2(src, dst) - except Exception: - pass + if src.exists(): + _copy_if_missing(src, sf_path / fname) + + +def _copy_if_missing(src: Path, dst: Path) -> None: + if dst.exists(): + return + try: + if src.is_dir(): + shutil.copytree(src, dst) + else: + shutil.copy2(src, dst) + except Exception: + pass # =========================================================================== -# Internal helpers +# Internal helpers — post_transform +# =========================================================================== + + +def _validate_servers(servers: list) -> None: + seen_ids: set = set() + for idx, s in enumerate(servers): + if "id" not in s: + log.error(f'Server entry #{idx + 1} missing required "id" field', log.MissingValue, "multiserver") + continue + if "host" not in s: + log.error(f'Server {s.id} missing required "host" field', log.MissingValue, "multiserver") + continue + if s.id in seen_ids: + log.error(f"Duplicate server id {s.id}", log.IncorrectValue, "multiserver") + 
seen_ids.add(s.id) + + +def _resolve_replicated(ms: Box, topology: Box) -> set: + replicated: set = set() + for entry in ms.get("replicate", []): + if entry in topology.nodes: + replicated.add(entry) + elif entry in topology.get("groups", {}): + for member in topology.groups[entry].get("members", []): + replicated.add(member) + else: + log.error(f'multiserver.replicate: "{entry}" is not a node or group', log.IncorrectValue, "multiserver") + return replicated + + +def _resolve_assignments(servers: list, topology: Box) -> dict: + assignment: dict = {} + for server in servers: + for gname in server.get("groups", []): + grp = topology.get("groups", {}).get(gname, None) + if grp is None: + log.error(f'Server {server.id} references unknown group "{gname}"', log.IncorrectValue, "multiserver") + continue + for member in grp.get("members", []): + if member in assignment and assignment[member] != server.id: + log.error( + f"Node {member} assigned to both server {assignment[member]} and {server.id}", + log.IncorrectValue, + "multiserver", + ) + assignment[member] = server.id + + for member in server.get("members", []): + if member not in topology.nodes: + log.error(f'Server {server.id} references unknown node "{member}"', log.IncorrectValue, "multiserver") + continue + if member in assignment and assignment[member] != server.id: + log.error( + f"Node {member} assigned to both server {assignment[member]} and {server.id}", + log.IncorrectValue, + "multiserver", + ) + assignment[member] = server.id + + return assignment + + +def _auto_distribute(unassigned: set, server_map: dict, assignment: dict, topology: Box) -> None: + """Distribute unassigned nodes across servers, keeping netlab groups together.""" + sorted_sids = sorted(server_map.keys()) + counts = {sid: sum(1 for s in assignment.values() if s == sid) for sid in sorted_sids} + + # Build group buckets: keep group members together, distribute largest groups first + claimed: set = set() + group_buckets: list = [] + for gdata 
in topology.get("groups", {}).values(): + members = [m for m in gdata.get("members", []) if m in unassigned and m not in claimed] + if members: + group_buckets.append(members) + claimed.update(members) + group_buckets.sort(key=lambda g: -len(g)) + + for members in group_buckets: + target = min(sorted_sids, key=lambda s: counts[s]) + for m in members: + assignment[m] = target + counts[target] += len(members) + + # Remaining ungrouped nodes: one by one to least-loaded server + for name in sorted(unassigned - claimed): + target = min(sorted_sids, key=lambda s: counts[s]) + assignment[name] = target + counts[target] += 1 + + +def _classify_links(topology: Box, assignment: dict, replicated: set, vni_base: int) -> int: + """Assign _ms metadata to each link; return the number of cross-server links.""" + vni = vni_base + for link in topology.links: + link_servers = { + assignment[i.node] + for i in link.get("interfaces", []) + if i.node not in replicated and i.node in assignment + } + if len(link_servers) > 1: + link._ms = Box({"cross": True, "vni": vni, "servers": sorted(link_servers)}) + vni += 1 + else: + link._ms = Box({"cross": False, "servers": sorted(link_servers)}) + + if vni > 16777215: + log.error(f"VXLAN VNI overflow: {vni} exceeds 24-bit maximum (16777215)", log.IncorrectValue, "multiserver") + + return vni - vni_base + + +def _log_assignment_summary( + ms: Box, assignment: dict, replicated: set, topology: Box, vni_base: int, cross_count: int +) -> None: + for server in ms.servers: + sid = server.id + server_nodes = sorted(n for n, s in assignment.items() if s == sid) + n = len(server_nodes) + + server_groups = [] + for gname, gdata in topology.get("groups", {}).items(): + members = gdata.get("members", []) + if not members: + continue + on_this = [m for m in members if assignment.get(m) == sid] + assigned = [m for m in members if m in assignment] + if on_this and len(on_this) == len(assigned): + server_groups.append(gname) + + log.info(f"Server {sid} 
({server.host}): {n} nodes", module="multiserver") + if server_groups: + preview = server_groups[:8] + suffix = f" ... +{len(server_groups) - 8} more" if len(server_groups) > 8 else "" + log.info(f" groups: {', '.join(preview)}{suffix}", module="multiserver") + if n <= 20: + log.info(f" nodes: {', '.join(server_nodes)}", module="multiserver") + else: + log.info(f" nodes: {', '.join(server_nodes[:6])} ... +{n - 6} more", module="multiserver") + + if replicated: + log.info(f"Replicated on all servers: {', '.join(sorted(replicated))}", module="multiserver") + if cross_count: + log.info( + f"{cross_count} cross-server links (VNI {vni_base}–{vni_base + cross_count - 1})", module="multiserver" + ) + + +# =========================================================================== +# Internal helpers — clab.yml generation # =========================================================================== @@ -727,12 +651,7 @@ def _render_bridge_vxlan( ) # VXLAN tunnels to each remote server that has endpoints on this link - remote_sids: set = set() - for intf in link.get("interfaces", []): - s = assignment.get(intf.node) - if s is not None and s != local_sid: - remote_sids.add(s) - + remote_sids = {assignment[i.node] for i in link.get("interfaces", []) if assignment.get(i.node) not in (None, local_sid)} for rsid in sorted(remote_sids): vxlan_tunnels.append( { @@ -751,16 +670,11 @@ def _render_bridge_vxlan( def _write_server_snapshot(topology: Box, local_nodes: set, out_dir: str) -> None: - """Write a filtered netlab snapshot containing only this server's nodes. - - This allows 'netlab up --snapshot' to work correctly from a per-server - directory — only local nodes will be targeted for configuration deployment. + """Write a filtered netlab snapshot for this server's nodes only. - Note: make_paths_absolute() must be called on the copy before pickling so - that the computed f_files / f_tasks / f_dirs keys are present in the - snapshot. 
The main netlab snapshot (outputs/pickle.py) is written *after* - create.py calls make_paths_absolute(), so it already contains those keys. - Plugin output() hooks run *before* that call, so we have to do it ourselves. + Allows 'netlab up --snapshot' to work from a per-server directory. + make_paths_absolute() is called here explicitly because output() hooks run + before create.py does it — without it the snapshot is missing f_files/f_tasks/f_dirs. """ from netsim import __version__ from netsim.augment.config import make_paths_absolute @@ -775,10 +689,6 @@ def _write_server_snapshot(topology: Box, local_nodes: set, out_dir: str) -> Non topo_copy.links = [l for l in topo_copy.links if any(i.node in local_nodes for i in l.get("interfaces", []))] # Expand paths (add f_files / f_tasks / f_dirs computed keys). - # create.py calls make_paths_absolute() AFTER plugin output() hooks, so the - # main snapshot has these keys but our per-server copies don't yet. - # netlab initial relies on topology.defaults.paths.t_files.f_files, so we - # must add them before pickling. 
make_paths_absolute(topo_copy.defaults.paths) # Remove prefix generators and serialize From c9f2a6f4c5491a0b47189252db5c4fc97295ec04 Mon Sep 17 00:00:00 2001 From: Muddyblack Date: Wed, 20 May 2026 19:39:37 +0200 Subject: [PATCH 4/8] rm --- from yml --- tests/topology/expected/multiserver-auto.yml | 1 - tests/topology/expected/multiserver-explicit.yml | 1 - 2 files changed, 2 deletions(-) diff --git a/tests/topology/expected/multiserver-auto.yml b/tests/topology/expected/multiserver-auto.yml index 747ec73d67..85559675e2 100644 --- a/tests/topology/expected/multiserver-auto.yml +++ b/tests/topology/expected/multiserver-auto.yml @@ -1,4 +1,3 @@ ---- _multiserver: assignment: g1_n1: 1 diff --git a/tests/topology/expected/multiserver-explicit.yml b/tests/topology/expected/multiserver-explicit.yml index f81b0f51f4..70fcb7beec 100644 --- a/tests/topology/expected/multiserver-explicit.yml +++ b/tests/topology/expected/multiserver-explicit.yml @@ -1,4 +1,3 @@ ---- _multiserver: assignment: s1_g1: 1 From 411ccb6b1045f522da7e2b759ff4895db53e15f3 Mon Sep 17 00:00:00 2001 From: Muddyblack Date: Mon, 1 Jun 2026 18:06:11 +0200 Subject: [PATCH 5/8] Refactor and review-edits --- docs/plugins/multiserver.md | 101 ++- netsim/data/types.py | 38 + netsim/extra/multiserver/defaults.yml | 33 +- netsim/extra/multiserver/plugin.py | 768 ++++++++---------- netsim/extra/multiserver/vxlan-setup.j2 | 19 + netsim/extra/multiserver/vxlan-teardown.j2 | 10 + netsim/templates/provider/clab/clab.j2 | 26 +- tests/topology/expected/multiserver-auto.yml | 18 +- .../expected/multiserver-explicit.yml | 303 +++++-- tests/topology/input/multiserver-auto.yml | 8 +- tests/topology/input/multiserver-explicit.yml | 36 +- 11 files changed, 808 insertions(+), 552 deletions(-) create mode 100644 netsim/extra/multiserver/vxlan-setup.j2 create mode 100644 netsim/extra/multiserver/vxlan-teardown.j2 diff --git a/docs/plugins/multiserver.md b/docs/plugins/multiserver.md index 445262701f..93686b767f 100644 --- 
a/docs/plugins/multiserver.md +++ b/docs/plugins/multiserver.md @@ -11,18 +11,16 @@ The *multiserver* plugin distributes a single *netlab* topology across multiple ``` ```{warning} -* The *multiserver* plugin requires the **containerlab** provider on all servers. -* Containerlab version >= `0.46` is required for native VXLAN link endpoint support. * All physical servers must have direct IP reachability (e.g. over a management network or dedicated interconnect). ``` ## Using the Plugin * Add `plugin: [ multiserver ]` to lab topology. -* Define target servers in the **multiserver.servers** list. +* Define target servers in the **multiserver.servers** dictionary. * Choose an assignment mode (`explicit` or `auto`) with **multiserver.assignment**. -The plugin runs during `netlab create` and generates self-contained per-server directories (e.g. `server-1/`, `server-2/`) with tailored `clab.yml` files, node configs, and VXLAN scripts ready for deployment. +The plugin runs during `netlab create` and generates self-contained per-server directories (e.g. `server-srv1/`, `server-srv2/`) with tailored `clab.yml` files, node configs, and VXLAN scripts ready for deployment. 
## Configuring Plugin Parameters @@ -31,23 +29,28 @@ The plugin is configured with the **multiserver** topology-level dictionary that | Parameter | Type | Meaning | |-----------|------|---------| | **assignment** | string | How to assign nodes to servers: `explicit` (default) or `auto` | -| **servers** | list | List of target physical servers | +| **servers** | dictionary | Target physical servers, keyed by server name | | **vxlan** | dictionary | Global settings for VXLAN tunnels | | **replicate** | list | Nodes or groups that must be duplicated on all servers | -| **output_dir** | string | Template for per-server directory names (default: `server-{server_id}`) | +| **output_dir** | string | Template for per-server directory names (default: `server-{server_name}`); supports `{server_name}`, `{server_id}`, and `{name}` (topology name) | +| **copy_dirs** | list | Subdirectories copied into every server directory (default: `[group_vars, templates]`); overrides the default list | +| **copy_files** | list | Top-level files copied into every server directory (default: `[ansible.cfg]`); overrides the default list | +| **extra_copy_dirs** | list | Additional subdirectories to copy on top of **copy_dirs** | +| **extra_copy_files** | list | Additional top-level files to copy on top of **copy_files** | (multiserver-servers)= ### Server Parameters -Each entry in the **multiserver.servers** list supports these parameters: +The **multiserver.servers** dictionary is keyed by server name (e.g. `srv1`, `dc-east`). The name is used for per-server directory names and log messages, and because servers are a dictionary, duplicate server names are impossible. Each entry supports these parameters: | Parameter | Type | Meaning | |-----------|------|---------| -| **id** | integer | Unique identifier for the server (e.g. 
`1`, `2`) | +| **id** | integer | Numeric identifier used for VXLAN bookkeeping; auto-assigned if omitted | | **host** | string | IP address or hostname of the remote server | | **groups** | list | *netlab* groups whose members are assigned to this server | | **members** | list | Individual node names assigned to this server | | **vxlan_dev** | string | Physical interface to bind VXLAN tunnels to on this server | +| **weight** | integer | Relative capacity for auto-assignment (default: `1`); a server with `weight: 2` absorbs twice as many nodes before being considered as loaded as a server with `weight: 1` | (multiserver-vxlan)= ### VXLAN Parameters @@ -60,7 +63,7 @@ Global VXLAN settings are specified in the **multiserver.vxlan** dictionary: | **dstport** | integer | UDP destination port for VXLAN traffic (default: `4789`) | | **dev** | string | Default physical interface to bind VXLAN tunnels (default: `ens33`) | -By default, VXLAN tunnels bind to the global default interface specified in **multiserver.vxlan.dev** (which falls back to `ens33` if not configured). If your physical servers use different interface names, you can override this interface per-server using the **vxlan_dev** parameter under each server in the **multiserver.servers** list. +By default, VXLAN tunnels bind to the global default interface specified in **multiserver.vxlan.dev** (which falls back to `ens33` if not configured). If your physical servers use different interface names, you can override this interface per-server using the **vxlan_dev** parameter under each server in the **multiserver.servers** dictionary. 
(multiserver-assignment)= ## Assignment Modes @@ -75,11 +78,11 @@ plugin: [ multiserver ] multiserver: assignment: explicit servers: - - id: 1 + srv1: host: 192.168.168.128 groups: [ core ] members: [ edge-node ] - - id: 2 + srv2: host: 192.168.168.129 groups: [ spines, leaves ] ``` @@ -88,10 +91,10 @@ multiserver: In `auto` mode, nodes that are not explicitly pinned to a server are distributed automatically using a greedy balancing algorithm: -1. Nodes belonging to a *netlab* group are kept together — the entire group is placed on the server that currently has the fewest nodes. Larger groups are placed first for better balance. +1. Nodes belonging to a *netlab* group are kept together — the entire group is placed on the server with the lowest current load. Larger groups are placed first for better balance. 2. Remaining ungrouped nodes are assigned one at a time to the least-loaded server. -Nodes already pinned via **groups** or **members** attributes count toward server load, so the algorithm balances around any explicit assignments. +**Load** is defined as `(assigned node count) / weight`, where **weight** defaults to `1`. Nodes already pinned via **groups** or **members** attributes count toward server load, so the algorithm balances around any explicit assignments. ```yaml plugin: [ multiserver ] @@ -99,12 +102,26 @@ plugin: [ multiserver ] multiserver: assignment: auto servers: - - id: 1 + srv1: host: 192.168.168.128 - - id: 2 + srv2: host: 192.168.168.129 ``` +Use **weight** to account for servers with different capacities. 
A server with `weight: 2` is treated as twice as capable and absorbs proportionally more nodes before being considered equally loaded: + +```yaml +multiserver: + assignment: auto + servers: + srv1: + host: 192.168.168.128 + weight: 1 # smaller server + srv2: + host: 192.168.168.129 + weight: 2 # larger server — gets roughly twice as many nodes +``` + ```{tip} You can pin specific nodes or groups to a server in `auto` mode using **groups** and **members** attributes. Only unassigned nodes are auto-distributed. ``` @@ -152,9 +169,9 @@ Links connecting to replicated nodes are always treated as local, so traffic bet multiserver: assignment: auto servers: - - id: 1 + srv1: host: 192.168.168.128 - - id: 2 + srv2: host: 192.168.168.129 replicate: [ prometheus, grafana ] ``` @@ -193,11 +210,11 @@ links: multiserver: assignment: explicit servers: - - id: 1 + spine-host: host: 192.168.168.128 groups: [ spines ] vxlan_dev: ens33 # Override per-server (optional) - - id: 2 + leaf-host: host: 192.168.168.129 groups: [ leaves ] vxlan_dev: eth0 # Override per-server (optional) @@ -206,7 +223,7 @@ multiserver: dev: ens33 # Global default interface ``` -This places spines on server 1 and leaves on server 2. All four links cross servers and are provisioned as containerlab native VXLAN endpoints. +This places spines on `spine-host` and leaves on `leaf-host`. All four links cross servers and are provisioned as containerlab native VXLAN endpoints. 
## Behind the Scenes @@ -220,12 +237,18 @@ Each per-server directory is self-contained and includes: * A tailored `clab.yml` with only the relevant nodes and cross-server VXLAN interfaces * A filtered `netlab.snapshot.pickle` for use with `netlab up --snapshot` -* Copies of `node_files/`, `host_vars/`, and Ansible config for only the nodes on that server -* `vxlan-setup.sh` and `vxlan-teardown.sh` scripts (when multi-access VXLAN tunnels are needed) +* A filtered `hosts.yml` containing only the nodes assigned to that server, so `netlab initial` does not attempt to configure nodes on other servers +* Copies of `node_files/` and `host_vars/` for only the nodes on that server +* Copies of the directories and files listed in **multiserver.copy_dirs** and **multiserver.copy_files** +* Per-server `vxlan-setup.sh` and `vxlan-teardown.sh` scripts (when multi-access VXLAN tunnels are needed), registered in that server's snapshot as [CLI hooks](dev-cli-hooks) (`netlab.up.post_start_clab` / `netlab.down.pre_stop_clab`) so `netlab up` and `netlab down` run them automatically on the remote host (multiserver-deployment)= ## Deployment Workflow +```{note} +The plugin does **not** orchestrate remote servers. It runs only on the control node during `netlab create`, where it generates a self-contained directory per server. It never opens SSH connections, runs commands remotely, or copies files to other hosts. You copy each directory to its server yourself (Step 2), and `netlab` then runs **independently on each server** (Step 3) — the per-server VXLAN CLI hooks fire locally on that server, not from the control node. +``` + **Step 1: Generate configurations** on your workstation: ```bash @@ -237,17 +260,18 @@ The plugin automatically copies all required files into each server directory **Step 2: Copy server directories to remote hosts** (e.g. 
via rsync): ```bash -rsync -avz server-1/ user@192.168.168.128:~/lab/server-1/ -rsync -avz server-2/ user@192.168.168.129:~/lab/server-2/ +rsync -avz server-spine-host/ user@192.168.168.128:~/lab/server-spine-host/ +rsync -avz server-leaf-host/ user@192.168.168.129:~/lab/server-leaf-host/ ``` **Step 3: Deploy on each server** by running the following on each remote host: ```bash sudo netlab up --snapshot -vv -sudo ./vxlan-setup.sh # only if multi-access VXLAN tunnels are present ``` +When multi-access VXLAN tunnels are present, `netlab up` runs `vxlan-setup.sh` automatically via a [CLI hook](dev-cli-hooks) registered by the plugin. + ```{important} **Why is `--snapshot` required on remote servers?** @@ -258,13 +282,34 @@ Running with `topology.yml` directly on remote servers will fail because: 2. **Recursion**: Running `netlab create` on `topology.yml` on the remote hosts would execute the `multiserver` plugin again, causing it to split the topology recursively and generate nested server subdirectories. ``` -**Teardown** in reverse order: +**Teardown** on each server: ```bash -sudo ./vxlan-teardown.sh -sudo clab destroy -t clab.yml +sudo netlab down ``` +When multi-access VXLAN tunnels are present, `netlab down` runs `vxlan-teardown.sh` automatically via a CLI hook registered by the plugin. + +## Customising What Gets Copied + +By default, the plugin copies `group_vars/` and `templates/` subdirectories, plus `ansible.cfg`, into every server directory. 
To add extra items on top of the defaults, use **extra_copy_dirs** and **extra_copy_files**: + +```yaml +multiserver: + extra_copy_dirs: [ monitoring ] + extra_copy_files: [ netlab.lock ] +``` + +To replace the defaults entirely, use **copy_dirs** and **copy_files**: + +```yaml +multiserver: + copy_dirs: [ group_vars, templates, monitoring ] + copy_files: [ ansible.cfg, netlab.lock ] +``` + +The Ansible inventory (`hosts.yml`) is always written into each server directory and is automatically filtered to contain only the nodes assigned to that server. + ## Limitations * Only the **containerlab** provider is supported. Libvirt and virtualbox topologies cannot be split across servers. diff --git a/netsim/data/types.py b/netsim/data/types.py index b4b45e0ba4..69ba3538e0 100644 --- a/netsim/data/types.py +++ b/netsim/data/types.py @@ -996,6 +996,44 @@ def must_be_node_id(value: typing.Any) -> dict: return { '_valid': True } +@type_test() +def must_be_group_id(value: typing.Any) -> dict: + if not isinstance(value,str): # Otherwise it must be a string + return { '_type': 'valid group name (a string)' } + + topology = global_vars.get_topology() # Try to get current lab topology + if topology is None: # pragma: no-cover + log.fatal('Calling group_id validation before the topology has been initialized') + + if value not in topology.get('groups',{}): + return { + '_type': "group", + '_value': f"valid group name (found {value})", + '_hint_id': "groups", + '_hint': "Valid group names are "+", ".join(list(topology.get('groups',{}))) + } + + return { '_valid': True } + +@type_test() +def must_be_node_or_group(value: typing.Any) -> dict: + if not isinstance(value,str): # Otherwise it must be a string + return { '_type': 'valid node or group name (a string)' } + + topology = global_vars.get_topology() # Try to get current lab topology + if topology is None: # pragma: no-cover + log.fatal('Calling node_or_group validation before the topology has been initialized') + + if value not 
in topology.nodes and value not in topology.get('groups',{}): + return { + '_type': "node or group", + '_value': f"valid node or group name (found {value})", + '_hint_id': "node_or_group", + '_hint': "Valid node or group names are "+", ".join(list(topology.nodes) + list(topology.get('groups',{}))) + } + + return { '_valid': True } + @type_test() def must_be_r_proto(value: typing.Any) -> dict: if not isinstance(value,str): diff --git a/netsim/extra/multiserver/defaults.yml b/netsim/extra/multiserver/defaults.yml index bd68aaa3b4..b800cb537e 100644 --- a/netsim/extra/multiserver/defaults.yml +++ b/netsim/extra/multiserver/defaults.yml @@ -15,33 +15,54 @@ attributes: global: multiserver: servers: - type: list + type: dict _subtype: id: type: int - _required: True host: type: str _required: True - groups: list - members: list + groups: + type: list + _subtype: group_id + members: + type: list + _subtype: node_id vxlan_dev: str + weight: + type: int + _min: 1 vxlan: vni_base: int dstport: int dev: str + auto_start: bool assignment: type: str valid_values: [explicit, auto] - replicate: list + replicate: + type: list + _subtype: node_or_group output_dir: type: str + copy_dirs: list + copy_files: list + extra_copy_dirs: list + extra_copy_files: list multiserver: vxlan: vni_base: 10000 dstport: 4789 dev: ens33 + # When true, 'netlab up --snapshot' auto-runs vxlan-setup.sh via a CLI hook. + # Set false to keep cross-server tunnels inactive until you run the script + # manually (e.g. to stage convergence or connect servers on your own schedule). 
+ auto_start: true assignment: explicit replicate: [] - output_dir: "server-{server_id}" + output_dir: "server-{server_name}" + # Subdirectories always copied into every server directory + copy_dirs: [ group_vars, templates ] + # Top-level files always copied into every server directory + copy_files: [ ansible.cfg ] diff --git a/netsim/extra/multiserver/plugin.py b/netsim/extra/multiserver/plugin.py index e246f87662..7bb4e1570f 100644 --- a/netsim/extra/multiserver/plugin.py +++ b/netsim/extra/multiserver/plugin.py @@ -14,7 +14,9 @@ from box import Box from netsim.data import append_to_list -from netsim.utils import log +from netsim.modules import _dataplane +from netsim.utils import files as _files +from netsim.utils import log, templates _execute_after = ["fabric", "node.clone"] @@ -33,24 +35,10 @@ def init(topology: Box) -> None: defaults = topology.defaults.get("multiserver", Box({})) topology.multiserver = defaults + ms ms = topology.multiserver - servers = ms.get("servers", []) - - provider = topology.get("provider", "") or topology.defaults.get("provider", "") - if provider and provider != "clab": - log.error( - f'multiserver plugin currently supports only the "clab" provider, not "{provider}"', - log.IncorrectValue, - "multiserver", - more_hints=["libvirt and virtualbox support may be added in a future release"], - ) - return + servers = ms.get("servers", Box({})) if not servers: - log.error('multiserver plugin requires a "servers" list', log.MissingValue, "multiserver") - return - - if len(servers) < 2: - log.error("multiserver plugin requires at least 2 servers", log.IncorrectValue, "multiserver") + log.error('multiserver plugin requires a "servers" dictionary', log.MissingValue, "multiserver") return _validate_servers(servers) @@ -70,7 +58,17 @@ def post_transform(topology: Box) -> None: if not ms: return - server_map = {s.id: s for s in ms.servers} + # topology.provider is guaranteed to be set by post_transform time + if topology.provider != "clab": + 
log.error( + f'multiserver plugin currently supports only the "clab" provider, not "{topology.provider}"', + log.IncorrectValue, + "multiserver", + more_hints=["libvirt and virtualbox support may be added in a future release"], + ) + return + + server_map = {s.id: s for s in ms.servers.values()} replicated = _resolve_replicated(ms, topology) assignment = _resolve_assignments(ms.servers, topology) @@ -95,11 +93,12 @@ def post_transform(topology: Box) -> None: cross_count = _classify_links(topology, assignment, replicated, vni_base) log.exit_on_error() - topology._multiserver = Box({ + # Assigning a plain dict auto-converts to Box (default_box=True, box_dots=True) + topology._multiserver = { "assignment": assignment, "server_map": server_map, "replicated": sorted(replicated), - }) + } _log_assignment_summary(ms, assignment, replicated, topology, vni_base, cross_count) @@ -118,116 +117,186 @@ def output(topology: Box) -> None: assignment = ms_data.assignment server_map = ms_data.server_map vxlan_cfg = ms.vxlan - out_tpl = ms.get("output_dir", "server-{server_id}") + out_tpl = ms.get("output_dir", "server-{server_name}") replicated = set(ms_data.get("replicated", [])) server_folders = [] - for server in ms.servers: + for sname, server in ms.servers.items(): sid = server.id local_nodes = {n for n, s in assignment.items() if s == sid} | replicated if not local_nodes: continue - out_dir = out_tpl.format(name=topology.name, server_id=sid) + out_dir = out_tpl.format(name=topology.name, server_name=sname, server_id=sid) server_folders.append((out_dir, local_nodes)) if Path(out_dir).exists(): shutil.rmtree(out_dir) Path(out_dir).mkdir(parents=True, exist_ok=True) - clab_dict, vxlan_tunnels = _build_server_clab(topology, local_nodes, sid, server_map, vxlan_cfg) - - # Write clab.yml - with open(Path(out_dir) / "clab.yml", "w") as f: - yaml.dump(clab_dict, f, default_flow_style=False, sort_keys=False, indent=2) + topo_copy, vxlan_tunnels = _build_server_topo(topology, 
local_nodes, sid, server_map, vxlan_cfg) - # Write filtered snapshot so 'netlab up --snapshot' works per-server - _write_server_snapshot(topology, local_nodes, out_dir) + # Write clab.yml via the standard clab.j2 template + search_path = _files.get_search_path("clab", pkg_path_component="templates/provider/clab") + clab_text = templates.render_template( + data=topo_copy.to_dict(), + j2_file="clab.j2", + extra_path=search_path, + ) + (Path(out_dir) / "clab.yml").write_text(clab_text) - # Generate VXLAN setup/teardown scripts for bridge tunnels + # Generate VXLAN setup/teardown scripts for multi-access bridge tunnels. + # Register CLI hooks inside this server's snapshot so 'netlab up --snapshot' + # (run from inside server-/ on the remote host) executes them automatically. + # The hooks must live in topo_copy — they fire on the remote server, not the + # control node, which only runs 'netlab create'. if vxlan_tunnels: dev = server.get("vxlan_dev", "") or vxlan_cfg.get("dev", "") if not dev: log.error( - f"Server {sid} has multi-access cross-server links but no VXLAN device is configured", + f'Server "{sname}" has multi-access cross-server links but no VXLAN device is configured', log.MissingValue, "multiserver", more_hints=["Set multiserver.vxlan.dev or multiserver.servers[].vxlan_dev"], ) continue _write_vxlan_scripts(out_dir, vxlan_tunnels, dev) - - link_count = len(clab_dict.get("topology", {}).get("links", [])) + # Auto-run the tunnel scripts via CLI hooks unless the user opted out. + # When auto_start is false the scripts are still written, but the user + # must run them manually (e.g. to stage cross-server convergence). + if vxlan_cfg.get("auto_start", True): + topo_copy.defaults.netlab.up.post_start_clab = "bash vxlan-setup.sh" + topo_copy.defaults.netlab.down.pre_stop_clab = "bash vxlan-teardown.sh" + + # Write filtered snapshot so 'netlab up --snapshot' works per-server. + # Done after the hooks above so they are baked into the snapshot. 
+ _write_server_snapshot(topo_copy, out_dir) + + link_count = len(topo_copy.get("links", [])) vx_count = len(vxlan_tunnels) parts = [f"{len(local_nodes)} nodes", f"{link_count} links"] if vx_count: parts.append(f"{vx_count} VXLAN tunnels") - log.info(f"Server {sid}: {out_dir}/ — {', '.join(parts)}", module="multiserver") + log.info(f'Server "{sname}": {out_dir}/ — {", ".join(parts)}', module="multiserver") # Register atexit handler to copy node_files, host_vars, etc. into each server # folder after netlab writes all output files. if server_folders: import atexit - atexit.register(_distribute_files_atexit, os.getcwd(), server_folders) + copy_dirs = list(ms.get("copy_dirs", [])) + list(ms.get("extra_copy_dirs", [])) + copy_files = list(ms.get("copy_files", [])) + list(ms.get("extra_copy_files", [])) + atexit.register(_distribute_files_atexit, os.getcwd(), server_folders, copy_dirs, copy_files) -def _distribute_files_atexit(lab_folder: str, server_folders: list) -> None: - """Distribute generated files""" + +def _distribute_files_atexit(lab_folder: str, server_folders: list, copy_dirs: list, copy_files: list) -> None: + """Distribute generated files into per-server directories.""" lab_path = Path(lab_folder) nf_dir = lab_path / "node_files" hv_dir = lab_path / "host_vars" - server_names = {Path(sf).name for sf, _ in server_folders} for sf, local_nodes in server_folders: sf_path = Path(sf) if not sf_path.is_dir(): continue - # node_files: per-node dirs + shared files (names starting with -) + # node_files: per-node dirs + shared entries (names starting with -) + # Always replace to avoid stale files from a previous run. 
if nf_dir.is_dir(): dst_nf = sf_path / "node_files" - dst_nf.mkdir(exist_ok=True) + if dst_nf.exists(): + shutil.rmtree(dst_nf) + dst_nf.mkdir() for item in nf_dir.iterdir(): if item.name in local_nodes or item.name.startswith("-"): - _copy_if_missing(item, dst_nf / item.name) + _copy(item, dst_nf / item.name) # host_vars: per-node only if hv_dir.is_dir(): dst_hv = sf_path / "host_vars" - dst_hv.mkdir(exist_ok=True) + if dst_hv.exists(): + shutil.rmtree(dst_hv) + dst_hv.mkdir() for item in hv_dir.iterdir(): if item.name in local_nodes: - _copy_if_missing(item, dst_hv / item.name) - - # Copy all other subdirectories (e.g. group_vars, templates, monitoring) - # excluding server folders, node_files, host_vars, and python/git metadata. - skip = server_names | {"node_files", "host_vars", "__pycache__", ".git"} - for item in lab_path.iterdir(): - if not item.is_dir() or item.name in skip: - continue - # Only copy grafana directory if this server hosts the grafana node - if item.name == "grafana" and "grafana" not in local_nodes: - continue - _copy_if_missing(item, sf_path / item.name) - - # Ansible inventory and config - for fname in ("ansible.cfg", "hosts.yml"): + _copy(item, dst_hv / item.name) + + # Configurable subdirectories (group_vars, templates, …) + for dname in copy_dirs: + src = lab_path / dname + if src.is_dir(): + dst = sf_path / dname + if dst.exists(): + shutil.rmtree(dst) + shutil.copytree(src, dst) + + # Configurable top-level files (ansible.cfg, …) + for fname in copy_files: src = lab_path / fname if src.exists(): - _copy_if_missing(src, sf_path / fname) + shutil.copy2(src, sf_path / fname) + + # Ansible inventory: copy hosts.yml filtered to local nodes only + _write_filtered_inventory(lab_path / "hosts.yml", sf_path / "hosts.yml", local_nodes) -def _copy_if_missing(src: Path, dst: Path) -> None: - if dst.exists(): +def _write_filtered_inventory(src: Path, dst: Path, local_nodes: set) -> None: + """Write a hosts.yml containing only the nodes assigned to 
this server. + + A filtered inventory prevents 'netlab initial' on the remote server from + attempting to configure nodes that live on other servers. + """ + if not src.exists(): return + try: + with open(src) as f: + inv = yaml.safe_load(f) + if not isinstance(inv, dict): + shutil.copy2(src, dst) + return + + # Prune every 'hosts' dict to only local nodes + def _prune(group: dict) -> None: + if "hosts" in group and isinstance(group["hosts"], dict): + group["hosts"] = {k: v for k, v in group["hosts"].items() if k in local_nodes} + for child in group.get("children", {}).values(): + if isinstance(child, dict): + _prune(child) + + for grp in inv.values(): + if isinstance(grp, dict): + _prune(grp) + + with open(dst, "w") as f: + yaml.dump(inv, f, default_flow_style=False) + except Exception as e: + # Falling back to the unfiltered inventory means 'netlab initial' on the + # remote server may try to configure nodes that live on other servers, so + # make the failure visible instead of silently degrading. + log.error( + f"Could not filter Ansible inventory {src} -> {dst}: {e}", + log.IncorrectValue, + "multiserver", + more_hints=["Copied the unfiltered hosts.yml instead; it may list nodes on other servers"], + ) + shutil.copy2(src, dst) + + +def _copy(src: Path, dst: Path) -> None: try: if src.is_dir(): shutil.copytree(src, dst) else: shutil.copy2(src, dst) - except Exception: - pass + except Exception as e: + # node_files/host_vars carry the per-node configs; a silent failure here + # leaves an incomplete server directory, so surface it instead of swallowing. 
+ log.error( + f"Could not copy {src} -> {dst}: {e}", + log.IncorrectValue, + "multiserver", + ) # =========================================================================== @@ -235,70 +304,76 @@ def _copy_if_missing(src: Path, dst: Path) -> None: # =========================================================================== -def _validate_servers(servers: list) -> None: - seen_ids: set = set() - for idx, s in enumerate(servers): +def _validate_servers(servers: Box) -> None: + # servers is a dict keyed by server name, so duplicate names are impossible by construction. + # Mixed static/auto ID assignment uses the same _dataplane pattern as VLANs/VRFs. + _dataplane.create_id_set("multiserver_server") + _dataplane.extend_id_set( + "multiserver_server", _dataplane.build_id_set(Box({"servers": servers}), "servers", "id", "multiserver") + ) + _dataplane.set_id_counter("multiserver_server", 1, max_value=65535) + + for name, s in servers.items(): if "id" not in s: - log.error(f'Server entry #{idx + 1} missing required "id" field', log.MissingValue, "multiserver") - continue + s.id = _dataplane.get_next_id("multiserver_server") if "host" not in s: - log.error(f'Server {s.id} missing required "host" field', log.MissingValue, "multiserver") - continue - if s.id in seen_ids: - log.error(f"Duplicate server id {s.id}", log.IncorrectValue, "multiserver") - seen_ids.add(s.id) + log.error(f'Server "{name}" missing required "host" field', log.MissingValue, "multiserver") def _resolve_replicated(ms: Box, topology: Box) -> set: + # Each entry's validity (node or group name) is checked by the schema + # (node_or_group subtype on multiserver.replicate). Here we only expand + # group references into their members. Called from post_transform where + # group members are already resolved. 
replicated: set = set() for entry in ms.get("replicate", []): if entry in topology.nodes: replicated.add(entry) - elif entry in topology.get("groups", {}): + else: for member in topology.groups[entry].get("members", []): replicated.add(member) - else: - log.error(f'multiserver.replicate: "{entry}" is not a node or group', log.IncorrectValue, "multiserver") return replicated -def _resolve_assignments(servers: list, topology: Box) -> dict: +def _resolve_assignments(servers: Box, topology: Box) -> dict: + # Group and node name existence is validated by the schema (group_id/node_id + # subtypes on multiserver.servers[].groups/.members), so we only need to expand + # references and catch double-assignment conflicts here. assignment: dict = {} - for server in servers: - for gname in server.get("groups", []): - grp = topology.get("groups", {}).get(gname, None) - if grp is None: - log.error(f'Server {server.id} references unknown group "{gname}"', log.IncorrectValue, "multiserver") - continue - for member in grp.get("members", []): - if member in assignment and assignment[member] != server.id: - log.error( - f"Node {member} assigned to both server {assignment[member]} and {server.id}", - log.IncorrectValue, - "multiserver", - ) - assignment[member] = server.id + def _assign(member: str, server: Box, sname: str) -> None: + if member in assignment and assignment[member] != server.id: + log.error( + f'Node {member} assigned to both server {assignment[member]} and "{sname}"', + log.IncorrectValue, + "multiserver", + ) + assignment[member] = server.id + + for sname, server in servers.items(): + for gname in server.get("groups", []): + for member in topology.groups[gname].get("members", []): + _assign(member, server, sname) for member in server.get("members", []): - if member not in topology.nodes: - log.error(f'Server {server.id} references unknown node "{member}"', log.IncorrectValue, "multiserver") - continue - if member in assignment and assignment[member] != server.id: - 
log.error( - f"Node {member} assigned to both server {assignment[member]} and {server.id}", - log.IncorrectValue, - "multiserver", - ) - assignment[member] = server.id + _assign(member, server, sname) return assignment def _auto_distribute(unassigned: set, server_map: dict, assignment: dict, topology: Box) -> None: - """Distribute unassigned nodes across servers, keeping netlab groups together.""" + """Distribute unassigned nodes across servers, keeping netlab groups together. + + Load is measured as (assigned node count) / weight, where weight defaults to 1. + A server with weight=2 absorbs twice as many nodes before being considered + "as loaded" as a server with weight=1. + """ sorted_sids = sorted(server_map.keys()) + weights = {sid: max(1, int(server_map[sid].get("weight", 1))) for sid in sorted_sids} counts = {sid: sum(1 for s in assignment.values() if s == sid) for sid in sorted_sids} + def _load(sid: int) -> float: + return counts[sid] / weights[sid] + # Build group buckets: keep group members together, distribute largest groups first claimed: set = set() group_buckets: list = [] @@ -310,14 +385,14 @@ def _auto_distribute(unassigned: set, server_map: dict, assignment: dict, topolo group_buckets.sort(key=lambda g: -len(g)) for members in group_buckets: - target = min(sorted_sids, key=lambda s: counts[s]) + target = min(sorted_sids, key=_load) for m in members: assignment[m] = target counts[target] += len(members) # Remaining ungrouped nodes: one by one to least-loaded server for name in sorted(unassigned - claimed): - target = min(sorted_sids, key=lambda s: counts[s]) + target = min(sorted_sids, key=_load) assignment[name] = target counts[target] += 1 @@ -327,11 +402,15 @@ def _classify_links(topology: Box, assignment: dict, replicated: set, vni_base: vni = vni_base for link in topology.links: link_servers = { - assignment[i.node] - for i in link.get("interfaces", []) - if i.node not in replicated and i.node in assignment + assignment[i.node] for i in 
link.get("interfaces", []) if i.node not in replicated and i.node in assignment } - if len(link_servers) > 1: + # A link with a physical uplink (clab.uplink) attaches to the external lab + # network on EVERY server via that server's own NIC, so the physical fabric + # already provides cross-server connectivity. Tunneling it would add a + # redundant path and an L2 loop (uplink + VXLAN on the same bridge, no STP) + # → broadcast storm. Never VXLAN uplink bridges; just keep them local. + has_uplink = bool(link.get("clab", {}).get("uplink")) + if len(link_servers) > 1 and not has_uplink: link._ms = Box({"cross": True, "vni": vni, "servers": sorted(link_servers)}) vni += 1 else: @@ -346,7 +425,7 @@ def _classify_links(topology: Box, assignment: dict, replicated: set, vni_base: def _log_assignment_summary( ms: Box, assignment: dict, replicated: set, topology: Box, vni_base: int, cross_count: int ) -> None: - for server in ms.servers: + for sname, server in ms.servers.items(): sid = server.id server_nodes = sorted(n for n, s in assignment.items() if s == sid) n = len(server_nodes) @@ -361,7 +440,7 @@ def _log_assignment_summary( if on_this and len(on_this) == len(assigned): server_groups.append(gname) - log.info(f"Server {sid} ({server.host}): {n} nodes", module="multiserver") + log.info(f'Server "{sname}" ({server.host}): {n} nodes', module="multiserver") if server_groups: preview = server_groups[:8] suffix = f" ... 
+{len(server_groups) - 8} more" if len(server_groups) > 8 else "" @@ -374,294 +453,96 @@ def _log_assignment_summary( if replicated: log.info(f"Replicated on all servers: {', '.join(sorted(replicated))}", module="multiserver") if cross_count: - log.info( - f"{cross_count} cross-server links (VNI {vni_base}–{vni_base + cross_count - 1})", module="multiserver" - ) + log.info(f"{cross_count} cross-server links (VNI {vni_base}–{vni_base + cross_count - 1})", module="multiserver") # =========================================================================== -# Internal helpers — clab.yml generation +# Internal helpers — per-server topology filtering # =========================================================================== -def _to_plain(obj: object) -> object: - """Convert Box/BoxList to plain dict/list for clean YAML serialization.""" - if isinstance(obj, Box): - return {k: _to_plain(v) for k, v in obj.items()} - if isinstance(obj, list): - return [_to_plain(v) for v in obj] - return obj - - -def _intf_clab_name(intf: Box) -> str: - """Containerlab interface name for a node interface.""" - return intf.get("clab", {}).get("name", "") or intf.get("ifname", "") - - -def _build_clab_node(nname: str, ndata: Box, topology: Box) -> dict: - """Reconstruct a clab.yml node entry from the transformed topology data.""" - entry: dict = {} - clab = ndata.get("clab", Box({})) - - # Management IPs - nm = clab.get("network-mode", "") - if nm != "none": - if ndata.get("mgmt", {}).get("ipv4"): - entry["mgmt-ipv4"] = str(ndata.mgmt.ipv4) - if ndata.get("mgmt", {}).get("ipv6"): - entry["mgmt-ipv6"] = str(ndata.mgmt.ipv6) - - kind = clab.get("kind", "") or ndata.get("device", "") - entry["kind"] = kind - if kind == "linux" and "restart-policy" not in clab: - entry["restart-policy"] = "no" - - # Pass through standard clab node attributes - special = set(topology.defaults.providers.clab.get("node_config_special", [])) - for attr in topology.defaults.providers.clab.get("attributes", 
{}).get("node", {}).get("_keys", []): - if attr in clab and attr not in special: - entry[attr] = _to_plain(clab[attr]) - - # srl-agents goes under extras: (matches clab.j2 template) - if "srl-agents" in clab: - entry["extras"] = {"srl-agents": _to_plain(clab["srl-agents"])} - - entry["image"] = str(clab.get("image", "") or ndata.get("box", "")) - entry["runtime"] = str(clab.get("runtime", "") or topology.defaults.providers.clab.get("runtime", "docker")) - - # Groups - if "groups" in topology: - groups = [g for g in topology.groups if nname in topology.groups[g].get("members", [])] - if groups: - entry["group"] = ",".join(groups) - - # Binds — keep paths as-is (relative to the server directory). - # The distribute script copies node_files/ into each server dir, - # so paths like node_files/r1/... work when running from there. - if "binds" in clab: - entry["binds"] = [] - for b in clab.binds: - bind_str = f"{b.source}:{b.target}" - if "mode" in b: - bind_str += f":{b.mode}" - entry["binds"].append(bind_str) - - # Startup config - if "startup-config" in clab: - entry["startup-config"] = str(clab["startup-config"]) - - return entry - - -def _build_server_clab(topology: Box, local_nodes: set, sid: int, server_map: dict, vxlan_cfg: Box) -> tuple: - """Build the clab.yml dict and VXLAN tunnel list for one server.""" +def _build_server_topo(topology: Box, local_nodes: set, sid: int, server_map: dict, vxlan_cfg: Box) -> tuple: + """Return (topo_copy, vxlan_tunnels) for one server. + + topo_copy is a filtered Box ready to pass to clab.j2: + - Only local nodes are kept. + - Each link that has at least one local interface is kept; remote interfaces + are pruned so node_count reflects only what this server sees. + - Cross-server P2P links get link.clab.vxlan annotated so clab.j2 renders + a native VXLAN endpoint instead of a regular veth pair. + - Cross-server bridge links keep their bridge but also produce host-level + VXLAN tunnel entries returned in vxlan_tunnels. 
+ """ dstport = vxlan_cfg.get("dstport", 4789) - multilab_id = topology.defaults.get("multilab", {}).get("id", 0) assignment = topology._multiserver.assignment - clab: dict = { - "name": topology.name, - "prefix": str(topology.defaults.providers.clab.get("lab_prefix", "") or ""), - "mgmt": { - "network": str(topology.addressing.mgmt.get("_network", "") or "netlab_mgmt"), - "ipv4-subnet": str(topology.addressing.mgmt.get("ipv4", "172.20.20.0/24")), - }, - "topology": { - "nodes": {}, - "links": [], - }, - } - - mgmt_bridge = topology.addressing.mgmt.get("_bridge", "") - if mgmt_bridge: - clab["mgmt"]["bridge"] = str(mgmt_bridge) - if topology.defaults.addressing.mgmt.get("ipv6"): - clab["mgmt"]["ipv6-subnet"] = str(topology.defaults.addressing.mgmt.ipv6) + topo_copy = Box(topology, box_dots=True, default_box=True) + topo_copy.nodes = Box({n: v for n, v in topology.nodes.items() if n in local_nodes}, box_dots=True) - # --- Nodes --- - for nname, ndata in topology.nodes.items(): - if ndata.get("unmanaged", False): - continue - if nname in local_nodes: - clab["topology"]["nodes"][nname] = _build_clab_node(nname, ndata, topology) - - # --- Links --- - bridges_needed: set = set() vxlan_tunnels: list = [] + filtered_links = [] for link in topology.links: local_intfs = [i for i in link.get("interfaces", []) if i.node in local_nodes] if not local_intfs: continue - is_cross = link.get("_ms", {}).get("cross", False) - node_count = link.get("node_count", len(link.get("interfaces", []))) + lc = Box(link, box_dots=True, default_box=True) + is_cross = link.get("_ms.cross", False) - # ---- Uplink (macvlan) ---- - if link.get("clab", {}).get("uplink", False): - for intf in local_intfs: - clab_name = _intf_clab_name(intf) - clab["topology"]["links"].append({"endpoints": [f"{intf.node}:{clab_name}", f"macvlan:{link.clab.uplink}"]}) - continue - - # ---- Fully local link ---- if not is_cross: - _render_local_link(clab, link, local_intfs, node_count, bridges_needed, multilab_id, 
topology) - continue - - # ---- Cross-server P2P (clab native VXLAN) ---- - if node_count == 2: - _render_p2p_vxlan(clab, link, sid, server_map, local_intfs, assignment, dstport) + # Local link: prune interfaces to just local ones and update node_count. + lc.interfaces = local_intfs + lc.node_count = len(local_intfs) + filtered_links.append(lc) continue - # ---- Cross-server multi-access (bridge + host VXLAN) ---- - _render_bridge_vxlan( - clab, - link, - sid, - server_map, - local_intfs, - assignment, - bridges_needed, - vxlan_tunnels, - dstport, - multilab_id, - topology, - ) - - # --- Bridge nodes --- - bridge_type = str(topology.defaults.providers.clab.get("bridge_type", "bridge")) - for brname in sorted(bridges_needed): - clab["topology"]["nodes"][brname] = {"kind": bridge_type} - - if not clab["topology"]["links"]: - del clab["topology"]["links"] - - return clab, vxlan_tunnels - - -def _render_local_link( - clab: dict, link: Box, local_intfs: list, node_count: int, bridges_needed: set, multilab_id: int, topology: Box -) -> None: - """Render a fully-local link (all endpoints on the same server).""" - - # Stub link - if node_count == 1 and local_intfs: - intf = local_intfs[0] - clab["topology"]["links"].append( - { - "type": "dummy", - "endpoint": {"node": intf.node, "interface": _intf_clab_name(intf)}, - } - ) - return - - # P2P link - if node_count == 2: - endpoints = [f"{i.node}:{_intf_clab_name(i)}" for i in local_intfs] - if len(endpoints) == 2: - clab["topology"]["links"].append({"endpoints": endpoints}) - return + vni = link._ms.vni + node_count = link.get("node_count", len(link.get("interfaces", []))) - # Multi-access link (bridge) - if node_count > 2 and link.get("bridge"): - bridge = link.bridge - if not link.get("clab", {}).get("external_bridge", False): - bridges_needed.add(bridge) - for intf in local_intfs: - ndata = topology.nodes[intf.node] - bridge_intf = f"bni{multilab_id}n{ndata.id}i{intf.ifindex}" - clab["topology"]["links"].append( + if 
node_count == 2: + # Cross-server P2P: keep only the local interface, annotate clab.vxlan. + # clab.j2 sees node_count==2 and l.clab.vxlan defined → renders VXLAN endpoint. + remote_sid = next( + (assignment[i.node] for i in link.get("interfaces", []) if assignment.get(i.node) not in (None, sid)), + None, + ) + if remote_sid is None: + continue + lc.interfaces = local_intfs + lc.node_count = 2 + lc.clab.vxlan = Box( { - "endpoints": [ - f"{intf.node}:{_intf_clab_name(intf)}", - f"{bridge}:{bridge_intf}", - ] + "vni": vni, + "remote": str(server_map[remote_sid].host), + "dstport": dstport, } ) - - -def _render_p2p_vxlan( - clab: dict, link: Box, local_sid: int, server_map: dict, local_intfs: list, assignment: dict, dstport: int -) -> None: - """Render a P2P cross-server link as a containerlab native VXLAN endpoint.""" - if not local_intfs: - return - - vni = link._ms.vni - local_intf = local_intfs[0] - - # Find the remote server - remote_sid = None - for intf in link.get("interfaces", []): - s = assignment.get(intf.node) - if s is not None and s != local_sid: - remote_sid = s - break - - if remote_sid is None: - return - - clab_name = _intf_clab_name(local_intf) - clab["topology"]["links"].append( - { - "endpoints": [ - f"{local_intf.node}:{clab_name}", - f"host:vx{vni}", - ], - "type": "vxlan", - "remote": str(server_map[remote_sid].host), - "vni": vni, - "udp-port": dstport, - } - ) - - -def _render_bridge_vxlan( - clab: dict, - link: Box, - local_sid: int, - server_map: dict, - local_intfs: list, - assignment: dict, - bridges_needed: set, - vxlan_tunnels: list, - dstport: int, - multilab_id: int, - topology: Box, -) -> None: - """Render a multi-access cross-server link: local bridge + host VXLAN tunnels.""" - vni = link._ms.vni - bridge = link.get("bridge", f"br{link.linkindex}") - - if not link.get("clab", {}).get("external_bridge", False): - bridges_needed.add(bridge) - - # Local node-to-bridge connections - for intf in local_intfs: - ndata = 
topology.nodes[intf.node] - bridge_intf = f"bni{multilab_id}n{ndata.id}i{intf.ifindex}" - clab["topology"]["links"].append( - { - "endpoints": [ - f"{intf.node}:{_intf_clab_name(intf)}", - f"{bridge}:{bridge_intf}", - ] + filtered_links.append(lc) + else: + # Cross-server bridge: prune to local interfaces, let clab.j2 render the bridge. + lc.interfaces = local_intfs + lc.node_count = len(local_intfs) + filtered_links.append(lc) + + # Host-level VXLAN tunnels for the bridge (expressed as shell scripts). + bridge = link.get("bridge", f"br{link.linkindex}") + remote_sids = { + assignment[i.node] for i in link.get("interfaces", []) if assignment.get(i.node) not in (None, sid) } - ) + for rsid in sorted(remote_sids): + vxlan_tunnels.append( + { + "bridge": bridge, + "vni": vni, + "remote": str(server_map[rsid].host), + "dstport": dstport, + "remote_id": rsid, + } + ) - # VXLAN tunnels to each remote server that has endpoints on this link - remote_sids = {assignment[i.node] for i in link.get("interfaces", []) if assignment.get(i.node) not in (None, local_sid)} - for rsid in sorted(remote_sids): - vxlan_tunnels.append( - { - "bridge": bridge, - "vni": vni, - "remote": str(server_map[rsid].host), - "dstport": dstport, - "remote_id": rsid, - } - ) + topo_copy.links = filtered_links + return topo_copy, vxlan_tunnels # --------------------------------------------------------------------------- @@ -669,30 +550,86 @@ def _render_bridge_vxlan( # --------------------------------------------------------------------------- -def _write_server_snapshot(topology: Box, local_nodes: set, out_dir: str) -> None: +def _cwd_relative(entry: str) -> str: + """Mark a search-path entry as explicitly current-directory-relative. 
+ + A bare relative dir like 'templates' is ambiguous to netlab's render_template(): + when such a dir becomes the in_folder of the template being rendered, the + path[0] not in ('.', '/') branch re-bases it onto the netlab install dir + (get_moddir() / 'templates/...'), so '{% import %}' / '{% include %}' lookups + inside that template fail. A bare relative entry also breaks find_file()'s + os.path.join() the moment netlab is run from a different cwd. make_paths_absolute() + normally fixes this during 'netlab create', but a loaded snapshot skips it, so we + anchor relative entries with './' here: render_template() then treats them as + cwd-relative (correct per-server dir) and they stay portable across hosts. + + '~' / absolute / already-'.'-prefixed entries are returned unchanged. An empty + string (from a bare 'topology:') becomes '.'. + """ + if not entry: + return "." + if entry[0] in (".", "/", "~"): + return entry + return "./" + entry + + +def _resolve_snapshot_paths(paths: Box) -> None: + """Make defaults.paths self-contained for 'netlab up --snapshot'. + + netlab's make_paths_absolute() runs in 'netlab create' but NOT when a snapshot + is loaded (load_snapshot just deserializes). Our per-server snapshot is written + in the 'output' plugin hook, which fires BEFORE that step — so without this the + snapshot has unresolved search paths and no f_files, and 'netlab initial' fails + ("list + Box", then "Cannot find ... template"). + + We resolve only what find_file() can't handle at runtime, keeping the snapshot + portable across hosts: + - files/tasks lists -> f_ template-name lists ({{ }} -> { }). + - 'package:' -> absolute install path (get_moddir); portable when netlab is + installed at the same location on every server (the usual case). + - 'topology:' / plain relative -> './' + path via _cwd_relative(), anchoring it + to the per-server working dir (templates/ is copied into each server dir). 
+ The explicit './' is required so render_template() treats it as cwd-relative + instead of re-basing it onto the netlab install dir (see _cwd_relative). + - absolute / '~' entries are left as-is: they resolve identically on any host. + """ + moddir = _files.get_moddir() + for k in list(paths.keys()): + v = paths[k] + if (k.startswith("files") or k.startswith("tasks")) and isinstance(v, list): + paths[f"f_{k}"] = [fn.replace("{{", "{").replace("}}", "}") for fn in v] + elif isinstance(v, list): + resolved = [] + for entry in v: + if "package:" in entry: + resolved.append(str(moddir / entry.replace("package:", ""))) + elif "topology:" in entry: + resolved.append(_cwd_relative(entry.replace("topology:", ""))) + else: + resolved.append(_cwd_relative(entry)) + paths[k] = resolved + elif isinstance(v, Box): + _resolve_snapshot_paths(v) + + +def _write_server_snapshot(topo_copy: Box, out_dir: str) -> None: """Write a filtered netlab snapshot for this server's nodes only. - Allows 'netlab up --snapshot' to work from a per-server directory. - make_paths_absolute() is called here explicitly because output() hooks run - before create.py does it — without it the snapshot is missing f_files/f_tasks/f_dirs. + Allows 'netlab up --snapshot' to work from a per-server directory. Search paths + are made self-contained via _resolve_snapshot_paths() (see there for why and how + portability is preserved). 
""" from netsim import __version__ - from netsim.augment.config import make_paths_absolute from netsim.augment.topology import cleanup_topology - topo_copy = Box(topology, box_dots=True) - - # Filter nodes to only those on this server - topo_copy.nodes = Box({n: v for n, v in topo_copy.nodes.items() if n in local_nodes}, box_dots=True) - - # Filter links to only those with at least one local endpoint - topo_copy.links = [l for l in topo_copy.links if any(i.node in local_nodes for i in l.get("interfaces", []))] + snap = Box(topo_copy, box_dots=True) - # Expand paths (add f_files / f_tasks / f_dirs computed keys). - make_paths_absolute(topo_copy.defaults.paths) + # Snapshot load doesn't rebuild search paths / f_files, so resolve them now. + if "paths" in snap.get("defaults", {}): + _resolve_snapshot_paths(snap.defaults.paths) # Remove prefix generators and serialize - cleaned = cleanup_topology(topo_copy) + cleaned = cleanup_topology(snap) topodict = cleaned.to_dict() topodict["_netlab_version"] = __version__ @@ -702,51 +639,20 @@ def _write_server_snapshot(topology: Box, local_nodes: set, out_dir: str) -> Non def _write_vxlan_scripts(out_dir: str, tunnels: list, dev: str) -> None: """Generate bash scripts to create/destroy host-level VXLAN tunnels.""" - - setup = [ - "#!/bin/bash", - "# VXLAN tunnel setup — generated by netlab multiserver plugin", - "# Run AFTER: sudo clab deploy -t clab.yml", - "#", - "# Creates host-level VXLAN tunnels and attaches them to containerlab bridges.", - "# These tunnels carry multi-access (bridged) cross-server traffic.", - "set -e", - "", - ] - - teardown = [ - "#!/bin/bash", - "# VXLAN tunnel teardown — generated by netlab multiserver plugin", - "# Run BEFORE: sudo clab destroy -t clab.yml", - "set -e", - "", - ] - + # Deduplicate tunnels: same VNI+remote pair should only appear once seen: set = set() + unique_tunnels = [] for t in tunnels: - vx_name = f"vxlan{t['vni']}" - key = (vx_name, t["remote"]) - if key in seen: - continue - 
seen.add(key) - - setup.extend( - [ - f"# VNI {t['vni']} -> {t['remote']} (server {t['remote_id']}) via bridge {t['bridge']}", - f"ip link add {vx_name} type vxlan id {t['vni']} remote {t['remote']} dev {dev} dstport {t['dstport']}", - f"ip link set {vx_name} master {t['bridge']}", - f"ip link set {vx_name} up", - f'echo " {vx_name} -> {t["bridge"]} (remote {t["remote"]})"', - "", - ] - ) - - teardown.append(f'ip link del {vx_name} 2>/dev/null && echo " deleted {vx_name}" || true') - - setup.append('echo "VXLAN setup complete."') - teardown.extend(["", 'echo "VXLAN teardown complete."']) - - for name, lines in [("vxlan-setup.sh", setup), ("vxlan-teardown.sh", teardown)]: - path = Path(out_dir) / name - path.write_text("\n".join(lines) + "\n") + key = (t["vni"], t["remote"]) + if key not in seen: + seen.add(key) + unique_tunnels.append(t) + + tpl_dir = str(Path(__file__).parent) + tpl_data = {"tunnels": unique_tunnels, "dev": dev} + + for script, tpl in [("vxlan-setup.sh", "vxlan-setup.j2"), ("vxlan-teardown.sh", "vxlan-teardown.j2")]: + text = templates.render_template(data=tpl_data, j2_file=tpl, path=tpl_dir) + path = Path(out_dir) / script + path.write_text(text) os.chmod(path, 0o755) diff --git a/netsim/extra/multiserver/vxlan-setup.j2 b/netsim/extra/multiserver/vxlan-setup.j2 new file mode 100644 index 0000000000..bacafb1438 --- /dev/null +++ b/netsim/extra/multiserver/vxlan-setup.j2 @@ -0,0 +1,19 @@ +#!/bin/bash +# VXLAN tunnel setup — generated by netlab multiserver plugin +# Run AFTER: sudo clab deploy -t clab.yml +# +# Creates host-level VXLAN tunnels and attaches them to containerlab bridges. +# These tunnels carry multi-access (bridged) cross-server traffic. +set -e + +{% for t in tunnels %} +# VNI {{ t.vni }} -> {{ t.remote }} (server {{ t.remote_id }}) via bridge {{ t.bridge }} +# Remove any stale device first so a leftover tunnel can't abort the run. 
+ip link del vxlan{{ t.vni }} 2>/dev/null || true +ip link add vxlan{{ t.vni }} type vxlan id {{ t.vni }} remote {{ t.remote }} dev {{ dev }} dstport {{ t.dstport }} +ip link set vxlan{{ t.vni }} master {{ t.bridge }} +ip link set vxlan{{ t.vni }} up +echo " vxlan{{ t.vni }} -> {{ t.bridge }} (remote {{ t.remote }})" + +{% endfor %} +echo "VXLAN setup complete." diff --git a/netsim/extra/multiserver/vxlan-teardown.j2 b/netsim/extra/multiserver/vxlan-teardown.j2 new file mode 100644 index 0000000000..06650e5cee --- /dev/null +++ b/netsim/extra/multiserver/vxlan-teardown.j2 @@ -0,0 +1,10 @@ +#!/bin/bash +# VXLAN tunnel teardown — generated by netlab multiserver plugin +# Run BEFORE: sudo clab destroy -t clab.yml +set -e + +{% for t in tunnels %} +ip link del vxlan{{ t.vni }} 2>/dev/null && echo " deleted vxlan{{ t.vni }}" || true +{% endfor %} + +echo "VXLAN teardown complete." diff --git a/netsim/templates/provider/clab/clab.j2 b/netsim/templates/provider/clab/clab.j2 index bc08b61700..c2a2448a5d 100644 --- a/netsim/templates/provider/clab/clab.j2 +++ b/netsim/templates/provider/clab/clab.j2 @@ -98,16 +98,34 @@ topology: {% endfor %} {% endfor %} {% elif l.node_count == 2 %} +{% if l.clab is defined and l.clab.vxlan is defined %} +{# + Cross-server P2P link via containerlab native VXLAN endpoint. + Only the local interface appears in l.interfaces (remote was pruned by the multiserver plugin). 
+#} +{% for n in nodes.values() if not (n.unmanaged|default(False)) %} +{% for nl in n.interfaces|default([]) if nl.linkindex|default(0) == l.linkindex %} + - endpoints: + - "{{ n.name }}:{{ nl.clab.name|default(nl.ifname) }}" + - "host:vx{{ l.clab.vxlan.vni }}" + type: vxlan + remote: "{{ l.clab.vxlan.remote }}" + vni: {{ l.clab.vxlan.vni }} + udp-port: {{ l.clab.vxlan.dstport }} +{% endfor %} +{% endfor %} +{% else %} {# point-to-point link between two containers #} - endpoints: -{% for n in nodes.values() if not (n.unmanaged|default(False)) %} -{% for nl in n.interfaces|default([]) if nl.linkindex|default(0) == l.linkindex %} -{% set clab = nl.clab|default({}) %} +{% for n in nodes.values() if not (n.unmanaged|default(False)) %} +{% for nl in n.interfaces|default([]) if nl.linkindex|default(0) == l.linkindex %} +{% set clab = nl.clab|default({}) %} - "{{ n.name }}:{{ clab.name|default(nl.ifname) }}" +{% endfor %} {% endfor %} -{% endfor %} +{% endif %} {% else %} {# link with more or less than two endpoints, or a multi-provider link diff --git a/tests/topology/expected/multiserver-auto.yml b/tests/topology/expected/multiserver-auto.yml index 85559675e2..448333a71e 100644 --- a/tests/topology/expected/multiserver-auto.yml +++ b/tests/topology/expected/multiserver-auto.yml @@ -104,15 +104,23 @@ links: type: p2p multiserver: assignment: auto - output_dir: server-{server_id} + copy_dirs: + - group_vars + - templates + copy_files: + - ansible.cfg + output_dir: server-{server_name} replicate: - mon_srv servers: - - host: 192.168.128.1 - id: 1 - - host: 192.168.128.2 - id: 2 + srv1: + host: 192.168.128.1 + id: 1 + srv2: + host: 192.168.128.2 + id: 2 vxlan: + auto_start: true dev: ens33 dstport: 4789 vni_base: 10000 diff --git a/tests/topology/expected/multiserver-explicit.yml b/tests/topology/expected/multiserver-explicit.yml index 70fcb7beec..d3663d3498 100644 --- a/tests/topology/expected/multiserver-explicit.yml +++ b/tests/topology/expected/multiserver-explicit.yml 
@@ -1,9 +1,11 @@ _multiserver: assignment: + frr1: 1 s1_g1: 1 s1_member: 1 s2_g1: 2 s2_member: 2 + srl1: 1 replicated: [] server_map: 1: @@ -13,6 +15,8 @@ _multiserver: id: 1 members: - s1_member + - srl1 + - frr1 vxlan_dev: eth1 2: groups: @@ -53,6 +57,25 @@ links: ipv4: 10.1.0.0/30 type: p2p - _linkname: links[2] + _ms: + cross: false + servers: + - 1 + interfaces: + - ifindex: 1 + ifname: ethernet-1/1 + ipv4: 10.1.0.6/30 + node: srl1 + - ifindex: 1 + ifname: eth1 + ipv4: 10.1.0.5/30 + node: frr1 + linkindex: 2 + node_count: 2 + prefix: + ipv4: 10.1.0.4/30 + type: p2p +- _linkname: links[3] _ms: cross: false servers: @@ -60,18 +83,18 @@ links: interfaces: - ifindex: 1 ifname: Ethernet1 - ipv4: 10.1.0.5/30 + ipv4: 10.1.0.9/30 node: s2_g1 - ifindex: 1 ifname: Ethernet1 - ipv4: 10.1.0.6/30 + ipv4: 10.1.0.10/30 node: s2_member - linkindex: 2 + linkindex: 3 node_count: 2 prefix: - ipv4: 10.1.0.4/30 + ipv4: 10.1.0.8/30 type: p2p -- _linkname: links[3] +- _linkname: links[4] _ms: cross: true servers: @@ -81,25 +104,46 @@ links: interfaces: - ifindex: 2 ifname: Ethernet2 - ipv4: 10.1.0.9/30 + ipv4: 10.1.0.13/30 node: s1_g1 - ifindex: 2 ifname: Ethernet2 - ipv4: 10.1.0.10/30 + ipv4: 10.1.0.14/30 node: s2_g1 - linkindex: 3 + linkindex: 4 node_count: 2 prefix: - ipv4: 10.1.0.8/30 + ipv4: 10.1.0.12/30 type: p2p -- _linkname: links[4] +- _linkname: links[5] _ms: cross: true servers: - 1 - 2 vni: 20001 - bridge: input_4 + interfaces: + - ifindex: 2 + ifname: ethernet-1/2 + ipv4: 10.1.0.18/30 + node: srl1 + - ifindex: 3 + ifname: Ethernet3 + ipv4: 10.1.0.17/30 + node: s2_g1 + linkindex: 5 + node_count: 2 + prefix: + ipv4: 10.1.0.16/30 + type: p2p +- _linkname: links[6] + _ms: + cross: true + servers: + - 1 + - 2 + vni: 20002 + bridge: input_6 interfaces: - ifindex: 2 ifname: Ethernet2 @@ -107,42 +151,110 @@ links: node: s1_member - ifindex: 2 ifname: Ethernet2 - ipv4: 172.16.0.4/24 + ipv4: 172.16.0.6/24 node: s2_member - ifindex: 3 ifname: Ethernet3 ipv4: 172.16.0.1/24 node: 
s1_g1 - linkindex: 4 + linkindex: 6 node_count: 3 prefix: ipv4: 172.16.0.0/24 type: lan multiserver: assignment: explicit - output_dir: server-{server_id} + copy_dirs: + - group_vars + - templates + copy_files: + - ansible.cfg + output_dir: server-{server_name} replicate: [] servers: - - groups: - - server1_nodes - host: 192.168.128.1 - id: 1 - members: - - s1_member - vxlan_dev: eth1 - - groups: - - server2_nodes - host: 192.168.128.2 - id: 2 - members: - - s2_member - vxlan_dev: eth1 + srv1: + groups: + - server1_nodes + host: 192.168.128.1 + id: 1 + members: + - s1_member + - srl1 + - frr1 + vxlan_dev: eth1 + srv2: + groups: + - server2_nodes + host: 192.168.128.2 + id: 2 + members: + - s2_member + vxlan_dev: eth1 vxlan: + auto_start: true dev: eth1 dstport: 4789 vni_base: 20000 name: input nodes: + frr1: + _node_config: + initial: /etc/config/01-initial.sh:sh + af: + ipv4: true + box: quay.io/frrouting/frr:10.6.1 + clab: + binds: + - source: node_files/frr1/initial + target: /etc/config/01-initial.sh + - source: node_files/frr1/daemons + target: /etc/frr/daemons + - mode: ro + source: node_files/-shared-hosts + target: /etc/hosts + config_templates: + - mode: sh + source: initial + target: /etc/config/01-initial.sh + - source: daemons + target: /etc/frr/daemons + - mode: shared + source: hosts + target: /etc/hosts + exec: + - sleep 1 + kind: linux + device: frr + hostname: clab-input-frr1 + id: 4 + interfaces: + - ifindex: 1 + ifname: eth1 + ipv4: 10.1.0.5/30 + linkindex: 2 + mtu: 1500 + name: frr1 -> srl1 + neighbors: + - ifname: ethernet-1/1 + ipv4: 10.1.0.6/30 + node: srl1 + type: p2p + loopback: + ifindex: 0 + ifname: lo + ipv4: 10.0.0.4/32 + neighbors: [] + type: loopback + virtual_interface: true + mgmt: + ifname: eth0 + ipv4: 192.168.121.104 + mac: ca:fe:00:04:00:00 + mtu: 1500 + name: frr1 + netlab_ansible_skip_module: + - initial + role: router s1_g1: af: ipv4: true @@ -173,22 +285,22 @@ nodes: name: et2 ifindex: 2 ifname: Ethernet2 - ipv4: 10.1.0.9/30 
- linkindex: 3 + ipv4: 10.1.0.13/30 + linkindex: 4 mac_address: caf0.0001.0002 name: s1_g1 -> s2_g1 neighbors: - ifname: Ethernet2 - ipv4: 10.1.0.10/30 + ipv4: 10.1.0.14/30 node: s2_g1 type: p2p - - bridge: input_4 + - bridge: input_6 clab: name: et3 ifindex: 3 ifname: Ethernet3 ipv4: 172.16.0.1/24 - linkindex: 4 + linkindex: 6 mac_address: caf0.0001.0003 name: s1_g1 -> [s1_member,s2_member] neighbors: @@ -196,7 +308,7 @@ nodes: ipv4: 172.16.0.2/24 node: s1_member - ifname: Ethernet2 - ipv4: 172.16.0.4/24 + ipv4: 172.16.0.6/24 node: s2_member type: lan loopback: @@ -238,18 +350,18 @@ nodes: ipv4: 10.1.0.1/30 node: s1_g1 type: p2p - - bridge: input_4 + - bridge: input_6 clab: name: et2 ifindex: 2 ifname: Ethernet2 ipv4: 172.16.0.2/24 - linkindex: 4 + linkindex: 6 mac_address: caf0.0002.0002 name: s1_member -> [s2_member,s1_g1] neighbors: - ifname: Ethernet2 - ipv4: 172.16.0.4/24 + ipv4: 172.16.0.6/24 node: s2_member - ifname: Ethernet3 ipv4: 172.16.0.1/24 @@ -279,45 +391,58 @@ nodes: kind: ceos device: eos hostname: clab-input-s2_g1 - id: 3 + id: 5 interfaces: - clab: name: et1 ifindex: 1 ifname: Ethernet1 - ipv4: 10.1.0.5/30 - linkindex: 2 - mac_address: caf0.0003.0001 + ipv4: 10.1.0.9/30 + linkindex: 3 + mac_address: caf0.0005.0001 name: s2_g1 -> s2_member neighbors: - ifname: Ethernet1 - ipv4: 10.1.0.6/30 + ipv4: 10.1.0.10/30 node: s2_member type: p2p - clab: name: et2 ifindex: 2 ifname: Ethernet2 - ipv4: 10.1.0.10/30 - linkindex: 3 - mac_address: caf0.0003.0002 + ipv4: 10.1.0.14/30 + linkindex: 4 + mac_address: caf0.0005.0002 name: s2_g1 -> s1_g1 neighbors: - ifname: Ethernet2 - ipv4: 10.1.0.9/30 + ipv4: 10.1.0.13/30 node: s1_g1 type: p2p + - clab: + name: et3 + ifindex: 3 + ifname: Ethernet3 + ipv4: 10.1.0.17/30 + linkindex: 5 + mac_address: caf0.0005.0003 + name: s2_g1 -> srl1 + neighbors: + - ifname: ethernet-1/2 + ipv4: 10.1.0.18/30 + node: srl1 + type: p2p loopback: ifindex: 0 ifname: Loopback0 - ipv4: 10.0.0.3/32 + ipv4: 10.0.0.5/32 neighbors: [] type: 
loopback virtual_interface: true mgmt: ifname: Management0 - ipv4: 192.168.121.103 - mac: ca:fe:00:03:00:00 + ipv4: 192.168.121.105 + mac: ca:fe:00:05:00:00 name: s2_g1 role: router s2_member: @@ -331,29 +456,29 @@ nodes: kind: ceos device: eos hostname: clab-input-s2_member - id: 4 + id: 6 interfaces: - clab: name: et1 ifindex: 1 ifname: Ethernet1 - ipv4: 10.1.0.6/30 - linkindex: 2 - mac_address: caf0.0004.0001 + ipv4: 10.1.0.10/30 + linkindex: 3 + mac_address: caf0.0006.0001 name: s2_member -> s2_g1 neighbors: - ifname: Ethernet1 - ipv4: 10.1.0.5/30 + ipv4: 10.1.0.9/30 node: s2_g1 type: p2p - - bridge: input_4 + - bridge: input_6 clab: name: et2 ifindex: 2 ifname: Ethernet2 - ipv4: 172.16.0.4/24 - linkindex: 4 - mac_address: caf0.0004.0002 + ipv4: 172.16.0.6/24 + linkindex: 6 + mac_address: caf0.0006.0002 name: s2_member -> [s1_member,s1_g1] neighbors: - ifname: Ethernet2 @@ -366,16 +491,72 @@ nodes: loopback: ifindex: 0 ifname: Loopback0 - ipv4: 10.0.0.4/32 + ipv4: 10.0.0.6/32 neighbors: [] type: loopback virtual_interface: true mgmt: ifname: Management0 - ipv4: 192.168.121.104 - mac: ca:fe:00:04:00:00 + ipv4: 192.168.121.106 + mac: ca:fe:00:06:00:00 name: s2_member role: router + srl1: + _srl_version: + - 26 + - 3 + af: + ipv4: true + box: ghcr.io/nokia/srlinux:26.3.2 + clab: + binds: + - source: node_files/srl1/hosts + target: /etc/hosts + config_templates: + - source: hosts + target: /etc/hosts + kind: srl + type: ixr-d2 + device: srlinux + hostname: clab-input-srl1 + id: 3 + interfaces: + - clab: + name: e1-1 + ifindex: 1 + ifname: ethernet-1/1 + ipv4: 10.1.0.6/30 + linkindex: 2 + name: srl1 ~ frr1 + neighbors: + - ifname: eth1 + ipv4: 10.1.0.5/30 + node: frr1 + type: p2p + - clab: + name: e1-2 + ifindex: 2 + ifname: ethernet-1/2 + ipv4: 10.1.0.18/30 + linkindex: 5 + name: srl1 ~ s2_g1 + neighbors: + - ifname: Ethernet3 + ipv4: 10.1.0.17/30 + node: s2_g1 + type: p2p + loopback: + ifindex: 0 + ifname: lo0.0 + ipv4: 10.0.0.3/32 + neighbors: [] + type: loopback 
+ virtual_interface: true + mgmt: + ifname: mgmt0 + ipv4: 192.168.121.103 + mac: ca:fe:00:03:00:00 + name: srl1 plugin: - multiserver provider: clab diff --git a/tests/topology/input/multiserver-auto.yml b/tests/topology/input/multiserver-auto.yml index a4d8739117..afadacc00d 100644 --- a/tests/topology/input/multiserver-auto.yml +++ b/tests/topology/input/multiserver-auto.yml @@ -6,10 +6,10 @@ plugin: [ multiserver ] multiserver: servers: - - id: 1 - host: 192.168.128.1 - - id: 2 - host: 192.168.128.2 + srv1: + host: 192.168.128.1 + srv2: + host: 192.168.128.2 assignment: auto replicate: [ mon_srv ] diff --git a/tests/topology/input/multiserver-explicit.yml b/tests/topology/input/multiserver-explicit.yml index b50830025c..2cc1ed5f9d 100644 --- a/tests/topology/input/multiserver-explicit.yml +++ b/tests/topology/input/multiserver-explicit.yml @@ -1,21 +1,23 @@ --- -# Test explicit assignment in multiserver plugin +# Test explicit assignment in multiserver plugin. +# Covers: groups+members assignment, local/cross-server P2P/bridge links, +# mixed device kinds (eos, srlinux, frr). 
# provider: clab plugin: [ multiserver ] multiserver: servers: - - id: 1 - host: 192.168.128.1 - groups: [ server1_nodes ] - members: [ s1_member ] - vxlan_dev: eth1 - - id: 2 - host: 192.168.128.2 - groups: [ server2_nodes ] - members: [ s2_member ] - vxlan_dev: eth1 + srv1: + host: 192.168.128.1 + groups: [ server1_nodes ] + members: [ s1_member, srl1, frr1 ] + vxlan_dev: eth1 + srv2: + host: 192.168.128.2 + groups: [ server2_nodes ] + members: [ s2_member ] + vxlan_dev: eth1 assignment: explicit vxlan: vni_base: 20000 @@ -33,21 +35,29 @@ nodes: device: eos s1_member: device: eos + srl1: + device: srlinux + frr1: + device: frr s2_g1: device: eos s2_member: device: eos links: - # Local link on server 1 + # Local links on server 1 - s1_g1: s1_member: +- srl1: + frr1: # Local link on server 2 - s2_g1: s2_member: - # Cross-server P2P link + # Cross-server P2P links - s1_g1: s2_g1: +- srl1: + s2_g1: # Cross-server multi-access bridge link (3+ endpoints across servers) - s1_member: s2_member: From a0b83a1ee41d96c36bf73ac632d6499bd4f78765 Mon Sep 17 00:00:00 2001 From: Muddyblack Date: Thu, 4 Jun 2026 20:54:33 +0200 Subject: [PATCH 6/8] adapt for upcoming plugin CLI hooks --- netsim/extra/multiserver/plugin.py | 135 +++++++++++------------------ 1 file changed, 52 insertions(+), 83 deletions(-) diff --git a/netsim/extra/multiserver/plugin.py b/netsim/extra/multiserver/plugin.py index 7bb4e1570f..faffc3389e 100644 --- a/netsim/extra/multiserver/plugin.py +++ b/netsim/extra/multiserver/plugin.py @@ -13,8 +13,13 @@ import yaml from box import Box +from netsim import __version__ +from netsim.augment.config import make_paths_absolute +from netsim.augment.topology import cleanup_topology +from netsim.cli.external_commands import run_command from netsim.data import append_to_list from netsim.modules import _dataplane +from netsim.outputs import _TopologyOutput from netsim.utils import files as _files from netsim.utils import log, templates @@ -46,6 +51,9 @@ def init(topology: 
Box) -> None: # Register the output hook so netlab create calls our output() function append_to_list(topology.defaults.netlab.create, "plugin", "multiserver") + append_to_list(topology.defaults.netlab.up, "plugin", "multiserver") + append_to_list(topology.defaults.netlab.down, "plugin", "multiserver") + # --------------------------------------------------------------------------- @@ -162,17 +170,10 @@ def output(topology: Box) -> None: ) continue _write_vxlan_scripts(out_dir, vxlan_tunnels, dev) - # Auto-run the tunnel scripts via CLI hooks unless the user opted out. - # When auto_start is false the scripts are still written, but the user - # must run them manually (e.g. to stage cross-server convergence). - if vxlan_cfg.get("auto_start", True): - topo_copy.defaults.netlab.up.post_start_clab = "bash vxlan-setup.sh" - topo_copy.defaults.netlab.down.pre_stop_clab = "bash vxlan-teardown.sh" - # Write filtered snapshot so 'netlab up --snapshot' works per-server. - # Done after the hooks above so they are baked into the snapshot. _write_server_snapshot(topo_copy, out_dir) + link_count = len(topo_copy.get("links", [])) vx_count = len(vxlan_tunnels) parts = [f"{len(local_nodes)} nodes", f"{link_count} links"] @@ -550,84 +551,10 @@ def _build_server_topo(topology: Box, local_nodes: set, sid: int, server_map: di # --------------------------------------------------------------------------- -def _cwd_relative(entry: str) -> str: - """Mark a search-path entry as explicitly current-directory-relative. - - A bare relative dir like 'templates' is ambiguous to netlab's render_template(): - when such a dir becomes the in_folder of the template being rendered, the - path[0] not in ('.', '/') branch re-bases it onto the netlab install dir - (get_moddir() / 'templates/...'), so '{% import %}' / '{% include %}' lookups - inside that template fail. A bare relative entry also breaks find_file()'s - os.path.join() the moment netlab is run from a different cwd. 
make_paths_absolute() - normally fixes this during 'netlab create', but a loaded snapshot skips it, so we - anchor relative entries with './' here: render_template() then treats them as - cwd-relative (correct per-server dir) and they stay portable across hosts. - - '~' / absolute / already-'.'-prefixed entries are returned unchanged. An empty - string (from a bare 'topology:') becomes '.'. - """ - if not entry: - return "." - if entry[0] in (".", "/", "~"): - return entry - return "./" + entry - - -def _resolve_snapshot_paths(paths: Box) -> None: - """Make defaults.paths self-contained for 'netlab up --snapshot'. - - netlab's make_paths_absolute() runs in 'netlab create' but NOT when a snapshot - is loaded (load_snapshot just deserializes). Our per-server snapshot is written - in the 'output' plugin hook, which fires BEFORE that step — so without this the - snapshot has unresolved search paths and no f_files, and 'netlab initial' fails - ("list + Box", then "Cannot find ... template"). - - We resolve only what find_file() can't handle at runtime, keeping the snapshot - portable across hosts: - - files/tasks lists -> f_ template-name lists ({{ }} -> { }). - - 'package:' -> absolute install path (get_moddir); portable when netlab is - installed at the same location on every server (the usual case). - - 'topology:' / plain relative -> './' + path via _cwd_relative(), anchoring it - to the per-server working dir (templates/ is copied into each server dir). - The explicit './' is required so render_template() treats it as cwd-relative - instead of re-basing it onto the netlab install dir (see _cwd_relative). - - absolute / '~' entries are left as-is: they resolve identically on any host. 
- """ - moddir = _files.get_moddir() - for k in list(paths.keys()): - v = paths[k] - if (k.startswith("files") or k.startswith("tasks")) and isinstance(v, list): - paths[f"f_{k}"] = [fn.replace("{{", "{").replace("}}", "}") for fn in v] - elif isinstance(v, list): - resolved = [] - for entry in v: - if "package:" in entry: - resolved.append(str(moddir / entry.replace("package:", ""))) - elif "topology:" in entry: - resolved.append(_cwd_relative(entry.replace("topology:", ""))) - else: - resolved.append(_cwd_relative(entry)) - paths[k] = resolved - elif isinstance(v, Box): - _resolve_snapshot_paths(v) - - def _write_server_snapshot(topo_copy: Box, out_dir: str) -> None: - """Write a filtered netlab snapshot for this server's nodes only. - - Allows 'netlab up --snapshot' to work from a per-server directory. Search paths - are made self-contained via _resolve_snapshot_paths() (see there for why and how - portability is preserved). - """ - from netsim import __version__ - from netsim.augment.topology import cleanup_topology - + """Write a filtered netlab snapshot for this server's nodes only.""" snap = Box(topo_copy, box_dots=True) - # Snapshot load doesn't rebuild search paths / f_files, so resolve them now. - if "paths" in snap.get("defaults", {}): - _resolve_snapshot_paths(snap.defaults.paths) - # Remove prefix generators and serialize cleaned = cleanup_topology(snap) topodict = cleaned.to_dict() @@ -637,6 +564,48 @@ def _write_server_snapshot(topo_copy: Box, out_dir: str) -> None: pickle.dump(topodict, f) +def pre_shell_pre_up(topology: Box) -> None: + """Run early on the remote host when 'netlab up' starts from a snapshot. + + Resolves search paths to local absolute paths, then updates the snapshot + pickle and Ansible inventory files to use them. 
+ """ + if "paths" in topology.get("defaults", {}): + make_paths_absolute(topology.defaults.paths) + + # Re-write the updated snapshot to the current directory (which is where we started) + _write_server_snapshot(topology, ".") + + # Re-create the Ansible inventory to populate group_vars with the local paths + ansible_settings = topology.defaults.outputs.get("ansible", Box({})) + output_module = _TopologyOutput.load("ansible", ansible_settings) + if output_module: + output_module.write(topology) + + + +def pre_shell_post_start_lab(topology: Box) -> None: + """Post start lab hook: run VXLAN setup script if auto_start is enabled.""" + ms = topology.get("multiserver", None) + if ms and not ms.vxlan.get("auto_start", True): + return + + if os.path.exists("vxlan-setup.sh"): + run_command("bash vxlan-setup.sh") + + +def pre_shell_pre_stop_lab(topology: Box) -> None: + """Pre stop lab hook: run VXLAN teardown script if auto_start is enabled.""" + ms = topology.get("multiserver", None) + if ms and not ms.vxlan.get("auto_start", True): + return + + if os.path.exists("vxlan-teardown.sh"): + run_command("bash vxlan-teardown.sh") + + + + def _write_vxlan_scripts(out_dir: str, tunnels: list, dev: str) -> None: """Generate bash scripts to create/destroy host-level VXLAN tunnels.""" # Deduplicate tunnels: same VNI+remote pair should only appear once From fa663fdf0eb0ddc2c88d76a87f3c5647c7cabad8 Mon Sep 17 00:00:00 2001 From: Muddyblack Date: Thu, 4 Jun 2026 20:55:32 +0200 Subject: [PATCH 7/8] ruff format --- netsim/extra/multiserver/plugin.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/netsim/extra/multiserver/plugin.py b/netsim/extra/multiserver/plugin.py index faffc3389e..9681b92ab7 100644 --- a/netsim/extra/multiserver/plugin.py +++ b/netsim/extra/multiserver/plugin.py @@ -55,7 +55,6 @@ def init(topology: Box) -> None: append_to_list(topology.defaults.netlab.down, "plugin", "multiserver") - # --------------------------------------------------------------------------- 
# Hook: post_transform — resolve server assignments, classify links # --------------------------------------------------------------------------- @@ -173,7 +172,6 @@ def output(topology: Box) -> None: # Write filtered snapshot so 'netlab up --snapshot' works per-server. _write_server_snapshot(topo_copy, out_dir) - link_count = len(topo_copy.get("links", [])) vx_count = len(vxlan_tunnels) parts = [f"{len(local_nodes)} nodes", f"{link_count} links"] @@ -583,7 +581,6 @@ def pre_shell_pre_up(topology: Box) -> None: output_module.write(topology) - def pre_shell_post_start_lab(topology: Box) -> None: """Post start lab hook: run VXLAN setup script if auto_start is enabled.""" ms = topology.get("multiserver", None) @@ -604,8 +601,6 @@ def pre_shell_pre_stop_lab(topology: Box) -> None: run_command("bash vxlan-teardown.sh") - - def _write_vxlan_scripts(out_dir: str, tunnels: list, dev: str) -> None: """Generate bash scripts to create/destroy host-level VXLAN tunnels.""" # Deduplicate tunnels: same VNI+remote pair should only appear once From 17089b7b5346796a9d606652a32de9de45a417f8 Mon Sep 17 00:00:00 2001 From: Muddyblack Date: Fri, 5 Jun 2026 09:39:54 +0200 Subject: [PATCH 8/8] typo fix --- netsim/extra/multiserver/plugin.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/netsim/extra/multiserver/plugin.py b/netsim/extra/multiserver/plugin.py index 9681b92ab7..2d5fc6090b 100644 --- a/netsim/extra/multiserver/plugin.py +++ b/netsim/extra/multiserver/plugin.py @@ -562,7 +562,7 @@ def _write_server_snapshot(topo_copy: Box, out_dir: str) -> None: pickle.dump(topodict, f) -def pre_shell_pre_up(topology: Box) -> None: +def pre_shell_pre_probe(topology: Box) -> None: """Run early on the remote host when 'netlab up' starts from a snapshot. Resolves search paths to local absolute paths, then updates the snapshot