diff --git a/--gui b/--gui new file mode 100644 index 000000000..e69de29bb diff --git a/--num-robots b/--num-robots new file mode 100644 index 000000000..e69de29bb diff --git a/--sim b/--sim new file mode 100644 index 000000000..e69de29bb diff --git a/--stress-iterations b/--stress-iterations new file mode 100644 index 000000000..e69de29bb diff --git a/--trajectory-types b/--trajectory-types new file mode 100644 index 000000000..e69de29bb diff --git a/-v b/-v new file mode 100644 index 000000000..fa52f4143 --- /dev/null +++ b/-v @@ -0,0 +1,11 @@ +access control disabled, clients can connect from any host +============================= test session starts ============================== +platform linux -- Python 3.12.13, pytest-9.0.3, pluggy-1.6.0 -- /usr/local/bin/python3.12 +cachedir: /tmp/.pytest_cache +rootdir: /home/pranavkumara/Desktop/AirStack/tests +configfile: pytest.ini +plugins: dependency-0.6.1, timeout-2.4.0 +collecting ... collected 0 items + +- generated xml file: /home/pranavkumara/Desktop/AirStack/tests/results/2026-05-28_14-14-04/results.xml - +============================ no tests ran in 0.00s ============================= diff --git a/.agents/skills/add-ros2-package/SKILL.md b/.agents/skills/add-ros2-package/SKILL.md index a9b7072eb..4f0db5f8b 100644 --- a/.agents/skills/add-ros2-package/SKILL.md +++ b/.agents/skills/add-ros2-package/SKILL.md @@ -28,7 +28,7 @@ Determine where the package should live based on its function: **Local Layer:** - Local planner: `robot/ros_ws/src/local/planners/` -- Local controller: `robot/ros_ws/src/local/c_controls/` +- Local controller: `robot/ros_ws/src/local/controls/` - Local world model: `robot/ros_ws/src/local/world_models/` **Global Layer:** @@ -508,7 +508,7 @@ After creating the package: - **AirStack Examples:** - Reference planner: `robot/ros_ws/src/local/planners/droan_local_planner` - - Reference controller: `robot/ros_ws/src/local/c_controls/trajectory_controller` + - Reference controller: 
`robot/ros_ws/src/local/controls/trajectory_controller` - Package template: `assets/package_template/` - **Next Skills:** diff --git a/.agents/skills/add-task-executor/SKILL.md b/.agents/skills/add-task-executor/SKILL.md index bf9bb6983..ab91e6529 100644 --- a/.agents/skills/add-task-executor/SKILL.md +++ b/.agents/skills/add-task-executor/SKILL.md @@ -33,10 +33,12 @@ Examples: coverage survey, object search, object counting, semantic search, fixe |-------------|---------| | `ExplorationTask.action` | Random or systematic area exploration | | `CoverageTask.action` | Systematic lawnmower-pattern coverage survey | -| `ObjectSearchTask.action` | Finding instances of a named object class | -| `ObjectCountingTask.action` | Counting all instances of an object class in an area | +| `NavigateTask.action` | Navigating to a target pose | +| `TakeoffTask.action` | Vertical takeoff to a target altitude | +| `LandTask.action` | Controlled landing | | `FixedTrajectoryTask.action` | Following a pre-defined waypoint trajectory | | `SemanticSearchTask.action` | Finding a location described in natural language | +| `ChatTask.action` | Natural-language chat-driven task | If none of these fits, add a new `.action` file to `task_msgs` following the same pattern (see Step 0 below). @@ -58,7 +60,7 @@ float32 max_flight_speed float32 time_limit_sec # 0 = no limit # ... task-specific fields --- -# Result — returned when task completes or is cancelled +# Result — returned when task completes or is canceled bool success string message # ... 
task-specific result fields @@ -181,7 +183,7 @@ void execute(std::shared_ptr goal_handle) if (cancel_requested_) { auto result = std::make_shared(); result->success = false; - result->message = "Task cancelled"; + result->message = "Task canceled"; task_active_ = false; goal_handle->canceled(result); return; diff --git a/.agents/skills/attach-gossip-payload/SKILL.md b/.agents/skills/attach-gossip-payload/SKILL.md new file mode 100644 index 000000000..0e726648b --- /dev/null +++ b/.agents/skills/attach-gossip-payload/SKILL.md @@ -0,0 +1,240 @@ +# Skill: Attach Custom Payload to PeerProfile (Gossip Protocol) + +## When to use +When you want to broadcast any ROS message to all peer robots via the gossip +protocol — for example, a frontier map, sensor summary, or task status. + +## Background + +Each robot runs a `gossip_node` that periodically broadcasts a `PeerProfile` +to all other robots on the gossip domain (default domain 99). The profile +carries structured fields (GPS, heading, waypoint) plus an open-ended +`payloads` array of serialized ROS messages. + +**Key files:** +| File | Purpose | +|------|---------| +| `common/ros_packages/coordination/coordination_bringup/config/gossip_payloads.yaml` | Lists topics to attach as payloads — **edit this to add payloads** | +| `coordination_bringup/coordination_bringup/gossip_node.py` | Reads config, subscribes, attaches payloads on each publish tick | +| `coordination_bringup/coordination_bringup/peer_profile.py` | `PeerProfile` helper class with `add_payload` / `get_payload` API | +| `coordination_msgs/msg/PeerProfile.msg` | Wire format — `payloads` is `PeerProfilePayload[]` | +| `coordination_msgs/msg/PeerProfilePayload.msg` | `string payload_type` + `uint8[] payload_data` | + +## How to add a payload (config-driven — no code changes) + +### Step 1 — Edit `gossip_payloads.yaml` + +```yaml +payload_topics: + # existing entries ... 
+ + # Your new payload: + - topic: "/{robot_name}/your/topic" + type: "your_msgs/msg/YourType" +``` + +- `{robot_name}` is automatically substituted at runtime (e.g. → `/robot_1/your/topic`) +- If the topic hasn't published yet, the payload is silently skipped — no crash +- `type` must be the fully-qualified ROS 2 type string + +### Step 2 — Rebuild and restart gossip_node + +```bash +bws --packages-select coordination_bringup +ros2 launch coordination_bringup gossip.launch.xml +``` + +### Step 3 — Verify + +Check that the payload is being attached: +```bash +ros2 topic echo /gossip/peers --field payloads +# should show entries with your payload_type string +``` + +Or use the registry monitor: +```bash +ros2 run coordination_bringup peer_registry_monitor +# shows payload_types per peer +``` + +## How to read a payload on the receiving side + +```python +from coordination_bringup.peer_profile import PeerProfile + +def on_peer_msg(self, msg): + profile = PeerProfile.from_ros_msg(msg) + + # Get a specific payload by type string + rays = profile.get_payload("visualization_msgs/msg/MarkerArray") + if rays is not None: + # use rays as visualization_msgs/msg/MarkerArray + pass + + # List all payload types present + print(profile.payload_types()) + + # Get all payloads deserialized + for payload in profile.get_all_payloads(): + print(type(payload)) +``` + +## Step 4 — Add GCS visualization + +After adding a payload to `gossip_payloads.yaml`, add a handler so it appears in +Foxglove. Each payload is published to its own topic: +`/gcs/payload/{robot_name}/{payload_name}` + +This means Foxglove exposes full visualization controls (point size, color mapping, +marker type, etc.) for each payload independently. 
+ +**File:** `gcs/ros_ws/src/gcs_visualizer/gcs_visualizer/payload_visualizer_node.py` + +### 4a — Read the payload type from `gossip_payloads.yaml` + +Open `common/ros_packages/coordination/coordination_bringup/config/gossip_payloads.yaml` +and note the `type:` field for your new entry. This determines how to deserialize it. + +Continue to step 4b even if another entry in `PAYLOAD_HANDLERS` already uses the same ROS type — handlers are registered per payload name, not per type. +### 4b — Add handler and register in `PAYLOAD_HANDLERS` + +`PAYLOAD_HANDLERS` is keyed by **payload name** (the last segment of the topic path +in `gossip_payloads.yaml`). This means multiple payloads of the same ROS type work +without any special casing. + +Add a handler and register it: + +```python +PAYLOAD_HANDLERS = { + 'filtered_rays': ('visualization_msgs/msg/MarkerArray', _handle_filtered_rays), + 'raw_frontiers': ('sensor_msgs/msg/PointCloud2', _handle_raw_frontiers), + 'voxel_rgb': ('sensor_msgs/msg/PointCloud2', _handle_rgb_voxels), + 'navigation_mode': ('std_msgs/msg/String', _handle_navigation_mode), + 'your_name': ('your_msgs/msg/YourType', _handle_your_payload), # ← add +} +``` + +The key `'your_name'` must match the last path segment of the topic in `gossip_payloads.yaml`. +For example, `/{robot_name}/rayfronts/voxel_rgb` → key is `voxel_rgb`. + +Handler signature — all handlers must match exactly: +```python +def _handle_your_payload(self, robot_name, msg, i, now): + # msg — deserialized ROS message (already in global ENU / 'map' frame) + # i — stable robot index (use for marker IDs: i * 100000 + unique_offset) + # now — current ROS timestamp (builtin_interfaces/Time) + # transform and publish to the payload's dedicated topic: + self._pub_for(f'/gcs/payload/{robot_name}/your_name', YourMsgType).publish(out) +``` + +### 4c — Visualization options + +For `PointCloud2` payloads, choose one approach: + +**Default:** Publish as raw `PointCloud2` — Foxglove GUI controls point size, shape, and color. No extra code needed. 
+ +**Preconfigured shape/size:** Convert to a `CUBE_LIST` `MarkerArray` in the handler (see `voxel_rgb` for a real example). Use this when you want a fixed visual style regardless of user layout settings: + +```python +from gcs_visualizer.gcs_utils import point_cloud2_to_cube_marker + +def _handle_your_payload(self, robot_name, msg, i, now): + marker = point_cloud2_to_cube_marker( + msg, 0.0, 0.0, self._display_z_offset(), + ns=f'{robot_name}_your_name', + marker_id=i * 100000, + stamp=now, + lifetime=Duration(sec=2, nanosec=0), + fallback_color=None, # uses per-point rgb field; set to (r, g, b, a) for a solid color + scale=0.5, # cube size in meters + ) + if marker is not None: + out = MarkerArray() + out.markers.append(marker) + self._pub_for(f'/gcs/payload/{robot_name}/your_name', MarkerArray).publish(out) +``` + +### 4d — Available transform helpers (`gcs_utils.py`) + +Check `gcs/ros_ws/src/gcs_visualizer/gcs_visualizer/gcs_utils.py` before writing +transform logic. Add a new helper there if none fits. 
+ +| Helper | Use for | +|--------|---------| +| `transform_marker_array(ma, bx, by, bz)` | `MarkerArray` → translated `MarkerArray` | +| `transform_point_cloud2(cloud, bx, by, bz)` | `PointCloud2` → translated `PointCloud2` (preserves all fields including `rgb`) | +| `point_cloud2_to_cube_marker(cloud, bx, by, bz, ns, marker_id, stamp, lifetime, scale)` | `PointCloud2` → `CUBE_LIST` Marker with fixed voxel size and per-point RGB | + +### 4e — Rebuild GCS + +```bash +docker exec airstack-gcs-1 bash -c "bws --packages-select gcs_visualizer && sws" +``` + +Verify topics exist: +```bash +docker exec airstack-gcs-1 bash -c "ros2 topic list | grep /gcs/payload" +``` + +--- + +## Current payloads + +| Topic in `gossip_payloads.yaml` | Type | GCS topic | Foxglove controls | +|--------------------------------|------|-----------|-------------------| +| `/{robot_name}/filtered_rays` | `visualization_msgs/msg/MarkerArray` | `/gcs/payload/{robot}/filtered_rays` | Fixed (MarkerArray) | +| `/{robot_name}/raw_frontiers` | `sensor_msgs/msg/PointCloud2` | `/gcs/payload/{robot}/raw_frontiers` | Full (raw PointCloud2) | +| `/{robot_name}/rayfronts/voxel_rgb` | `sensor_msgs/msg/PointCloud2` | `/gcs/payload/{robot}/voxel_rgb` | Fixed (CUBE_LIST MarkerArray, 0.5 m) | +| `/{robot_name}/navigation_mode` | `std_msgs/msg/String` | `/gcs/payload/{robot}/navigation_mode` | Status string | + +## Architecture notes + +- `gossip_node` runs on the **robot's domain** (e.g. 
domain 1 for `robot_1`) + and can subscribe directly to any topic on that domain, including Rayfronts +- The gossip DDS Router bridges `/gossip/peers` to the shared gossip domain + (default 99) — the payload bytes travel inside the PeerProfile message, + so payload topics do **not** need their own DDS router entries +- Payloads are re-serialized every publish tick from the latest cached message; + stale data is never cleared between ticks (latest-wins per topic) +- Payload size matters for gossip bandwidth — avoid attaching large point clouds + at high rates; 1 Hz (the default gossip rate) is usually fine + +## Message deduplication + +Every `PeerProfile` message is identified by the triple: + +``` +(robot_name, gps_fix.header.stamp.sec, gps_fix.header.stamp.nanosec) +``` + +The stamp is set **at publish time** by the originating robot. Each receiver +maintains a seen-set (size 50, FIFO eviction) and drops any message whose ID +has already been processed. + +**Expected behavior:** every drone will forward/receive a message at least +once — this is intentional. The seen-set prevents infinite re-processing but +does not prevent the initial fan-out that comes from all robots being on the +same shared DDS domain. + +**Relay fields (reserved for future use):** +- `uint8 source` — `SOURCE_DIRECT (0)` or `SOURCE_RELAYED (1)` +- `uint8 relay_hops` — number of hops the message has traversed + +These fields exist in the wire format and Python API but relay logic is not +yet implemented. The seen-set deduplication is already wired to handle it +correctly when relay is activated. 
+ +## Registry behavior + +- Each robot keeps a **per-robot inbox** (latest message per peer, drained at + 5 Hz) and a **global registry** (latest-wins, monotonic per robot timestamp) +- Registry entries are **never evicted** — a crashed robot stays visible + indefinitely until the node is restarted +- The registry is published to `/{robot_name}/coordination/peer_registry` with + RELIABLE + TRANSIENT_LOCAL QoS so late-joining nodes get the full snapshot + +## QoS note + +Payload subscriptions use `GOSSIP_QOS` (BEST_EFFORT, KEEP_LAST 1). If your +source topic uses RELIABLE QoS you may need to adjust — see `gossip_node.py`. diff --git a/.agents/skills/bump-version-and-release/SKILL.md b/.agents/skills/bump-version-and-release/SKILL.md new file mode 100644 index 000000000..3c056b774 --- /dev/null +++ b/.agents/skills/bump-version-and-release/SKILL.md @@ -0,0 +1,262 @@ +--- +name: bump-version-and-release +description: Bump the AirStack VERSION in .env (semver) before merging a PR that changes Docker image content, and update CHANGELOG. Required to pass the check-version-increment gate and to trigger the docker-build release workflow. +license: Apache-2.0 +metadata: + author: AirLab CMU + repository: AirStack +--- + +# Skill: Bump VERSION and Cut a Release + +## When to Use + +Bump the `VERSION=` line in `/.env` whenever a pull request **materially changes the contents of any Docker image** that AirStack publishes. Skipping the bump will fail the `check-version-increment` workflow on every PR — this is the single most common preventable CI failure for new contributors. + +Bump VERSION when the PR touches: + +- Any `Dockerfile` under `robot/`, `simulation/isaac-sim/`, `simulation/ms-airsim/`, `gcs/`, `common/`, or `tests/docker/` +- `docker-compose.yaml` or any included sub-compose file (when the change affects what is built or installed into images) +- Code that is **baked into** an image (i.e., copied during build, not bind-mounted at runtime). 
For `DOCKER_IMAGE_BUILD_MODE="prebuilt"` this includes `robot/ros_ws/src/**`. For `DOCKER_IMAGE_BUILD_MODE="dev"` (the current default in `.env`) the workspace is bind-mounted, so source-only changes there do not strictly require a rebuild — but bumping is still safer if you are unsure. +- Apt packages, pip requirements, ROS package manifests installed during the build +- Entry-point scripts, tmux configs, or `.bashrc` snippets copied into images +- Submodule pointer updates that affect image contents + +If in doubt, bump. The CI gate enforces a strict increment vs. the base branch — you cannot land a PR with the same VERSION as the base. + +## When NOT to Bump + +The version-increment check still runs on every PR, but the **only** thing it requires is that VERSION be valid semver and strictly greater than the base. For documentation-only PRs you have two options: + +- **Preferred:** still bump the patch (or pre-release counter) by one. It costs nothing, keeps the gate happy, and the docker-build only fires on push to `main`/`develop` *and* a VERSION change, so an extra alpha bump on a docs PR is cheap. +- **If you really want to avoid a rebuild:** the docker-build workflow only triggers when `.env` is in the changed paths AND `VERSION=` differs from `HEAD~1`. So a docs-only PR that does not touch `.env` will not rebuild — but the PR will still fail the increment check unless you bump. There is no clean way to "opt out" of the check; the simplest path is to bump the pre-release counter. + +Do **not** bump for: + +- Comment-only changes to `.env` +- CI workflow tweaks that do not affect what goes in the images +- Test-only edits under `tests/` (unless you also touched `tests/docker/`) + +If you change *only* docs but the gate still fails because base advanced past you, rebase and bump again. + +## How VERSION Drives the Build Pipeline + +Three workflows in `.github/workflows/` interact with `VERSION`: + +### 1. 
`check-version-increment.yml` — the PR gate + +- **Trigger:** every `pull_request`. +- **Logic:** runs a Python script that reads `VERSION=` from the PR's `.env`, reads the same line from `origin/<base-branch>:.env`, and validates: + - PR version matches the regex `^(\d+)\.(\d+)\.(\d+)(?:-(alpha|beta|rc)\.(\d+))?$` + - PR version is **strictly greater than** base version, with pre-release ordering `alpha < beta < rc < (no suffix / release)` +- **Accepted formats** (from the workflow's own error message): + ``` + MAJOR.MINOR.PATCH (e.g. 1.2.3) + MAJOR.MINOR.PATCH-alpha.N (e.g. 1.3.0-alpha.1) + MAJOR.MINOR.PATCH-beta.N (e.g. 1.3.0-beta.2) + MAJOR.MINOR.PATCH-rc.N (e.g. 1.3.0-rc.3) + ``` +- **Rejected:** `1.2`, `1.2.3-rc1` (no dot before N), `1.2.3-dev`, `1.2.3+meta`, `v1.2.3`, anything with build metadata. +- **Comparison tuple:** `(major, minor, patch, pre_rank, pre_num)` where `pre_rank = alpha:0, beta:1, rc:2, release:3`. So `1.3.0-alpha.5 < 1.3.0-beta.1 < 1.3.0-rc.1 < 1.3.0`. A release version always sorts above any pre-release of the same `MAJOR.MINOR.PATCH`. + +### 2. `docker-build.yml` — the publish trigger + +- **Trigger:** push to `main` or `develop` whose changed paths include `.env`, **and** the `VERSION=` line in `.env` differs from the previous commit. Also runs on manual `workflow_dispatch`. +- **Behavior on tag change:** + 1. Runs on a self-hosted ephemeral GPU runner (`[self-hosted, airstack-ephemeral]`). + 2. `docker compose build` for profiles `desktop,isaac-sim,ms-airsim`. + 3. `docker compose push` to `${PROJECT_DOCKER_REGISTRY}` (set in `.env` — currently `airlab-docker.andrew.cmu.edu/airstack`). + 4. Keyless `cosign sign` of every pushed image digest via GitHub OIDC. + 5. `cosign verify` against the workflow's certificate identity. +- **Skip behavior:** if the merge commit on `main`/`develop` does not actually change `VERSION=`, the build job is skipped (the check-changes job sets `tag-changed=false`). + +### 3. 
`deploy_docs_from_release.yaml` — versioned docs + +- **Trigger:** GitHub `release` event with `types: [published]`. +- **Behavior:** runs `mike deploy --push --update-aliases <release-tag> latest`, publishing the docs site under the release tag and pointing the `latest` alias at it. +- Companion workflows publish unversioned docs from `main` (default alias `main`) and `develop` (alias `develop`). + +So the full release path is: bump `VERSION` → PR → merge to `main`/`develop` (rebuild + push + sign) → cut a GitHub Release matching that VERSION (versioned docs go live). + +## Choosing the Bump Type + +Use this decision tree on the **current** version (currently `0.18.0-alpha.7`): + +``` +Is this a breaking API/topic/interface change? +├── yes → bump MAJOR, reset MINOR=0, PATCH=0 (e.g. 0.18.0-alpha.7 → 1.0.0-alpha.1 if pre-1.0) +└── no + ├── New feature / new module / new Docker image content? + │ └── yes → bump MINOR, reset PATCH=0 (e.g. 0.18.0 → 0.19.0) + └── Bug fix / small tweak / dependency bump? + └── yes → bump PATCH (e.g. 0.18.0 → 0.18.1) + +Are you mid-cycle on a pre-release line (suffix present)? +├── Same line, more iteration → increment N (0.18.0-alpha.7 → 0.18.0-alpha.8) +├── Promoting alpha → beta → reset N to 1 (0.18.0-alpha.7 → 0.18.0-beta.1) +├── Promoting beta → rc → reset N to 1 (0.18.0-beta.4 → 0.18.0-rc.1) +└── Promoting rc → release → drop suffix (0.18.0-rc.3 → 0.18.0) +``` + +Notes: + +- AirStack is pre-1.0; many "breaking" changes still bump MINOR rather than MAJOR. Use judgment, and prefer pre-release suffixes (`-alpha.N`) for the active development line so feature PRs do not have to fight over MINOR numbers. +- The current pattern in git history is per-PR alpha bumps on the development line and a final un-suffixed bump at release time (e.g. `0.16.1-rc → 0.16.1`, `0.17.0-rc1 → 0.17.0` — note the older `-rc1` form predates the current validator and would be rejected today; use `-rc.1`). + +## Bumping Steps + +### 1. 
Read the current VERSION + +```bash +airstack version +# → AirStack Version: 0.18.0-alpha.7 +``` + +(Equivalent: `grep '^VERSION=' .env`.) + +### 2. Edit `.env` + +Open `/.env` and change exactly the `VERSION=` line. Keep the surrounding comments and quoting intact: + +```diff +- VERSION="0.18.0-alpha.7" ++ VERSION="0.18.0-alpha.8" +``` + +The validator strips surrounding `"` or `'`, so either quoting style works, but match the existing style (double quotes). + +### 3. Update `CHANGELOG.md` + +Add an entry under `## [Unreleased]` describing your change (see "CHANGELOG Conventions" below). For a true release (no pre-release suffix), promote `[Unreleased]` to a new dated version section. + +### 4. Verify locally + +```bash +airstack version # prints the new value +grep '^VERSION=' .env # sanity-check the literal line +git diff .env CHANGELOG.md # review the diff +``` + +Optional regex preflight (mirrors the CI check): + +```bash +python3 -c 'import re,sys; v=open(".env").read(); m=re.search(r"^VERSION\s*=\s*\"?([^\"#\s]+)", v, re.M); print(m.group(1)); assert re.fullmatch(r"^(\d+)\.(\d+)\.(\d+)(?:-(alpha|beta|rc)\.(\d+))?$", m.group(1)), "INVALID"' +``` + +### 5. Commit + +Use a clear, conventional message: + +``` +Bump version to 0.18.0-alpha.8 +``` + +Recent commits in this repo use exactly this phrasing (`Bump version to 0.17.0`, `Bump version to 0.16.1`). + +## CHANGELOG Conventions + +`CHANGELOG.md` follows [Keep a Changelog 1.1.0](https://keepachangelog.com/en/1.1.0/) and Semantic Versioning. The literal layout in the repo is: + +```markdown +# Changelog + +All notable changes to this project will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). + +## [Unreleased] + +### Added + +- + +### Changed + +- + +### Fixed + +- + +## [1.0.0] - 2024-12-19 + +First official public release. 
+ +### Added + +- + +### Fixed + +- + +### Changed + +- + +### Removed + +- +``` + +Rules: + +- Use the H3 sections **Added**, **Changed**, **Fixed**, **Removed**, **Deprecated**, **Security** as needed (Keep a Changelog standard set). +- For pre-release bumps (`-alpha.N`, `-beta.N`, `-rc.N`), keep your bullets under `## [Unreleased]`. Do not create a section per alpha. +- For a release bump (no suffix), rename `[Unreleased]` to `## [X.Y.Z] - YYYY-MM-DD` and add a fresh empty `## [Unreleased]` above it. +- Use ISO date format `YYYY-MM-DD`. +- Write user-facing prose, not commit log dumps. Mention new modules, breaking changes, and notable behavior shifts. + +## Common Pitfalls + +- **Forgetting the bump.** The `check-version-increment` job fails with `::error::VERSION must be strictly greater than the base branch version.` Bump and force-push the branch. +- **Invalid semver.** Forms like `1.2`, `1.2.3-rc1`, `1.2.3-dev`, `1.2.3+sha.abc`, `v1.2.3`, or empty strings fail with `::error::VERSION '' does not match the required format.` The only allowed pre-release tags are exactly `alpha`, `beta`, `rc`, each followed by a literal dot and an integer (e.g. `-rc.1`, never `-rc1`). +- **Going backwards.** `0.18.0 → 0.18.0-rc.1` looks like progress but is a regression: release > rc. Always move forward in the comparison tuple. +- **Two PRs racing for the same number.** Whichever merges first wins; the other PR's `check-version-increment` will start failing the moment the base advances to or past its version. Rebase on the updated base branch and bump again. +- **Bumping but forgetting the CHANGELOG.** No CI gate enforces this, but reviewers will (and the release docs workflow lists what shipped per version, so missing entries become invisible history). +- **Bumping for pure docs PRs.** Wastes a registry tag. Prefer to keep docs-only changes off `.env` if possible — but if the gate is failing, an alpha bump is the path of least resistance. 
+- **Editing `VERSION=` quoting.** The extractor regex `^VERSION\s*=\s*["\']?([^"\'#\s]+)` handles double quotes, single quotes, or no quotes, and stops at `#`/whitespace. Don't add inline comments after the value (e.g. `VERSION="0.18.1" # bumped`) — the trailing `# bumped` will be stripped from the value but obscures intent; put comments on their own line above. +- **Touching only sub-compose `.env` files.** The check looks at the **repo-root** `.env` only. `robot/docker/.env` and friends are container env files, not the version source of truth. +- **Force-pushing after merge to fix CHANGELOG.** Don't. Land a follow-up PR with the CHANGELOG correction (and, by the rules above, another tiny VERSION bump). + +## Release Checklist + +For a normal feature/fix PR: + +1. [ ] Confirm the PR changes Docker image content or otherwise warrants a bump (see "When to Use"). +2. [ ] Pick the bump type (see "Choosing the Bump Type"). +3. [ ] Edit `/.env` — change only the `VERSION=` line. +4. [ ] Update `CHANGELOG.md` under `## [Unreleased]`. +5. [ ] `airstack version` and `git diff .env CHANGELOG.md` to verify. +6. [ ] Commit (`Bump version to ` is the established style). +7. [ ] Push and open the PR. Confirm `Check VERSION Increment` passes green. +8. [ ] After review, merge into `develop` (or `main` per branch policy). +9. [ ] Watch `Auto Build on Docker Image Tag Change` on the merged commit. It runs on the ephemeral GPU runner; expect ~10–60 min depending on profiles. It must succeed before downstream consumers can `docker compose pull` the new tag. + +For a true release (dropping the pre-release suffix): + +1. [ ] Land final fixes on `develop` with `-rc.N` bumps. +2. [ ] Open a PR that bumps `VERSION="X.Y.Z-rc.N"` → `VERSION="X.Y.Z"`. +3. [ ] In the same PR, promote `## [Unreleased]` to `## [X.Y.Z] - YYYY-MM-DD` and add a fresh empty `## [Unreleased]`. +4. [ ] Merge to `main`. +5. [ ] Wait for `docker-build.yml` to push and sign all images. +6. 
[ ] Create a GitHub Release with tag `X.Y.Z` (matching `VERSION` exactly). Publishing the release fires `deploy_docs_from_release.yaml`, which runs `mike deploy --push --update-aliases X.Y.Z latest` and updates the versioned docs site. +7. [ ] Verify the docs site shows the new version under the version selector and that `latest` resolves to it. + +## References + +- [`/.env`](../../../.env) — source of truth for `VERSION=` +- [`/CHANGELOG.md`](../../../CHANGELOG.md) — release history +- [`/.github/workflows/check-version-increment.yml`](../../../.github/workflows/check-version-increment.yml) — the PR gate (semver regex lives here) +- [`/.github/workflows/docker-build.yml`](../../../.github/workflows/docker-build.yml) — build/push/sign on tag change +- [`/.github/workflows/deploy_docs_from_release.yaml`](../../../.github/workflows/deploy_docs_from_release.yaml) — versioned docs on release +- [`/.github/workflows/deploy_docs_from_main.yaml`](../../../.github/workflows/deploy_docs_from_main.yaml) and [`deploy_docs_from_develop.yaml`](../../../.github/workflows/deploy_docs_from_develop.yaml) — branch-tracking docs aliases +- [`/airstack.sh`](../../../airstack.sh) — defines `airstack version` and `get_VERSION` (used everywhere image tags are built) +- [Keep a Changelog 1.1.0](https://keepachangelog.com/en/1.1.0/) +- [Semantic Versioning 2.0.0](https://semver.org/spec/v2.0.0.html) + +## Related Skills + +- [`run-system-tests`](../run-system-tests) — what fires on every PR alongside the version check +- [`update-documentation`](../update-documentation) — for docs-only PRs that may still need a VERSION bump to clear the gate diff --git a/.agents/skills/capture-discovered-knowledge/SKILL.md b/.agents/skills/capture-discovered-knowledge/SKILL.md new file mode 100644 index 000000000..d362a62ad --- /dev/null +++ b/.agents/skills/capture-discovered-knowledge/SKILL.md @@ -0,0 +1,134 @@ +--- +name: capture-discovered-knowledge +description: Persist hard-won discoveries to 
AGENTS.md or a new/existing SKILL.md so future agents don't repeat the discovery cost. Trigger after any long context-discovery task (multi-minute grep / file-reading session, parallel research agents, debugging that took several iterations) or whenever you learn something critical, surprising, undocumented, or that contradicted prior assumptions in AGENTS.md or a skill. Decides between updating AGENTS.md, updating an existing skill, or creating a new skill. +license: Apache-2.0 +metadata: + author: AirLab CMU + repository: AirStack +--- + +# Skill: Capture Discovered Knowledge for Future Agents + +## When to Use + +Fire this skill at the **end** of a task when any of the following is true: + +- The task required a long context-discovery phase: many greps across the repo, reading multiple unfamiliar files, or running parallel research agents. +- You learned a mechanism that was **not documented** in [AGENTS.md](../../../AGENTS.md), the relevant skill, or the package README — and a future agent would need it to do similar work. +- A discovery **contradicted** something stated in AGENTS.md or a skill (a stale claim, a renamed file, a removed flag, a wrong path). +- A debugging session resolved on a non-obvious cause (env var that must be unset, a hostname-vs-container-name dispatch, a hidden config precedence rule). +- You produced a new skill, deprecated one, or merged two — the registry in AGENTS.md must reflect that. +- A user-issued correction revealed a project convention that isn't written down. + +**Strong trigger:** the task ran more than ~5 minutes and most of that time was reading rather than writing. That cost should be amortized. + +## When NOT to Use + +Persisting noise is worse than persisting nothing. Skip this skill when: + +- The discovery is already captured in code (well-named symbols, an existing config key, a clear comment) — the code itself is the source of truth. 
+- The fact is only true for the current branch or PR (in-flight refactors, temporary workarounds being removed in the same PR). +- The information is conversation-scoped (current task progress, a plan, a TODO list). +- AGENTS.md or an existing skill *already* covers it. Re-read before adding. +- It's a one-off bug fix with no recurrence risk. The commit message is enough. + +If unsure, prefer NOT writing. AGENTS.md is loaded into every agent's context — bloat has a real cost. + +## Decision: Where Does This Knowledge Belong? + +Use this tree, top-down. Stop at the first match. + +1. **Does it correct a wrong claim** in [AGENTS.md](../../../AGENTS.md) or an existing SKILL.md? + → Edit that file directly. Do not duplicate the correction elsewhere. + +2. **Is it a project-wide mechanism** that any agent might encounter (env var flow, container lifecycle gotcha, CI gate, naming convention, where state is computed)? + → Add a short paragraph or table row to [AGENTS.md](../../../AGENTS.md). Pick the best existing section; only create a new H2/H3 if the topic is genuinely new. + +3. **Is it a multi-step workflow** (more than ~5 distinct steps, or with conditional branches) that recurs? + → Update an existing skill if one is close, otherwise create a new skill under [.agents/skills/](..). Then add a row to the skills table in AGENTS.md. + +4. **Is it scoped to a single module / package**? + → Update that package's `README.md`, not AGENTS.md. + +5. **Is it about *how to write code* in a particular subsystem** (style, idiom, helper to prefer)? + → Update the relevant skill if one exists. If not and it's broadly useful, consider a new skill. If narrow, leave it for the code reviewer. + +## Updating AGENTS.md + +Keep edits surgical. AGENTS.md is read on every agent invocation, so every line should earn its place. + +**Do:** + +- Lead with the mechanism in one sentence, then the file/line that's authoritative. 
+- Link to source files with markdown so the reader can verify (`[robot/docker/.bashrc](robot/docker/.bashrc)`). +- Put corrections inline where the wrong claim used to live — don't leave the wrong claim and append a footnote. +- If you're adding more than ~10 lines, ask whether it really belongs in a skill instead. + +**Don't:** + +- Restate things the file already says. +- Add aspirational guidance ("we should…", "TODO: document…"). AGENTS.md is descriptive of the current state. +- Add emojis or decorative formatting beyond what the file already uses. +- Re-paragraph long-line content to satisfy MD013; the file's prevailing style ignores it. + +## Creating a New Skill + +Only create a new skill when the topic is **recurring**, **non-trivial**, and **doesn't fit into an existing skill without distorting it**. The bar is high: 9 → 14 skills is a one-week jump; 14 → 30 is unmanageable. + +Steps: + +1. Pick a verb-led kebab-case name: `verb-object` (e.g., `add-ros2-package`, `bump-version-and-release`). Avoid noun-only names — they don't read well in trigger descriptions. +2. Read 2 existing skills closest in style to yours. Match their frontmatter shape exactly: `name`, `description`, `license: Apache-2.0`, `metadata.author: AirLab CMU`, `metadata.repository: AirStack`. +3. Write the description as a **one-line trigger** that contains the words an agent would use when describing the task. The trigger model fires on this string, so include the keywords (commands, file names, error phrases, env vars). +4. Section structure: `## When to Use` → numbered or topical body → `## Common Pitfalls` → optional skeleton/cheatsheet at the end. +5. Verify all claims against the actual files. If you can't find a referenced file, the skill is wrong. +6. Add the skill to the table in [AGENTS.md](../../../AGENTS.md) under "Common Workflows (Skills)". One line, one clear when-to-use phrase. +7. 
Do **not** update unrelated skills with cross-references unless it's load-bearing — the trigger system handles routing. + +## Updating an Existing Skill + +When the discovery extends rather than replaces an existing skill: + +- Edit the relevant section in place. Don't append a "Recent Findings" trailer — fold it in. +- If the change makes the description-line trigger inaccurate, update the description too. +- If a "Common Pitfalls" section exists, that's usually the right home for one-line gotchas. +- Bump nothing — skills aren't versioned. + +## Deprecating or Merging Skills + +If a discovery makes a skill redundant or wrong as a whole: + +- Prefer merging into the surviving skill rather than leaving a stub. +- Delete the old `SKILL.md` and the directory. +- Remove the row from the AGENTS.md skills table. +- If anything else in the repo links to the old skill path, update those links in the same edit. + +## Quality Bar + +Before saving, confirm: + +- [ ] The claim is verified against the current code, not memory or a prior conversation. +- [ ] A future agent reading only this addition (no surrounding context) would understand it. +- [ ] The "why" or "how it ends up that way" is included when the mechanism is non-obvious — not just the end-state fact. +- [ ] The addition is in the smallest scope that fits (package README < skill < AGENTS.md). +- [ ] No duplicate of an existing claim. + +## Common Pitfalls + +1. **Over-saving** — turning every bug fix into an AGENTS.md note. The git history is also memory; trust it. +2. **Wrong scope** — putting module-specific quirks in AGENTS.md instead of the package README, or vice versa. +3. **Restating code** — if a function name or config key already explains the behavior, the doc adds noise. +4. **Stale references** — linking files by path without verifying they still exist at that path. +5. **Description bloat** — skill descriptions that pile on keywords stop firing reliably. 
Keep them as a single, concrete trigger sentence. +6. **Forgetting the registry** — creating a new skill but not adding it to the AGENTS.md table makes it invisible to future agents. +7. **Saving in the wrong direction** — recording "I tried X and it failed" instead of "the mechanism is Y." Future agents need the mechanism, not the trial. + +## Quick Self-Check at End of Task + +Three questions, ~10 seconds: + +1. *If a different agent picked up this same task tomorrow, is there anything they'd waste time rediscovering?* +2. *Did anything I just learned contradict AGENTS.md or a skill?* +3. *Did I create, merge, or deprecate a skill?* + +If any answer is yes → invoke this skill. If all are no → you're done. diff --git a/.agents/skills/configure-multi-robot/SKILL.md b/.agents/skills/configure-multi-robot/SKILL.md new file mode 100644 index 000000000..4fc977472 --- /dev/null +++ b/.agents/skills/configure-multi-robot/SKILL.md @@ -0,0 +1,391 @@ +--- +name: configure-multi-robot +description: Configure, name, and isolate multiple robots in AirStack. Use whenever launching multi-robot, multiple robots, swarm, or fleet scenarios; setting ROBOT_NAME; debugging cross-robot topic collisions; choosing a ROS_DOMAIN_ID; or namespacing topics, TF frames, and DDS bridges across robots. +license: Apache-2.0 +metadata: + author: AirLab CMU + repository: AirStack +--- + +# Skill: Configure Multi-Robot Setup + +## When to Use + +Reach for this skill any time you: + +- Spawn more than one robot in simulation (`NUM_ROBOTS > 1`) +- Deploy multiple physical aircraft (VOXL, Jetson, etc.) 
+- Debug topic collisions, missing topics on `/robot_2/...`, or "two robots talking on the same topic" +- Write a new launch file or YAML config that hardcodes a topic path +- Vary `--num-robots` in the system test suite (`tests/`) +- Add a node that subscribes to or publishes a robot-specific topic +- Set up the `desktop_split` / `offboard` profile where global planning runs on a separate machine + +If you only ever touch one robot, you can usually skip this skill — but the moment a second drone enters the picture, every assumption about hardcoded `/drone1/...` topic names becomes a bug. + +## Prerequisites + +- Familiarity with the AirStack docker-compose layout (`.env`, `robot/docker/`, `simulation/{isaac-sim,ms-airsim}/`) +- Basic understanding of ROS 2 namespaces and TF frame names +- You have already read [`docs/robot/docker/robot_identity.md`](../../../docs/robot/docker/robot_identity.md), or are willing to as you go — that file is the canonical reference for the resolution mechanism + +## How ROBOT_NAME Flows Through the Stack + +`ROBOT_NAME` is **not** a single static value. It is computed per container at shell start by `robot/docker/.bashrc` and propagated into every ROS launch substitution. The full chain: + +``` +.env (ROBOT_NAME_MAP_CONFIG_FILE, NUM_ROBOTS) + │ + ▼ +docker-compose.yaml (ROBOT_NAME_SOURCE=container_name | hostname, + deploy.replicas: ${NUM_ROBOTS:-1}) + │ + ▼ +robot/docker/.bashrc (runs on container shell start) + │ + ├─ ROBOT_NAME_SOURCE=container_name → resolve `hostname` back to docker container name + │ (e.g. `airstack-robot-desktop-1`) + ├─ ROBOT_NAME_SOURCE=hostname → use OS hostname directly (`robot-1` on real HW) + │ + ▼ +robot/docker/robot_name_map/resolve_robot_name.py + applies regex rules from $ROBOT_NAME_MAP_CONFIG_FILE + │ + ▼ +exports ROBOT_NAME=robot_<N> (e.g. robot_1) + ROS_DOMAIN_ID=<N> (e.g. 1) + │ + ▼ +ros2 launch reads $(env ROBOT_NAME) → topic remappings, push_ros_namespace, + MAVROS FCU URLs, DDS allowlists, etc.
+``` + +The default mapping rule in [`robot/docker/robot_name_map/default_robot_name_map.yaml`](../../../robot/docker/robot_name_map/default_robot_name_map.yaml): + +```yaml +- pattern: '.*robot-.*(\d+)' + robot: 'robot_{1}' + domain_id: '{1}' +- pattern: '.*' # catch-all + robot: 'unknown-robot' + domain_id: '0' +``` + +So `airstack-robot-desktop-1` → `ROBOT_NAME=robot_1`, `ROS_DOMAIN_ID=1`. Replica `2` → `robot_2`, domain `2`. Etc. **The container name is the source of truth in simulation.** + +The top-level [`autonomy_bringup/launch/robot.launch.xml`](../../../robot/ros_ws/src/autonomy_bringup/launch/robot.launch.xml) then pushes this name as the root namespace for every node it spawns: + +```xml + +``` + +Every layer-bringup launch file underneath inherits that namespace, and every cross-robot remap uses `/$(env ROBOT_NAME)/...` to reach back out to the absolute path. + +## Configuring a Single Robot + +The default config in `.env` already runs one robot. You almost never need to set `ROBOT_NAME` directly; instead let the resolver compute it: + +```bash +# .env +NUM_ROBOTS="1" +ROBOT_NAME_MAP_CONFIG_FILE="default_robot_name_map.yaml" +``` + +```bash +airstack up +docker exec airstack-robot-desktop-1 bash -c 'echo $ROBOT_NAME $ROS_DOMAIN_ID' +# robot_1 1 +``` + +If you need a non-default name (custom hostname scheme on a physical robot, or you want `drone_alpha` instead of `robot_1`), write a new mapping YAML in `robot/docker/robot_name_map/` and point `ROBOT_NAME_MAP_CONFIG_FILE` at it. Do **not** hardcode `ROBOT_NAME=...` in compose unless you know what you are doing — it bypasses the resolver and you lose `ROS_DOMAIN_ID` co-assignment. + +For a one-off override (e.g. ad hoc debugging): + +```bash +docker exec -e ROBOT_NAME=robot_5 -e ROS_DOMAIN_ID=5 -it airstack-robot-desktop-1 bash +``` + +## Launching Multiple Robots + +AirStack launches multiple robots as **replicas of the same container**, not as multiple namespaces inside one container. 
Look at [`robot/docker/docker-compose.yaml`](../../../robot/docker/docker-compose.yaml): + +```yaml +robot-desktop: + ... + deploy: + replicas: ${NUM_ROBOTS:-1} +``` + +So `NUM_ROBOTS=3 airstack up` produces **three** robot containers (`airstack-robot-desktop-1`, `-2`, `-3`), each with its own `ROBOT_NAME` and its own `ROS_DOMAIN_ID`. Each container runs the full autonomy stack independently. Cross-robot communication, when needed, goes through the DDS router (see [`onboard_all/config/dds_router.yaml`](../../../robot/ros_ws/src/autonomy_bringup/onboard_all/config/dds_router.yaml)) which bridges allowlisted topics from each per-robot domain into a shared GCS domain. + +```bash +NUM_ROBOTS=3 airstack up +docker ps --format '{{.Names}}' | grep robot-desktop +# airstack-robot-desktop-1 +# airstack-robot-desktop-2 +# airstack-robot-desktop-3 +``` + +The simulator side has to spawn matching vehicles — see [Sim-Side Robot Spawning](#sim-side-robot-spawning). + +### `onboard_all` vs `onboard_local_offboard_global` + +[`autonomy_bringup`](../../../robot/ros_ws/src/autonomy_bringup/) ships two layouts, selected by the `role` arg / `AUTONOMY_ROLE` env var: + +| Variant | Role values | What runs onboard | What runs offboard | When to use | +|--------|-------------|-------------------|--------------------|-------------| +| `onboard_all` | `role:=full` | interface, sensors, perception, local, **global**, behavior | nothing | Sim/dev desktop, autonomous Jetson with enough compute, single-machine deployments | +| `onboard_local_offboard_global` | `role:=onboard` (lite) + `role:=offboard` (GCS) | interface, sensors, perception, local, behavior | global planning + mapping | VOXL / lite Jetson where global planning is offloaded to a ground station; `desktop_split` profile for debugging the split | + +The split is significant for multi-robot: with `onboard_local_offboard_global`, **one offboard container is launched per robot** (also via `replicas: ${NUM_ROBOTS}`), all on 
`ROS_DOMAIN_ID=0`, and each bridges into its own per-robot onboard domain via the domain bridge config in `onboard_local_offboard_global/config/dds_router.yaml`. See [`docs/robot/autonomy_modes.md`](../../../docs/robot/autonomy_modes.md) for the profile matrix. + +## Topic and TF Namespacing + +### Topics + +Every cross-module topic must be prefixed with `/$(env ROBOT_NAME)/...`. The standard topics used across the stack (also catalogued in `AGENTS.md` under "Standard Topic Patterns"): + +| Topic Pattern | Type | Purpose | +|---|---|---| +| `/{robot_name}/odometry` | `nav_msgs/Odometry` | Robot state estimate | +| `/{robot_name}/odometry_conversion/odometry` | `nav_msgs/Odometry` | Reframed odometry into AirStack frames | +| `/{robot_name}/global_plan` | `nav_msgs/Path` | Global waypoint path | +| `/{robot_name}/trajectory_controller/trajectory_override` | `airstack_msgs/TrajectoryOverride` | Direct trajectory commands | +| `/{robot_name}/trajectory_controller/trajectory_segment_to_add` | `airstack_msgs/TrajectorySegment` | Planned segment | +| `/{robot_name}/trajectory_controller/look_ahead` | `geometry_msgs/PointStamped` | Look-ahead point | +| `/{robot_name}/interface/mavros/local_position/odom` | `nav_msgs/Odometry` | MAVROS-published odom | +| `/{robot_name}/tasks/takeoff` | `task_msgs/action/TakeoffTask` | Takeoff action server | +| `/{robot_name}/tasks/land` | `task_msgs/action/LandTask` | Landing action server | + +Pattern in launch XML — do this in **every** new module: + +```xml + + +``` + +In node code, subscribe/publish using **relative names** (e.g. `odometry`) and let the launch file remap. Never write `self.create_subscription(..., "/drone1/odometry", ...)`. + +### TF frames + +TF frames in AirStack are **also** namespaced under the robot, but the namespacing happens because TF in ROS 2 honors the publishing node's namespace. 
The top-level launch pushes `$(env ROBOT_NAME)` as namespace, so a node publishing `base_link` ends up with the resolved frame `robot_1/base_link`. + +Standard frame names you will see (per robot): + +- `{robot_name}/base_link` — body-fixed frame +- `{robot_name}/base_link_stabilized` — yaw-only-rotated body frame +- `{robot_name}/odom` — odometry origin +- `{robot_name}/look_ahead_point_stabilized` — controller look-ahead + +Two static frames are **shared** across robots: + +- `world` — global root +- `map` — global map frame, anchored at `world` + +The static `world` → `map` broadcaster is launched once per robot inside [`robot.launch.xml`](../../../robot/ros_ws/src/autonomy_bringup/launch/robot.launch.xml) — multiple robots publish the identical transform, which TF accepts as redundant. Do not rename `map` per-robot; many global planners and the GCS assume `map` is the shared global frame. + +If you write a node that hardcodes a TF frame string, prefer relative frame IDs (`base_link`, `odom`) over absolute ones — the namespace prefix gets added automatically. If you must use an absolute name, build it from the env var: + +```python +robot_name = os.environ["ROBOT_NAME"] +self.target_frame = f"{robot_name}/base_link" +``` + +## Sim-Side Robot Spawning + +Both simulators read `NUM_ROBOTS` from the environment and spawn matching vehicles named `robot_1`, `robot_2`, … so the names line up with what the resolver assigns to robot containers. + +### Microsoft AirSim (legacy) + +[`simulation/ms-airsim/config/generate_settings.py`](../../../simulation/ms-airsim/config/generate_settings.py) reads `NUM_ROBOTS` and renders [`settings.json.j2`](../../../simulation/ms-airsim/config/settings.json.j2) into AirSim's `settings.json`. 
The Jinja loop produces one `Vehicles.robot_<i>` block per robot, each with its own `TcpPort` (`4560 + i`), `ControlPortLocal` (`24541 + i`), and spawn offset (`Y = (i-1) * spawn_spacing`): + +```jinja +{% for i in range(1, num_robots + 1) %} +"robot_{{ i }}": { + "VehicleType": "PX4Multirotor", + "TcpPort": {{ 4560 + i }}, + ... + "Y": {{ (i - 1) * spawn_spacing }} +} +{% endfor %} +``` + +The `ms-airsim` container's `entrypoint.sh` (in `simulation/ms-airsim/docker/`) loops `for i in $(seq 1 "$NUM_ROBOTS")` to start one PX4 SITL instance per vehicle. AirSim binds them via the per-robot TCP ports. + +### Isaac Sim (Pegasus) + +[`simulation/isaac-sim/launch_scripts/example_multi_px4_pegasus_launch_script.py`](../../../simulation/isaac-sim/launch_scripts/example_multi_px4_pegasus_launch_script.py) reads `NUM_ROBOTS` and calls `spawn_drone(i)` in a loop. Each drone is created with `robot_name=f"robot_{index}"`, `vehicle_id=index`, `domain_id=index`, and an X offset for spacing: + +```python +NUM_ROBOTS = int(os.environ.get("NUM_ROBOTS", "1")) +... +for i in range(1, NUM_ROBOTS + 1): + spawn_drone(i) +``` + +To use the multi-drone launcher, set in `.env`: + +``` +ISAAC_SIM_SCRIPT_NAME="example_multi_px4_pegasus_launch_script.py" +``` + +(The default `example_one_px4_pegasus_launch_script.py` only spawns one.) + +### Test harness + +The `airstack_env` fixture in [`tests/conftest.py`](../../../tests/conftest.py) parametrizes tests over `(sim, num_robots, iteration)` and sets: + +```python +env_overrides = { + "NUM_ROBOTS": str(num_robots), + ... +} +``` + +Tests that act on robots iterate `n=1..num_robots` and address them as `/robot_{n}/...` directly (see `_takeoff_one_robot` in `tests/test_takeoff_hover_land.py`). The test sets `ROS_DOMAIN_ID=n` for each per-robot subprocess (`domain_id=n` in `ros2_exec(...)`), matching what the resolver assigned inside the container.
**If you write a new test that talks to a robot, follow this same `domain_id=n` + `/robot_{n}/...` pattern.** + +CLI passthrough: + +```bash +airstack test -m takeoff_hover_land --sim msairsim --num-robots 1,3 -v +``` + +## Common Pitfalls + +### 1. Hardcoding the robot name in topics + +```xml + + + + + + +``` + +```python +# WRONG +self.create_subscription(Odometry, "/robot_1/odometry", cb, 10) + +# RIGHT (let launch remap a relative name) +self.create_subscription(Odometry, "odometry", cb, 10) +``` + +### 2. Forgetting `allow_substs="true"` on YAML param files + +If your `config/*.yaml` references `$(env ROBOT_NAME)` or other substitutions, you must opt in: + +```xml + +``` + +Without `allow_substs="true"`, the substitution string is loaded literally and the node sees `frame_id: "$(env ROBOT_NAME)/base_link"` instead of `frame_id: "robot_1/base_link"`. + +### 3. Two robots sharing one ROS_DOMAIN_ID + +If two robots share a domain, every topic collides — both `/robot_1/odometry` publishers will be visible to both subscribers, and DDS will sometimes deliver crossed data. The default `robot_name_map` derives the domain from the robot index, so this only happens if you: + +- Hardcode `ROS_DOMAIN_ID` in compose to the same value for two replicas +- Use a hostname that doesn't match any rule and falls through to the catch-all (both robots get `unknown-robot`, domain `0`) + +Always verify after starting: + +```bash +for c in $(docker ps --format '{{.Names}}' | grep robot-desktop); do + echo "$c: $(docker exec $c bash -c 'echo $ROBOT_NAME $ROS_DOMAIN_ID')" +done +``` + +### 4. Running multiple `airstack up` instances without isolating domains + +Two developers on the same LAN running `airstack up` will see each other's robots if `ROS_DOMAIN_ID` happens to match. The `airstack_network` bridge in compose isolates the **container** network but DDS multicast can still leak over the host's actual network depending on the discovery config. 
If you are sharing a LAN, set a distinct `ROBOT_NAME_MAP_CONFIG_FILE` that maps to a non-overlapping domain range (e.g. one developer uses domains 1-3, another 11-13). + +### 5. Test harness defaults to both 1 and 3 robots + +`tests/conftest.py` defaults to `--num-robots 1,3`. If you wrote a test that assumes exactly one or exactly three robots, restrict the parametrization in your own test or guard with `pytest.skip(...)`. Don't rely on the harness picking your expected count. + +### 6. Forgetting to pass `NUM_ROBOTS` to the simulator container + +Both Isaac Sim and AirSim read `NUM_ROBOTS` themselves at startup. `airstack up` and the test harness propagate it for you, but if you start the simulator alone (e.g. `docker compose up isaac-sim`), the simulator will spawn 1 drone regardless of how many robot containers you started. Always set `NUM_ROBOTS` at the top-level invocation, not after the simulator is already running. + +### 7. Hardcoded TF frame `base_link` from outside the namespace + +A node running outside the robot namespace (e.g. a GCS node, or something launched from `gcs/`) cannot just look up `base_link` — it needs the full `robot_1/base_link`. Build the frame name from the robot you mean to reach: + +```python +target_frame = f"{robot_name}/base_link" +``` + +### 8. `push_ros_namespace` with an absolute remap + +This is a common foot-gun: + +```xml + + + + +``` + +Either keep the remap relative (`to="odometry"`) so it joins the namespace, or write the full path explicitly (`to="/$(env ROBOT_NAME)/odometry"`). + +### 9. Hostname doesn't match any rule on real robots + +On VOXL/Jetson with `ROBOT_NAME_SOURCE=hostname`, the device hostname must match a rule in the mapping YAML. If `hostname` returns `airlab-jetson-42` and your config only matches `robot-N`, the resolver exits non-zero and `ROBOT_NAME` is unset — the autonomy stack will then launch with empty namespaces and break in confusing ways. Either rename the device or extend the mapping config.
+ +## Pre-Merge Checklist + +Before merging a change that touches anything robot-namespaced: + +- [ ] No `/drone1/...` or `/robot_1/...` literals in any code, config, or launch file you added or modified — search with `grep -rn '/robot_[0-9]\|/drone[0-9]' <path>` +- [ ] Every cross-module topic uses `$(env ROBOT_NAME)` (in launch files) or a relative name remapped at launch time (in node code) +- [ ] Every YAML config file that references `$(env ...)` is loaded with `allow_substs="true"` +- [ ] TF frames in node code are either relative (`base_link`, `odom`) or built from `os.environ["ROBOT_NAME"]` +- [ ] If you added a new module to a layer bringup, you tested it with `NUM_ROBOTS=2` and confirmed both robots' namespaces look identical under `ros2 node list` +- [ ] If you added a sim launch script, it reads `NUM_ROBOTS` and spawns vehicles named `robot_1`, `robot_2`, … with matching `vehicle_id` / `domain_id` +- [ ] If you added a system test that addresses a robot, it loops over `range(1, num_robots + 1)` and uses `domain_id=n` in `ros2_exec(...)` +- [ ] DDS router allowlists in `onboard_all/config/dds_router.yaml` (or the split equivalent) include any new cross-domain topic your module exposes — otherwise it will not appear on the GCS +- [ ] Verified end-to-end: `NUM_ROBOTS=3 airstack up`, then `docker exec airstack-robot-desktop-2 bash -c 'ros2 topic list | grep robot_2'` shows the same topics that `airstack-robot-desktop-1` shows under `robot_1` + +## Verification Commands + +Quick checks while debugging: + +```bash +# Confirm each container resolved a distinct (ROBOT_NAME, ROS_DOMAIN_ID) +for c in $(docker ps --format '{{.Names}}' | grep robot-desktop); do + docker exec "$c" bash -c 'echo "$(hostname) -> ROBOT_NAME=$ROBOT_NAME ROS_DOMAIN_ID=$ROS_DOMAIN_ID"' +done + +# Each robot's nodes (run on the matching domain) +docker exec -e ROS_DOMAIN_ID=1 airstack-robot-desktop-1 bash -c \ + "source /opt/ros/jazzy/setup.bash && ros2 node list" +docker exec -e ROS_DOMAIN_ID=2
airstack-robot-desktop-2 bash -c \ + "source /opt/ros/jazzy/setup.bash && ros2 node list" + +# Robots talking to each other? Use the GCS domain (0) and check the router-bridged topics +docker exec airstack-gcs-1 bash -c \ + "source /opt/ros/jazzy/setup.bash && ROS_DOMAIN_ID=0 ros2 topic list | grep -E 'robot_[0-9]+'" + +# TF tree for one robot +docker exec -e ROS_DOMAIN_ID=1 airstack-robot-desktop-1 bash -c \ + "source /opt/ros/jazzy/setup.bash && ros2 run tf2_tools view_frames" +``` + +## References + +- [`docs/robot/docker/robot_identity.md`](../../../docs/robot/docker/robot_identity.md) — canonical reference for the resolution mechanism +- [`docs/robot/autonomy_modes.md`](../../../docs/robot/autonomy_modes.md) — profile matrix (`desktop`, `desktop_split`, `voxl`, `l4t`, `offboard`) +- [`robot/docker/robot_name_map/`](../../../robot/docker/robot_name_map/) — mapping YAMLs and `resolve_robot_name.py` +- [`robot/ros_ws/src/autonomy_bringup/launch/robot.launch.xml`](../../../robot/ros_ws/src/autonomy_bringup/launch/robot.launch.xml) — top-level `push_ros_namespace` +- [`robot/ros_ws/src/autonomy_bringup/onboard_all/config/dds_router.yaml`](../../../robot/ros_ws/src/autonomy_bringup/onboard_all/config/dds_router.yaml) — cross-domain allowlist pattern +- [`simulation/ms-airsim/config/generate_settings.py`](../../../simulation/ms-airsim/config/generate_settings.py) and [`settings.json.j2`](../../../simulation/ms-airsim/config/settings.json.j2) +- [`simulation/isaac-sim/launch_scripts/example_multi_px4_pegasus_launch_script.py`](../../../simulation/isaac-sim/launch_scripts/example_multi_px4_pegasus_launch_script.py) +- [`tests/conftest.py`](../../../tests/conftest.py) — `airstack_env` fixture and `--num-robots` parametrization + +## Related Skills + +- [integrate-module-into-layer](../integrate-module-into-layer) — every remap in your module must use `$(env ROBOT_NAME)` +- [write-launch-file](../write-launch-file) — patterns for env substitution and namespace pushing +- 
[test-in-simulation](../test-in-simulation) — multi-robot test scenarios +- [debug-module](../debug-module) — diagnosing topic/namespace issues diff --git a/.agents/skills/run-system-tests/SKILL.md b/.agents/skills/run-system-tests/SKILL.md new file mode 100644 index 000000000..6f6bf84ff --- /dev/null +++ b/.agents/skills/run-system-tests/SKILL.md @@ -0,0 +1,409 @@ +--- +name: run-system-tests +description: Run, interpret, and extend AirStack's pytest system test suite (build_packages, build_docker, liveliness, sensors, takeoff_hover_land), trigger runs via /pytest PR comments, and read metrics.json regression reports. Use for invoking tests, debugging failures from results.xml/metrics.json, or adding a new system test. +license: Apache-2.0 +metadata: + author: AirLab CMU + repository: AirStack +--- + +# Skill: Run AirStack System Tests + +## When to Use + +Use this skill when you need to: + +- Invoke the pytest system tests locally (via `airstack test`) or on CI (via `/pytest` PR comment or `workflow_dispatch`) +- Diagnose a failing system test — interpret `results.xml`, per-test logs, and `metrics.json` from `tests/results/<timestamp>/` +- Compare metrics against a baseline run (`parse_metrics.py --baseline`) to confirm a regression or improvement +- Add a new system test to `tests/`: pick the right mark, wire up `airstack_env` parametrization, and record metrics with `MetricsRecorder` + +This skill is about the **test harness itself** — pytest marks, fixtures, the metrics pipeline, the CI trigger, and the ephemeral runner constraints. For authoring per-module simulation scenarios (waypoint sequences, RViz checks, scene design), use the `test-in-simulation` skill instead. For diagnosing why a specific module isn't behaving correctly inside a passing harness, use `debug-module`. + +## Test Suite Overview + +The suite lives at `tests/` (repo root) and is fully pytest-based. Configuration is in `tests/pytest.ini` and shared infrastructure in `tests/conftest.py`.
Marks include `build_docker`, `build_packages`, `liveliness`, `sensors`, and `takeoff_hover_land`: + +| File | Mark | What it tests | Hardware required | +|------|------|---------------|-------------------| +| `tests/test_build_docker.py` | `build_docker` | `airstack image-build` for `robot-desktop`, `gcs`, `isaac-sim`, `ms-airsim`; records image size to `metrics.json` | Docker daemon | +| `tests/test_build_packages.py` | `build_packages` | `colcon build` (`bws`) inside the robot, GCS, and ms-airsim ROS workspaces — brought up with `AUTOLAUNCH=false` | Docker daemon | +| `tests/test_liveliness.py` | `liveliness` | Stack bring-up: containers Running, `/clock` readiness, tmux panes, sentinel ROS 2 nodes, compute, infra-only `test_stable` | Docker daemon, NVIDIA GPU + `nvidia-container-toolkit`, sim license / Omniverse creds | +| `tests/test_sensors.py` | `sensors` | Topic Hz (Isaac: batched on sim + robot; LiDAR `echo-once` + cloud sanity), RTF, `test_sensor_streams_stable` | Docker daemon, NVIDIA GPU + `nvidia-container-toolkit`, sim license / Omniverse creds | +| `tests/test_takeoff_hover_land.py` | `takeoff_hover_land` | 4-phase flight chain per `(sim, num_robots, iteration, velocity)`: `test_px4_ready` → `test_takeoff` → `test_hover` → `test_landing`. Records altitude error, overshoot, hover stability, landing accuracy, odometry drift | Docker daemon, NVIDIA GPU, sim license | + +The marks are declared in `tests/pytest.ini`. **Do not invent new marks ad-hoc** — register any new mark there or pytest will warn about unknown marks. 
+ +### Test ordering (set by `pytest_collection_modifyitems`) + +`conftest.py` enforces a deterministic global order so cheap-and-fast-failing tests surface first: + +``` +test_build_docker → test_build_packages → test_liveliness → test_sensors → test_takeoff_hover_land +``` + +Within `test_takeoff_hover_land`, items are re-sorted to `(airstack_env, velocity, phase)` so each `(sim, robots, iter)` env brings the stack up once and the drone goes ground → air → ground per velocity before pytest moves to the next velocity. + +### Isaac Sim (`sensors`): why Hz is batched and LiDAR uses `echo --once` + +[`tests/sensor_probes.py`](../../../tests/sensor_probes.py) implements the `sensors` +mark. Pegasus / OmniGraph ROS bridges and the sim→robot path can stop reporting +rates if too many `ros2 topic hz` processes run concurrently. + +- **Sim-side (Isaac):** three `parallel_sample_hz` passes — `/clock`, then both + `image_rect` topics, then both `depth_ground_truth` topics. +- **Robot-side (Isaac):** two passes — both stereo images, then both depths. + **ms-airsim** keeps a single four-topic parallel batch on the robot container. +- **Filtered LiDAR** (`PointCloud2`): uses `ros2 topic echo --once` per robot + (see `parallel_echo_once_robot_topics` in `conftest.py`), not `topic hz`. +- **Multi-drone Pegasus script:** pytest sets `ENABLE_LIDAR=true` in + `conftest.py` `SIM_CONFIG["isaacsim"]["extra_env"]` so LiDAR matches the + single-drone example (which always enables RTX LiDAR). + +User-facing write-up: [`tests/README.md`](../../../tests/README.md) (section +*Isaac Sim and the sensors mark*). + +### `build_packages` is auto-prepended in CI + +The `system-tests.yml` workflow's `Parse pytest args` step automatically prepends `build_packages` to the marks expression whenever the user specifies any marks (and `build_packages` isn't already in the expression). 
For example: + +- `/pytest -m liveliness` → effectively runs `-m "build_packages or liveliness"` +- `/pytest -m sensors` → effectively runs `-m "build_packages or sensors"` +- `/pytest -m takeoff_hover_land` → effectively runs `-m "build_packages or takeoff_hover_land"` +- `/pytest` (no marks) → pytest defaults (everything) +- `/pytest -m build_docker` → unchanged (the build_docker tests rebuild from scratch anyway) + +This guarantees that ROS 2 workspaces are built inside the containers before any launch/liveliness test tries to source them. If you intentionally want to skip `build_packages` (e.g. you trust the prebuilt images), name it in the marks expression yourself so the auto-prepend is not applied — for example `-m "liveliness and not build_packages"` — but the simpler path is to run locally where the prepend logic doesn't apply. + +When running **locally** via `airstack test` or `pytest` directly, no auto-prepend happens — you control marks exactly. + +## Running Tests Locally + +### Primary interface: `airstack test` + +`airstack test` builds a containerized test runner from `tests/docker/`, mounts the repo read-only, and forwards all args to pytest. No local Python environment is required.
+ +```bash +# Cheap build tests — no GPU needed +airstack test -m "build_docker or build_packages" -v + +# Liveliness — single sim, single robot, single iteration +airstack test -m liveliness \ + --sim msairsim \ + --num-robots 1 \ + --stress-iterations 1 \ + --stable-duration 60 \ + -v + +# Sensors (sim + robot Hz, LiDAR, RTF) — Isaac example; runs after liveliness in collection order +airstack test -m sensors \ + --sim isaacsim \ + --num-robots 1 \ + --stress-iterations 1 \ + --stable-duration 60 \ + -v + +# Takeoff/hover/land — sweep three velocities +airstack test -m takeoff_hover_land \ + --sim msairsim \ + --num-robots 1 \ + --stress-iterations 1 \ + --takeoff-velocities 0.5,1,2 \ + -v + +# Visual inspection (disables headless mode; calls xhost + automatically) +airstack test -m liveliness --gui -v +``` + +### Direct pytest (for debugger / fast iteration) + +When iterating on the test code itself, run pytest directly to skip the runner-image rebuild: + +```bash +export AIRSTACK_ROOT=$(pwd) +pip install -r tests/requirements.txt +pytest tests/ -m liveliness --sim msairsim --num-robots 1 -v +``` + +### CLI option to parametrization mapping + +The `airstack_env` fixture is parametrized over `(sim, num_robots, iteration)` tuples by `pytest_generate_tests` in `conftest.py`. 
The fixture itself is **only** activated for tests that request it, so `build_docker` and `build_packages` (which don't bring up the full stack) are not multiplied: + +| Flag | Default | Affects | Becomes | +|------|---------|---------|---------| +| `--sim` | `msairsim,isaacsim` | `airstack_env` | One env-tuple per sim | +| `--num-robots` | `1,3` | `airstack_env` | Cross-product with sim | +| `--stress-iterations` | `1` | `airstack_env` | Up/down cycles per `(sim, num_robots)` | +| `--stable-duration` | `120` | `test_liveliness::test_stable` and `test_sensors::test_sensor_streams_stable` | Total seconds polled | +| `--stable-interval` | `10` | `test_liveliness::test_stable` and `test_sensors::test_sensor_streams_stable` | Seconds between polls | +| `--gui` | off (headless) | `airstack_env` | Sets `QT_QPA_PLATFORM=offscreen` when off | +| `--takeoff-velocities` | `0.5` (current default) | `test_takeoff_hover_land` | One full 4-phase chain per velocity | + +Total parametrize cardinality for sim tests = `len(sims) × len(num_robots) × stress_iterations × len(velocities for takeoff)`. Keep this small locally — a 2×2×3×3 sweep on a workstation is several hours. + +### Prerequisites + +- Docker daemon running, your user in the `docker` group +- For `liveliness` / `sensors` / `takeoff_hover_land`: NVIDIA driver + `nvidia-container-toolkit` +- For `isaacsim`: `simulation/isaac-sim/docker/omni_pass.env` populated with Omniverse credentials (CI generates a `guest`/`guest` version automatically) +- `airstack setup` already run so `airstack` is on `PATH` +- All required compose images present locally — `airstack_env` calls `missing_images()` and fails fast otherwise. Build them first via `airstack test -m build_docker` or `airstack image-build `. + +## Running Tests via PR Comment + +The `system-tests.yml` workflow accepts three trigger paths: + +1. **PR opened** (same-repo only) — auto-runs pytest with conftest defaults. 
Fork PRs are skipped to keep arbitrary code off the self-hosted runner. +2. **`/pytest` issue comment** on a PR — only honored from users with `OWNER`, `MEMBER`, or `COLLABORATOR` author association. Fork PRs are explicitly rejected by the `Resolve PR head` step (the PR's head repo must equal `${context.repo.owner}/${context.repo.repo}`). +3. **`workflow_dispatch`** — manual run from the Actions tab with form inputs (`marks`, `sim`, `num_robots`, `stress_iterations`, `stable_duration`, `baseline_run_id`). + +### `/pytest` comment grammar + +The first line of the comment is parsed via `shlex.split` after stripping the `/pytest` prefix; subsequent lines are treated as freeform context. Examples: + +``` +/pytest -m liveliness --sim msairsim --num-robots 1 --stress-iterations 1 +``` + +``` +/pytest -m takeoff_hover_land --takeoff-velocities 0.5,1 +notes: testing the new altitude controller +``` + +``` +/pytest +``` +(no args — pytest runs its conftest defaults) + +The workflow: +1. Posts an acknowledgment PR comment showing the resolved `pytest tests/ ` command and a link to the run +2. Opens an in-progress GitHub Check Run on the PR's head SHA so the run shows up in the **Checks** tab (issue_comment events otherwise associate runs with the default branch) +3. Runs pytest on a freshly-spawned ephemeral OpenStack runner (`runs-on: [self-hosted, airstack-ephemeral]`) +4. Uploads `tests/results/` as artifact `test-results--` (90-day retention) +5. The downstream `report` job runs `parse_metrics.py` against the latest baseline artifact from the PR's base branch and posts a markdown table back as a PR comment + job summary +6. Closes the Check Run with the final conclusion + +### Why fork PRs are blocked + +The runner is GPU-equipped, has Docker root access, and is reused (briefly) across the lifetime of one job. Running arbitrary fork code on it would let a contributor exfiltrate registry creds, mine crypto, or pivot into the OpenStack tenant. 
The same-repo guard is the only line of defense and **must not be removed**. If you need to test a fork PR, mirror the branch into the upstream repo first. + +## Interpreting Results and Metrics + +### Output layout + +Every run (local or CI) produces a fresh timestamped directory under `tests/results/`: + +``` +tests/results/2025-04-21_14-30-00/ +├── summary.txt # Human-readable key metrics — open this first +├── results.xml # JUnit XML — durations + pass/fail per test +└── metrics.json # Custom metrics keyed by test_node_id → metric_key +``` + +Live output goes to the terminal during the run. Failed assertions include the +tail of the last subprocess output inline — there is no `logs/` subdirectory. + +### `metrics.json` structure + +```json +{ + "test_liveliness.TestLiveliness.test_stable[msairsim-rob#1-iter0]": { + "airstack_up_duration_s": {"value": 42.7, "unit": "s", "direction": "lower_is_better"}, + "robot.sensors.front_stereo.left.image_rect.hz_samples": { + "samples": [{"t": 10, "value": 19.27}, {"t": 20, "value": 19.31}, ...] + }, + "airstack-robot-desktop.cpu_pct_samples": {"samples": [...]} + }, + "docker.robot-desktop": { + "image_size_mb": {"value": 8421.3, "unit": "MB", "direction": "lower_is_better"} + } +} +``` + +Keys follow `test_node_id → metric_key → {value, unit, direction, ...}`. Time-series data (Hz samples, compute snapshots) lives in `*_samples` lists; `parse_metrics.py` expands these into scalar aggregates (`mean`, `min`, `max`, `start_mean`, `end_mean`). 
+ +### `parse_metrics.py`: single-run vs diff mode + +```bash +# Single-run report — markdown table, exits 0 always +python tests/parse_metrics.py --current tests/results/2025-04-21_14-30-00/ + +# Diff mode — side-by-side, exits 1 on regression +python tests/parse_metrics.py \ + --current tests/results/2025-04-21_14-30-00/ \ + --baseline tests/results/2025-04-20_09-00-00/ \ + --threshold 20 \ + --output report.md +``` + +The report has three sections per test module: + +- **Metrics** — flat scalar metrics (test, key, current, baseline, change%) +- **Sim publishing rates** — pivoted Hz aggregates per topic (`mean`, `start_mean`, `end_mean`, `min`, `max`) from the `sensors` mark (sim + robot streams) +- **Compute usage** — pivoted CPU/mem/GPU per container + +Regressions exceeding `--threshold` (default 20%) are flagged `:red_circle:`; improvements beyond threshold get `:green_circle:`. CI fails the job on any regression. + +When local-debugging a CI regression, download both artifacts (`test-results--` from the PR run and from the base branch's most recent run), unzip them under `tests/results/`, and run `parse_metrics.py` locally to see the same table the bot posted. + +## Adding a New System Test + +Follow this checklist when adding a new system test. + +### 1. Pick the right mark + +If your test... + +- Builds a Docker image → reuse `build_docker` +- Builds a colcon workspace → reuse `build_packages` +- Verifies the running stack → `liveliness` (infra); sensor topic rates / LiDAR / RTF → `sensors` +- Drives the autonomy stack to fly → reuse `takeoff_hover_land` +- Doesn't fit any of these → **register a new mark in `tests/pytest.ini`** before using it. Update the table in `tests/README.md` and the AGENTS.md "System Test Suite" table at the same time. + +### 2. 
File location and naming + +- File: `tests/test_<name>.py` — matches pytest's default test discovery (`test_*.py`) +- Class: `Test<Name>` with the mark applied at the class level: `@pytest.mark.<mark>` +- Add a class-level `@pytest.mark.timeout(<seconds>)` — long-running sim tests need it +- Imports: pull helpers from `conftest` directly (`from conftest import ...`); `tests/` is on `sys.path` because `testpaths = .` in pytest.ini + +### 3. Decide if you need `airstack_env` + +- **Need full stack up (sim + robot + GCS)?** Take `airstack_env` as a fixture argument. You'll automatically be parametrized over `(sim, num_robots, iteration)` from CLI flags — `pytest_generate_tests` in conftest activates this only for tests that name the fixture. +- **Just need one container or no containers?** Don't take `airstack_env` — bring up only what you need with `airstack_cmd("up", "<service>", env_overrides={"AUTOLAUNCH": "false"})` and tear down in a `try/finally`, the way `test_build_packages.py` does. +- **Need extra parametrization** (e.g. velocity for `takeoff_hover_land`)? Add a module-level `pytest_generate_tests(metafunc)` in your test file. Don't put it in `conftest.py` unless it applies broadly. + +### 4. Use the existing helpers + +`conftest.py` exports a deliberate API.
Prefer these over rolling your own: + +| Helper | Purpose | +|--------|---------| +| `airstack_cmd(*args, env_overrides=, timeout=, log_name=)` | Run `airstack.sh` with logging tee | +| `docker_exec(container, cmd, timeout=)` | Run shell in a container, tees to current log | +| `ros2_exec(container, ros2_cmd, domain_id=, setup_bash=)` | Run `ros2 ...` with the right workspace sourced and `ROS_DOMAIN_ID` set | +| `wait_for_container(name_pattern, timeout=)` | Block until a container is Running | +| `get_robot_containers(pattern=)` | Robot containers sorted by replica index | +| `wait_for_first_message(container, topic, ...)` | Wait for one message on a topic — returns elapsed seconds | +| `sample_hz` / `parallel_sample_hz` | Sample publish rates (parallel version is far cheaper for many topics) | +| `sample_compute_usage(sim_container)` | One-shot Docker stats + nvidia-smi snapshot | +| `read_log_tail(log_name=, lines=)` | Tail the current test's log for assertion messages | + +### 5. Record metrics + +Every numeric you'd want to track across runs goes through `MetricsRecorder`: + +```python +from conftest import current_test_id, get_metrics + +m = get_metrics() +tid = current_test_id() + +# Scalar metric (most common) +m.record(tid, "altitude_error_m", 0.27, unit="m", direction="lower_is_better") + +# For "higher is better" (Hz, success rates, RTF): +m.record(tid, "trajectory_publish_hz", 50.1, unit="Hz", direction="higher_is_better") + +# Time series — parse_metrics.py auto-derives mean/start_mean/end_mean/min/max +m.record_list(tid, "robot_1.altitude_samples", + [{"t": 10, "value": 9.8}, {"t": 20, "value": 9.9}]) +``` + +Conventions: +- **Scalar key naming**: `.` for per-container/per-robot scalars (`robot_1.altitude_error_m`, `airstack-robot-desktop-1.cpu_pct`). The reporter collapses replica suffixes (`-1`, `-2`) for cross-run comparison while keeping raw per-replica data in `metrics.json`. 
+- **Sample series naming**: `<scope>.<type>_samples` where `<type>` is one of `hz`, `cpu_pct`, `mem_mb`, `disk_io_mb`, `net_io_mb`, `gpu_pct`, `vram_mb`, `gpu_temp_c`, `gpu_power_w`, `realtime_factor`. These trigger the auto-aggregate logic in `parse_metrics.py` (see `SAMPLE_TYPES`). +- **Always pass `unit=`** so the report renders correctly. `direction=` defaults to `lower_is_better`. + +### 6. Fixture extension + +If multiple tests need the same setup, add a fixture in `conftest.py` (not in your test file) so it's available repo-wide. Mirror the `airstack_env` pattern: yield a dict, narrate via `logger_to(log)`, record any setup/teardown timing as metrics. + +## Common Pitfalls + +- **Forgetting `build_packages`**. If you run `-m liveliness` locally on a fresh checkout, the workspace inside the container is empty and sentinel nodes won't appear. Either run `-m "build_packages or liveliness"` or rely on the CI auto-prepend. +- **Mixing marks unintentionally**. `-m "liveliness or takeoff_hover_land"` brings the stack up once per selected mark's test classes (per parametrization). `-m "liveliness or sensors"` runs **both** classes for each tuple — **two** full `airstack up` / `down` cycles per `(sim, robots, iter)` because `airstack_env` is class-scoped. Combine deliberately, not by reflex. +- **Running on insufficient hardware**. `liveliness`, `sensors`, and `takeoff_hover_land` require an NVIDIA GPU plus nvidia-container-toolkit; without them the sim container won't get GPU access and topic Hz checks will time out. If you only have a CPU, scope to `-m "build_docker or build_packages"`. +- **Expecting interactive sim feedback**. `airstack_env` runs headless by default (`MS_AIRSIM_HEADLESS=true`, `ISAAC_SIM_HEADLESS=true`, `QT_QPA_PLATFORM=offscreen`). Don't add stdin prompts, GUI dialogs, or `input()` calls to test code — they will hang in CI. For local visual debugging only, pass `--gui`. +- **Not capturing metrics in a new test**.
If a test fails silently (no metric recorded) the regression report has nothing to compare. Always record at least one scalar via `MetricsRecorder` so the test shows up in `metrics.json`. +- **Letting parametrize cardinality explode**. Defaults `--sim msairsim,isaacsim --num-robots 1,3` with `--stress-iterations 3` multiply stack bring-ups for each selected mark (`liveliness`, `sensors`, `takeoff_hover_land`, …) — expensive. Override locally to a single tuple while iterating. +- **Hardcoded container names**. Always use `find_container`, `get_robot_containers`, or `wait_for_container` — replica suffixes (`-1`, `-2`, `-3`) and compose project prefixes change. +- **Asserting on stdout instead of using `read_log_tail`**. The conftest tees subprocess output to per-test log files; assertions should reference those logs (`f"airstack up failed:\n{read_log_tail()}"`) so failures attach the relevant context to the JUnit XML. +- **Trying to SSH into a CI runner mid-job**. Workers are ephemeral OpenStack VMs destroyed within ~30s of job completion. Re-running the job creates a fresh VM. For genuine debugging on the runner, see `.github/orchestrator/README.md` (also exposed at `tests/ci-cd-orchestrator.md`) — but in 99% of cases, reproduce locally with `airstack test`. +- **Forgetting to register a new mark**. Adding `@pytest.mark.my_new_mark` without updating `tests/pytest.ini` produces "PytestUnknownMarkWarning" and makes `-m my_new_mark` fail to filter as expected. 
+ +## Quick Reference + +### Common invocations + +```bash +# Cheapest possible smoke check +airstack test -m "build_docker or build_packages" -v + +# Single-config liveliness (fastest path to a real signal) +airstack test -m liveliness --sim msairsim --num-robots 1 \ + --stress-iterations 1 --stable-duration 60 -v + +# Single-config sensors (Isaac topic Hz + LiDAR; see tests/README § Isaac) +airstack test -m sensors --sim isaacsim --num-robots 1 \ + --stress-iterations 1 --stable-duration 60 -v + +# Full takeoff/hover/land sweep with three velocities +airstack test -m takeoff_hover_land --sim msairsim --num-robots 1 \ + --stress-iterations 1 --takeoff-velocities 0.5,1,2 -v + +# Local visual debug (drops headless, calls xhost +) +airstack test -m liveliness --gui -v + +# Direct pytest (debugger-friendly, no runner rebuild) +AIRSTACK_ROOT=$(pwd) pytest tests/ -m liveliness --sim msairsim -v + +# Single-run metrics report +python tests/parse_metrics.py --current tests/results// + +# Diff mode (CI behavior) +python tests/parse_metrics.py \ + --current tests/results// --baseline tests/results// \ + --threshold 20 +``` + +### `/pytest` PR comment examples + +``` +/pytest +/pytest -m liveliness --sim msairsim --num-robots 1 +/pytest -m sensors --sim isaacsim --num-robots 1 +/pytest -m takeoff_hover_land --takeoff-velocities 0.5,1 +/pytest -m "build_docker or build_packages" +``` + +(First line parsed; rest is freeform. Requires write access. Same-repo only.) + +### Mark cheat sheet + +| You want to... | Use mark expression | +|---------------|---------------------| +| Smoke-test image + workspace builds | `-m "build_docker or build_packages"` | +| Verify the stack comes up clean | `-m liveliness` | +| Verify sim + robot sensor streams (Hz, LiDAR, RTF) | `-m sensors` | +| Verify autonomy can fly the drone | `-m takeoff_hover_land` | +| Full PR validation (CI default for manual dispatch) | `-m "liveliness or takeoff_hover_land"` (CI auto-prepends `build_packages`). 
Add `or sensors` when you need topic-rate regression signal. | +| Run literally everything | omit `-m` | + +### Files to know + +- `tests/conftest.py` — fixtures, helpers, `MetricsRecorder`, ordering hooks +- `tests/pytest.ini` — mark registration, log format +- `tests/parse_metrics.py` — markdown reporter, regression diff +- `tests/README.md` — user-facing docs (CLI options, output layout, CI/CD orchestrator) +- `.github/workflows/system-tests.yml` — CI workflow with `/pytest` comment trigger +- `.github/orchestrator/README.md` — ephemeral OpenStack runner setup and SSH-debug procedure + +## References + +- `tests/README.md` — full user reference, CLI option matrix, CI/CD orchestrator architecture +- `.github/orchestrator/README.md` — ephemeral runner setup, debugging a failed job, SSH-into-worker +- `AGENTS.md` "System Test Suite" and "CI/CD" sections — top-level overview +- Related skills: + - [test-in-simulation](../test-in-simulation) — authoring per-module simulation scenarios (RViz, bag analysis, scene design) + - [debug-module](../debug-module) — diagnosing why a specific module misbehaves once the harness passes diff --git a/.agents/skills/use-airstack-cli/SKILL.md b/.agents/skills/use-airstack-cli/SKILL.md new file mode 100644 index 000000000..7a3a6ead2 --- /dev/null +++ b/.agents/skills/use-airstack-cli/SKILL.md @@ -0,0 +1,405 @@ +--- +name: use-airstack-cli +description: Operate AirStack via the airstack CLI and run commands inside containers using the non-interactive docker exec pattern. Use whenever you need to start/stop services, build the workspace, source the workspace, run ros2 commands, or inspect logs in any AirStack container. 
+license: Apache-2.0 +metadata: + author: AirLab CMU + repository: AirStack +--- + +# Skill: Use the AirStack CLI and Container Exec Patterns + +## When to Use + +Use this skill any time you need to: + +- Start, stop, or inspect AirStack services (robot, isaac-sim, ms-airsim, gcs, docs) +- Build or source the ROS 2 workspace inside a container +- Run `ros2` commands (node list, topic echo/hz, param get, launch, etc.) +- Tail or grep container logs +- Iterate on code without the autolaunch sequence taking over the container +- Run the system test suite or build the docs site + +This skill is the foundation for almost every other AirStack workflow — `debug-module`, +`test-in-simulation`, `add-ros2-package`, and `integrate-module-into-layer` all rely on +the patterns described here. + +## Why `airstack`, Not Raw `docker compose` + +Always prefer `airstack <command>` over `docker compose ...` directly: + +- Runs a **containerized** docker-compose pinned to a known version (consistent across + hosts and CI runners). +- Loads `.env`, propagates host env overrides, and applies the right include set from + the top-level `docker-compose.yaml` (isaac-sim / ms-airsim / robot / gcs / docs). +- Resolves Compose **profiles** (`desktop`, `isaac-sim`, `ms-airsim`, etc.) from + `COMPOSE_PROFILES` in `.env` automatically. +- Gives partial container-name matching for `connect` and `logs`. + +Drop to raw `docker` only for: `docker exec <container> bash -c "<command>"` (CLI does not +wrap exec), `docker logs <container>` for raw streams, and `docker ps` to discover +container names. + +## Container Lifecycle + +### One-time host setup + +```bash +# Install Docker Engine + NVIDIA Container Toolkit (skip if already installed) +airstack install + +# Configure AirStack: add `airstack` to PATH, set up shell completion, etc. +airstack setup +``` + +`airstack install` is only needed once per host (and only if Docker / nvidia-container-toolkit +are missing).
`airstack setup` is needed once per shell user; rerun if you switch shells +(bash <-> zsh) or if `~/.airstack.conf` is missing. + +### Starting services + +The most common entrypoints: + +```bash +# Start the default profile from .env (typically: desktop + isaac-sim) +airstack up + +# Start a specific service (matches docker-compose service name) +airstack up robot-desktop +airstack up isaac-sim +airstack up ms-airsim +airstack up gcs + +# Start multiple services +airstack up isaac-sim robot-desktop +``` + +### CRITICAL: `AUTOLAUNCH=false` for development + +By default, `AUTOLAUNCH="true"` in `.env`, which means a freshly started robot or sim +container immediately runs its tmuxinator launch sequence. **For development and +debugging, you almost always want this disabled** so the container starts idle and you +can iterate on launch files, rebuild packages, and start/stop nodes by hand: + +```bash +# Start the robot container without autolaunching the autonomy stack +AUTOLAUNCH=false airstack up robot-desktop + +# Combine with other overrides +AUTOLAUNCH=false NUM_ROBOTS=2 airstack up robot-desktop isaac-sim +``` + +Any variable defined in `.env` can be overridden this way (the wrapper exports each +`.env` key into the compose container). 
Common ones for agents: + +| Variable | What it controls | +|------------------------------|-----------------------------------------------------------| +| `AUTOLAUNCH` | Whether the container auto-runs the launch sequence | +| `NUM_ROBOTS` | How many robot containers spawn | +| `ROBOT_NAME` | Namespace prefix for ROS topics | +| `VERSION` | Docker image tag to use | +| `COMPOSE_PROFILES` | Which compose profiles are active | +| `ISAAC_SIM_USE_STANDALONE` | Run Isaac Sim as a standalone Python script | +| `ISAAC_SIM_SCRIPT_NAME` | Which Isaac Sim launch script to run | + +### Inspecting and stopping + +```bash +# Show all running containers (with airstack container names) +airstack status + +# Tail logs for a single container (partial name matching works) +airstack logs robot-desktop +airstack logs isaac-sim + +# Stop and remove containers (clean slate) +airstack down +airstack down robot-desktop + +# Stop, remove containers, and prune volumes/networks +airstack clean +``` + +### Container naming convention + +Compose generates names of the form `<project>-<service>-<index>`. With the default +`PROJECT_NAME="airstack"`: `airstack-robot-desktop-1`, `airstack-isaac-sim-1`, +`airstack-ms-airsim-1`, `airstack-gcs-1`, `airstack-docs-1`. With `NUM_ROBOTS=2` you +also get `airstack-robot-desktop-2`. Always confirm with `airstack status` or +`docker ps --format '{{.Names}}'` rather than guessing. + +## Running Commands Inside Containers + +### The mandatory pattern for agents + +```bash +docker exec <container> bash -c "<command>" +``` + +**Never use `docker exec -it`.** Interactive mode opens a TTY, which hangs the agent +waiting for input, leaves you in the host shell after the command exits, and can't be +captured cleanly by tool-result parsing. The non-interactive `bash -c "..."` form runs, +exits, and returns stdout/stderr.
+ +### Examples + +```bash +# Quick health check +docker exec airstack-robot-desktop-1 bash -c "ros2 node list" + +# Multiple commands chained — sws first, then a ros2 call +docker exec airstack-robot-desktop-1 bash -c "sws && ros2 topic list | grep odom" + +# Echo a topic exactly once (will exit cleanly) +docker exec airstack-robot-desktop-1 bash -c "ros2 topic echo /robot_1/odometry --once" + +# Measure a topic's rate for a few seconds, then stop +docker exec airstack-robot-desktop-1 bash -c "timeout 5 ros2 topic hz /robot_1/odometry" +``` + +The `timeout N <command>` wrapper is invaluable for any `ros2` command that would otherwise +run indefinitely (`topic echo`, `topic hz`, `bag record`, `launch`). + +### `airstack connect` is for humans only + +`airstack connect <container>` opens an interactive shell into a container. Agents must not +call it — it opens a TTY and hangs the tool call. Use `docker exec ... bash -c "..."` instead. + +## Building and Sourcing + +Inside every robot/desktop container, two aliases are pre-installed: + +| Alias | Expands to | Purpose | +|-------|--------------------------------------------|----------------------------------| +| `bws` | `colcon build` with the AirStack flag set | Build the ROS 2 workspace | +| `sws` | `source install/setup.bash` | Source the workspace overlay | + +### Build the whole workspace + +```bash +docker exec airstack-robot-desktop-1 bash -c "bws" +``` + +### Build a single package (the common case during iteration) + +```bash +docker exec airstack-robot-desktop-1 bash -c "bws --packages-select my_package" +``` + +### Build with debug symbols (for GDB / valgrind) + +```bash +docker exec airstack-robot-desktop-1 bash -c \ + "bws --packages-select my_package --cmake-args '-DCMAKE_BUILD_TYPE=Debug'" +``` + +### Source the workspace before running ros2 + +`bws` builds, but the new install tree is **not** automatically on the path of a fresh +`docker exec`.
Always chain `sws &&` before any `ros2 run` or `ros2 launch`: + +```bash +docker exec airstack-robot-desktop-1 bash -c "sws && ros2 launch my_package my_launch.xml" +docker exec airstack-robot-desktop-1 bash -c "sws && ros2 run my_package my_node" +``` + +### Iteration loop after editing C++ code + +```bash +# Edit on host (code is bind-mounted into the container), then rebuild + relaunch: +docker exec airstack-robot-desktop-1 bash -c "bws --packages-select my_package" +docker exec airstack-robot-desktop-1 bash -c "sws && ros2 run my_package my_node" +``` + +For Python-only changes, run `bws` once after creating the package (to install the +entry point); later edits are picked up live without rebuilding. + +## Log Inspection and Debugging + +### CLI logs (recommended) + +```bash +# Tail logs (partial name matching) +airstack logs robot-desktop +airstack logs isaac-sim +``` + +### Raw docker logs (for grep, redirection, last-N lines) + +```bash +# Last 100 lines +docker logs --tail 100 airstack-robot-desktop-1 + +# Stream and grep for errors +docker logs -f airstack-robot-desktop-1 2>&1 | grep -iE "error|fail|crash" + +# Save full log for offline analysis +docker logs airstack-robot-desktop-1 > /tmp/robot.log 2>&1 +``` + +### Per-node logs inside the container + +ROS 2 writes per-node logs under `~/.ros/log/`: + +```bash +docker exec airstack-robot-desktop-1 bash -c "ls -la ~/.ros/log/latest/" +docker exec airstack-robot-desktop-1 bash -c "tail -100 ~/.ros/log/latest/.log" +``` + +### Live container resource usage + +```bash +docker stats --no-stream airstack-robot-desktop-1 +``` + +## Other Useful Subcommands + +### Documentation + +```bash +# Build and serve the MkDocs site +airstack docs +``` + +### Tests + +The system test suite (pytest, runs against the full Docker stack) is invoked through: + +```bash +airstack test -m "build_docker or build_packages" -v +airstack test -m liveliness --sim msairsim --num-robots 1 -v +airstack test -m sensors --sim isaacsim 
--num-robots 1 -v # topic Hz + LiDAR (after liveliness if both selected) +airstack test -m takeoff_hover_land --sim msairsim --takeoff-velocities 0.5,1,2 -v +``` + +For full details on **pytest system tests** (fixtures, marks, `liveliness` vs +`sensors`, Isaac Hz batching, metrics, `/pytest` CI), see the +[`run-system-tests`](../run-system-tests/SKILL.md) skill and +[`tests/README.md`](../../../tests/README.md). For **authoring and running +scenarios inside Isaac Sim or AirSim** (missions, RViz checks, scene tweaks), see +the [`test-in-simulation`](../test-in-simulation/SKILL.md) skill. + +### Lint and format + +```bash +airstack lint +airstack format +``` + +### Image management + +```bash +airstack images # List AirStack images +airstack image-build # Build images locally +airstack image-push # Push to registry +airstack image-pull # Pull from registry +airstack image-delete # Remove all matching images +``` + +### Configuration helpers + +```bash +airstack config # Run all config steps +airstack config:isaac-sim # Configure Isaac Sim cache/settings +airstack config:nucleus # Configure Omniverse Nucleus credentials +airstack config:git-hooks # Install git pre-commit hooks +``` + +## Common Pitfalls + +1. **Forgetting `AUTOLAUNCH=false` during development** + - Symptom: container starts the full autonomy stack, ports are taken, you cannot + iterate on launch files. + - Fix: bring it down with `airstack down`, then `AUTOLAUNCH=false airstack up <service>`. + +2. **Using `docker exec -it` from an agent** + - Symptom: the tool call hangs until timeout. + - Fix: always use `docker exec <container> bash -c "<command>"`. Add + `timeout N` in front of any long-running ROS 2 command. + +3. **Running `ros2` on the host** + - Symptom: `command not found` or a stale system ROS install responds. + - Fix: every `ros2 ...` invocation must be wrapped in + `docker exec airstack-robot-desktop-1 bash -c "..."`. The host has no AirStack overlay. + +4.
**Forgetting to `sws` after `bws`** + - Symptom: `Package 'my_package' not found` or `ros2 launch` reports the launch file + missing even though the build succeeded. + - Fix: chain them: `bash -c "sws && ros2 launch my_package my_launch.xml"`. Each + `docker exec` is a fresh shell — you must source every time. + +5. **Forgetting to `bws` after editing C++** + - Symptom: behavior is unchanged after code edits; old binary still runs. + - Fix: `bws --packages-select my_package` before relaunching the node. + +6. **Guessing container names** + - Symptom: `Error: No such container: airstack-robot-1` (the actual name is + `airstack-robot-desktop-1`). + - Fix: list them with `airstack status` or + `docker ps --format '{{.Names}}'` first. + +7. **Setting overrides inside the wrong shell context** + - Symptom: `NUM_ROBOTS=2` is assigned without `export`, or in a sub-shell that exits + before `airstack up` runs — the value is lost. + - Fix: prefix the variable on the same line as `airstack up`, or `export` it first. + +8. **Using `docker compose` directly instead of `airstack up`** + - Symptom: missing profiles, missing env vars, mismatched compose-plugin version. + - Fix: always go through `airstack up`, which mounts the right project directory + and uses the pinned compose binary.
+ +## Quick Reference Cheatsheet + +```bash +# ---- Lifecycle ---- +airstack install # Install Docker + nvidia-container-toolkit (one time) +airstack setup # Add airstack to PATH (one time per shell) +airstack up # Start default profile from .env +airstack up robot-desktop # Start one service +AUTOLAUNCH=false airstack up robot-desktop # Start idle (for development) — IMPORTANT +NUM_ROBOTS=2 AUTOLAUNCH=false airstack up # Multi-robot, idle +airstack status # List running containers +airstack down # Stop and remove containers +airstack clean # Stop, remove containers, prune volumes/networks +airstack logs robot-desktop # Tail logs (partial name OK) + +# ---- Exec inside container (NEVER use -it) ---- +docker exec airstack-robot-desktop-1 bash -c "ros2 node list" +docker exec airstack-robot-desktop-1 bash -c "sws && ros2 topic list" +docker exec airstack-robot-desktop-1 bash -c "timeout 5 ros2 topic hz /robot_1/odometry" +docker exec airstack-robot-desktop-1 bash -c "ros2 topic echo /robot_1/odometry --once" + +# ---- Build & source ---- +docker exec airstack-robot-desktop-1 bash -c "bws" +docker exec airstack-robot-desktop-1 bash -c "bws --packages-select my_package" +docker exec airstack-robot-desktop-1 bash -c "bws --packages-select my_package --cmake-args '-DCMAKE_BUILD_TYPE=Debug'" +docker exec airstack-robot-desktop-1 bash -c "sws && ros2 launch my_package my_launch.xml" + +# ---- Logs ---- +docker logs --tail 100 airstack-robot-desktop-1 +docker logs -f airstack-robot-desktop-1 2>&1 | grep -iE "error|fail" + +# ---- Other ---- +airstack docs # Build + serve MkDocs +airstack test -m liveliness -v # Stack infra tests +airstack test -m sensors -v # Sensor topic + LiDAR tests (Isaac batching — see tests/README) +airstack lint # Lint +airstack format # Format +``` + +## References + +- [`AGENTS.md`](../../../AGENTS.md) — sections "AirStack CLI Tool" and + "Docker Development Workflow" +- [`.airstack/README.md`](../../../.airstack/README.md) — full CLI 
documentation, + including module extension and troubleshooting +- [`.env`](../../../.env) — every variable that can be overridden on the + `airstack up` command line +- [`docker-compose.yaml`](../../../docker-compose.yaml) — top-level compose file + showing which sub-compose files are included +- **Related Skills:** + - [debug-module](../debug-module) — uses these exec patterns for diagnostics + - [test-in-simulation](../test-in-simulation) — uses `airstack up` and exec patterns + - [add-ros2-package](../add-ros2-package) — uses `bws --packages-select` for builds + - [integrate-module-into-layer](../integrate-module-into-layer) — uses + `AUTOLAUNCH=false` to verify launch file changes diff --git a/.agents/skills/visualize-in-foxglove/SKILL.md b/.agents/skills/visualize-in-foxglove/SKILL.md new file mode 100644 index 000000000..37071426a --- /dev/null +++ b/.agents/skills/visualize-in-foxglove/SKILL.md @@ -0,0 +1,178 @@ +--- +name: visualize-in-foxglove +description: Add visualization of a ROS 2 topic to Foxglove/GCS. Use when you want a new topic (path, markers, odometry, etc.) to appear in the Foxglove dashboard on the GCS. Covers DDS router bridging, foxglove_visualizer_node integration, and coordinate frame translation. +license: Apache-2.0 +metadata: + author: AirLab CMU + repository: AirStack +--- + +# Skill: Visualize a Topic in Foxglove / GCS + +## When to Use + +You want a topic published by a robot container to be visible in the Foxglove dashboard +running in the GCS container. + +## Architecture Overview + +``` +Robot container (domain: ROS_DOMAIN_ID) + └─ publishes topics + +DDS Router (onboard_all) + └─ bridges allowlisted topics to GCS domain + +GCS container (domain: 0) + ├─ Foxglove bridge → streams to browser + └─ foxglove_visualizer_node → transforms & republishes as /gcs/robot_markers MarkerArray +``` + +**Key insight:** A topic must appear in the DDS router allowlist AND be subscribed to +in the GCS before it will appear in Foxglove. 
Missing either step = nothing shows up. + +--- + +## Step 1 — Bridge the Topic in DDS Router + +**File:** `robot/ros_ws/src/autonomy_bringup/onboard_all/config/dds_router.yaml` + +Add an entry to the `allowlist` for every topic you want on the GCS: + +```yaml +allowlist: + - name: "rt/$(env ROBOT_NAME)/your/topic/here" +``` + +**Rules:** +- All ROS 2 topics must be prefixed with `rt/` (ROS Topic). +- Services use `rq/` (request) and `rr/` (reply). +- The router runs per-robot (one instance per robot container), so `$(env ROBOT_NAME)` + expands to `robot_1`, `robot_2`, etc. automatically. +- Topics are bidirectional by default. +- Without this entry the topic simply does not cross domain boundaries — the GCS node + will never see it regardless of how it subscribes. + +After editing this file, **restart the robot containers** for the change to take effect. + +--- + +## Step 2 — Subscribe and Visualize on the GCS + +There are two paths depending on what you want to display: + +### Path A — Display the raw topic directly in Foxglove + +If the topic message type is natively supported by Foxglove (e.g. `nav_msgs/Path`, +`sensor_msgs/PointCloud2`, `visualization_msgs/MarkerArray`), just bridge it and add +a panel in Foxglove pointing at the topic. No extra GCS code needed. + +**Limitation:** The topic arrives in the robot's local odom frame (`map` frame origin = +drone boot position). If you need it georeferenced (aligned with GPS/ENU), you must +translate it — see Path B. + +### Path B — Translate and republish via foxglove_visualizer_node + +**File:** `gcs/ros_ws/src/gcs_visualizer/gcs_visualizer/foxglove_visualizer_node.py` + +This node auto-discovers robot topics, applies a GPS boot offset to convert from the +robot's local odom frame to ENU (map frame), and republishes everything as a single +`/gcs/robot_markers` MarkerArray. 
+ +Sibling visualizer nodes in the same package, useful as additional reference points: +`payload_visualizer_node.py` (gossip payload rendering), `polygon_collector_node.py`, +`waypoint_collector_node.py`. Shared frame/color helpers live in `gcs_utils.py`. + +**Coordinate frame context:** +- Robot odometry uses a local `map` frame whose origin is the drone's position at boot. +- GPS coordinates are converted to ENU relative to `ORIGIN_LAT/LON/ALT` (Lisbon by default). +- `_gps_boot[robot_name]` = ENU position of the odom origin = offset to add to all + odom-frame coordinates. +- Trajectory and global plan markers are in odom frame → add boot offset to `points`. +- Do NOT also offset `pose.position` for LINE_STRIP/ARROW markers — their points are + already in the header frame; double-offsetting the pose causes wrong positions. + +**To add a new topic type, follow this pattern (shown for `nav_msgs/Path`):** + +1. **Add suffix constant and regex pattern:** +```python +PLAN_SUFFIX = '/global_plan' +self._plan_pattern = re.compile(rf'^/({re.escape(self._prefix)}_\w+){re.escape(PLAN_SUFFIX)}$') +``` + +2. **Add state dicts and subscribed set:** +```python +self._global_plans = {} # robot_name -> latest nav_msgs/Path +self._subscribed_plan = set() +``` + +3. **Discover and subscribe in `_discover_robots`:** +```python +if topic not in self._subscribed_plan: + m = self._plan_pattern.match(topic) + if m and 'nav_msgs/msg/Path' in type_list: + name = m.group(1) + self.create_subscription( + Path, topic, + lambda msg, n=name: self._plan_callback(msg, n), + 10, # use default RELIABLE QoS for planning topics + # use SENSOR_QOS for high-rate sensor streams + ) + self._subscribed_plan.add(topic) +``` + +4. **Add callback:** +```python +def _plan_callback(self, msg: Path, robot_name: str): + self._global_plans[robot_name] = msg +``` + +5. 
**Render in `_publish_markers` (skip silently if not yet received):** +```python +plan = self._global_plans.get(robot_name) +if plan is not None and boot is not None: + line = Marker() + line.header.frame_id = 'map' + line.ns = f'{robot_name}_global_plan' + line.type = Marker.LINE_STRIP + ... + for pose_stamped in plan.poses: + p = pose_stamped.pose.position + line.points.append(Point(x=p.x + bx, y=p.y + by, z=p.z + bz)) + array.markers.append(line) +``` + +**QoS guidance:** +- High-rate sensor/visualization streams (odom, trajectory_vis): use `SENSOR_QOS` (BEST_EFFORT) +- Infrequently-published planning topics (global_plan): use `10` (default RELIABLE) + +--- + +## Step 3 — Verify + +```bash +# Check the topic is being published on the robot side (before the DDS router) +docker exec airstack-robot-desktop-1 bash -c "ros2 topic echo /robot_1/your_topic --once" + +# Check the topic crossed the DDS router and is received on the GCS +docker exec airstack-gcs-1 bash -c "ros2 topic echo /robot_1/your_topic --once" + +# Check GCS node subscribed (look for log line) +docker logs airstack-gcs-1 2>&1 | grep "Subscribed to" + +# Check the combined marker output +docker exec airstack-gcs-1 bash -c "ros2 topic echo /gcs/robot_markers --once" +``` + +--- + +## Common Pitfalls + +| Symptom | Cause | Fix | +|---------|-------|-----| +| Topic visible on robot, not on GCS | Not in dds_router allowlist | Add `rt/$(env ROBOT_NAME)/topic` to allowlist | +| Topic on GCS but not in Foxglove | Not subscribed in foxglove_visualizer_node or Foxglove panel missing | Add subscription or add panel | +| Marker appears at wrong position | Missing boot GPS offset | Apply `bx, by, bz` from `_gps_boot` to all points | +| Marker double-offset | Added boot to both `pose.position` AND `points` | Only offset `points` for LINE_STRIP/ARROW markers | +| Planning topic missed after late publish | Using BEST_EFFORT QoS | Use `10` (RELIABLE) for planning topics | +| New robot not discovered | Topic appeared before discovery timer fired | Discovery runs every 5s; wait or trigger
manually | diff --git a/.agents/skills/write-isaac-sim-scene/SKILL.md b/.agents/skills/write-isaac-sim-scene/SKILL.md index 47c93ff4c..2f46c75d3 100644 --- a/.agents/skills/write-isaac-sim-scene/SKILL.md +++ b/.agents/skills/write-isaac-sim-scene/SKILL.md @@ -102,7 +102,7 @@ from pegasus.simulator.params import SIMULATION_ENVIRONMENTS, ROBOTS from pegasus.simulator.logic.interface.pegasus_interface import PegasusInterface from pegasus.simulator.ogn.api.spawn_multirotor import spawn_px4_multirotor_node from pegasus.simulator.ogn.api.spawn_zed_camera import add_zed_stereo_camera_subgraph -from pegasus.simulator.ogn.api.spawn_ouster_lidar import add_ouster_lidar_subgraph +from pegasus.simulator.ogn.api.spawn_rtx_lidar import add_rtx_lidar_subgraph from pegasus.simulator.logic.vehicles.multirotor import Multirotor, MultirotorConfig from pegasus.simulator.logic.state import State from pegasus.simulator.logic.backends.px4_mavlink_backend import ( @@ -339,9 +339,10 @@ class YourSceneApp: # Add sensors if sensors.get("camera", False): self._add_camera_sensor(vehicle) - - if sensors.get("lidar", False): - self._add_lidar_sensor(vehicle) + + # RTX LiDAR uses OmniGraph: spawn_px4_multirotor_node() returns graph_handle, + # then call self._add_lidar_sensor(vehicle, graph_handle). See + # example_one_px4_pegasus_launch_script.py for the full pattern. 
# Initialize vehicle in world self.world.scene.add(vehicle) @@ -362,16 +363,16 @@ class YourSceneApp: } ) - def _add_lidar_sensor(self, vehicle): - """Add LiDAR sensor to vehicle.""" - add_ouster_lidar_subgraph( - lidar_prim_path=vehicle.prim_path + "/OusterLidar", - parent_prim_path=vehicle.prim_path, - config={ - "graph_evaluator": "execution", - "position": (0.0, 0.0, -0.15), # Relative to vehicle - "orientation": (0.0, 0.0, 0.0, 1.0), - } + def _add_lidar_sensor(self, vehicle, graph_handle): + """Add RTX LiDAR (OmniGraph subgraph) to vehicle.""" + add_rtx_lidar_subgraph( + parent_graph_handle=graph_handle, + drone_prim=vehicle.prim_path, + robot_name="robot_1", + lidar_config="ouster_os1", + lidar_offset=[0.0, 0.0, 0.025], + lidar_rotation_offset=[0.0, 0.0, 0.0], + min_range=0.75, ) def run(self): @@ -520,7 +521,7 @@ Four reusable helpers that cover the most common environment setup tasks. Import | Function | When to use | |----------|-------------| -| `scale_stage_prim(stage, prim_path, scale)` | Nucleus assets authored in centimetres need `STAGE_SCALE=0.01`; assets already in metres use `1.0`. | +| `scale_stage_prim(stage, prim_path, scale)` | Nucleus assets authored in centimeters need `STAGE_SCALE=0.01`; assets already in meters use `1.0`. | | `add_colliders(stage_prim)` | **Must** be called for physics to interact with environment meshes. Without it drones fall through the floor. Call after scaling. | | `add_dome_light(stage, **kwargs)` | Adds uniform hemisphere lighting. Defaults: `intensity=3500`, `exposure=-3`. Pass kwargs to override, e.g. `add_dome_light(stage, intensity=5000)`. | | `save_scene_as_contained_usd(src_url, output_dir)` | Copies a Nucleus-hosted stage (and all its textures/MDLs) to a local directory using `omni.kit.usd.collect.Collector`. Useful for archiving or offline replay. 
| diff --git a/.agents/skills/write-launch-file/SKILL.md b/.agents/skills/write-launch-file/SKILL.md new file mode 100644 index 000000000..5c80bc001 --- /dev/null +++ b/.agents/skills/write-launch-file/SKILL.md @@ -0,0 +1,383 @@ +--- +name: write-launch-file +description: Author a ROS 2 launch file for AirStack with the correct conventions. Use when creating or editing any .launch.xml/.launch.py for the robot autonomy stack — covers ROBOT_NAME namespacing, topic remapping, allow_substs parameter loading, conditional launch, and layer bringup composition. +license: Apache-2.0 +metadata: + author: AirLab CMU + repository: AirStack +--- + +# Skill: Write a ROS 2 Launch File for AirStack + +## When to Use + +- Creating a launch file for a brand-new module package (paired with [add-ros2-package](../add-ros2-package)) +- Editing a layer's bringup launch file to wire in a new module (paired with [integrate-module-into-layer](../integrate-module-into-layer)) +- Adding optional/conditional behavior (e.g. `enable_rviz`, `use_alt_planner`) to an existing launch file +- Composing multiple module launch files into a higher-level launch +- Any time a module's topic names, parameters, or remappings need to change + +If you only need to tweak runtime parameters and the launch file already exists with sensible launch arguments, override via `` rather than editing the file. + +## Core Conventions (Read First) + +These are non-negotiable in AirStack: + +1. **Every robot-side topic is under `/$(env ROBOT_NAME)/...`.** Never hardcode `/drone1`, `/robot`, etc. Multi-robot scenarios depend on this. +2. **Module nodes use *relative* topic names internally** (e.g. `odometry`, `global_plan`). The launch file **remaps** them to the correct global topic. +3. **YAML config files load with `allow_substs="true"`.** Without it, `$(env ...)` and `$(var ...)` inside the YAML are not expanded. +4. **XML is the AirStack default.** Use `*.launch.xml`. 
Use Python (`*.launch.py`) only when you need real control flow (loops, conditional graph construction, computed values). +5. **Launch files are installed at build time.** After editing, you must rebuild the package (`bws --packages-select `) before the change takes effect. +6. **ROS 2 does NOT scope launch arguments.** A `` declared in a child launch file leaks to the parent. Prefix args with the layer/module name (e.g. `local_odometry_in_topic`) — see the warning at the top of `local.launch.xml`. + +## XML vs. Python Launch Files + +| Use XML when… | Use Python when… | +|---------------|------------------| +| Launching a fixed set of nodes | You need to loop (e.g. spawn N robots from a list) | +| Static remappings, parameter files, conditional groups | You need to compute values at launch time | +| Including other launch files | You need conditional logic that XML's `if`/`unless` cannot express | +| You want a config that is easy to diff and read | You need to read JSON/YAML and generate nodes from it | + +Almost every module launch file and every layer bringup launch file in `robot/ros_ws/src/` is XML. Defaulting to XML keeps the codebase consistent and reviewable. If you find yourself reaching for Python, first check whether environment variables + `` + `` can express the same thing. + +## Steps (Module Launch File) + +This is the typical "I just made a new package" workflow. The result is a single `/launch/.launch.xml`. + +### 1. Locate the Launch Directory + +``` +robot/ros_ws/src////launch/.launch.xml +``` + +Make sure your `CMakeLists.txt` (C++) or `setup.py` (Python) installs the `launch/` directory — see [add-ros2-package](../add-ros2-package). A launch file that is not installed does not exist as far as `ros2 launch` is concerned. + +### 2. Declare Launch Arguments + +Put every topic name, every config path, and every toggleable feature behind an `` with a sensible default. 
This is what lets the layer bringup file remap your module without editing your launch file. + +```xml + + + + + + + + + + + +``` + +### 3. Launch the Node with Param + Remap + +```xml + + + + + + + + + + +``` + +The `` direction always reads as: "the topic the node calls **X** in its source code should resolve to **Y** at runtime." + +### 4. (Optional) Add Conditional Sub-Components + +Anything that should only sometimes run goes in a ``: + +```xml + + + +``` + +Use `` for the inverse. For mutually exclusive alternatives, pair an `if` group with an `unless` group on the same arg. + +### 5. (Optional) Push a Sub-Namespace + +If your module spawns several supporting nodes, group them under a namespace so all of their topics get a clean prefix: + +```xml + + + ... + ... + +``` + +This is exactly the pattern `local.launch.xml` uses for the `droan` group and `px4_interface.launch.xml` uses for the `fmu` group. + +## Steps (Layer Bringup Launch File) + +A layer bringup file (e.g. `local_bringup/launch/local.launch.xml`) is a *composition* — it does not start a single node, it starts every node the layer needs and wires them together with shared topic args. + +### 1. Define Shared Topic Args at the Top + +Use prefixed argument names so they cannot collide with sibling layers' args: + +```xml + + + + + +``` + +### 2. For Each Module: Add a `` (or ``) Block + +Two valid styles: + +**Style A — direct ``** (when the module's launch file is small or you need to override most of its params): + +```xml + + + + + + +``` + +**Style B — `` the module's own launch file** (preferred when the module already exposes good launch args): + +```xml + + + +``` + +Style B keeps wiring concerns in the bringup file and parameter concerns in the module — that's the goal. + +### 3. 
Wire Cross-Module Topics Using AirStack Standard Names + +Use the canonical names from AGENTS.md → "Standard Topic Patterns": + +| Topic | Where it comes from | Where it goes to | +|-------|---------------------|------------------| +| `/$(env ROBOT_NAME)/odometry_conversion/odometry` | `px4_interface` (or other interface) | every consumer (planners, controllers) | +| `/$(env ROBOT_NAME)/global_plan` | global planner | local planner | +| `/$(env ROBOT_NAME)/trajectory_controller/trajectory_segment_to_add` | local planner | trajectory controller | +| `/$(env ROBOT_NAME)/trajectory_controller/look_ahead` | trajectory controller | local planner | +| `/$(env ROBOT_NAME)/trajectory_controller/tracking_point` | trajectory controller | controllers, action servers | +| `/$(env ROBOT_NAME)/trajectory_controller/trajectory_override` | takeoff/land/fixed-traj action servers | trajectory controller | +| `/$(env ROBOT_NAME)/tasks/` | behavior tree / GCS | task executor action server | + +### 4. Add the Module to the Bringup `package.xml` + +`ros2 launch` does not need this, but `colcon build --packages-up-to ` and the install dependency tracking do. Add `my_planner` to `/package.xml`. See [integrate-module-into-layer](../integrate-module-into-layer) step 5. + +### 5. Rebuild + +```bash +docker exec airstack-robot-desktop-1 bash -c "bws --packages-select my_planner" +``` + +## Loading YAML Config Files + +```xml + +``` + +`allow_substs="true"` enables `$(env VAR)` and `$(var arg)` substitution **inside the YAML file itself**. Without it, a YAML line like: + +```yaml +/**: + ros__parameters: + frame_id: $(env ROBOT_NAME)/base_link +``` + +…will load the literal string `"$(env ROBOT_NAME)/base_link"` and downstream code will fail in confusing ways. 
**When in doubt, set `allow_substs="true"` — there is no downside.** + +YAML structure for ROS 2 parameters (this is what your `config/.yaml` should look like): + +```yaml +/**: + ros__parameters: + update_rate: 10.0 + target_frame: map + enable_visualization: true +``` + +The `/**:` wildcard matches any node name, which is the most portable form when the launch file may rename the node. + +## Including Child Launch Files + +```xml + + + + +``` + +Notes: +- Use `$(find-pkg-share )` to locate launch files — never hardcode absolute paths. +- `` runs the child launch file in the parent's argument scope (remember: ROS 2 does NOT scope args). Set every argument you depend on explicitly. +- You can include `.launch.xml` from a `.launch.xml`, and either format from a `.launch.py`. + +## Conditional Launch Patterns + +### Toggle a node on/off + +```xml + + + + + +``` + +### Pick one of two implementations + +```xml + + + + ... + + + ... + +``` + +### Read a default from an env var + +```xml + +``` + +The second positional value to `$(env VAR default)` is the fallback when the env var is unset. + +## Common Pitfalls + +This list also appears in AGENTS.md → "Critical Pitfalls #5"; this is the expanded version. + +- **Hardcoded topic names in node code.** + - Bad: `create_subscription("/drone1/odometry", ...)` + - Good: `create_subscription("odometry", ...)` and remap in the launch file. +- **Hardcoded topic prefixes in launch files.** + - Bad: `` + - Good: `` +- **Forgetting `allow_substs="true"` on a ``.** YAML substitutions silently fail to expand. Symptom: parameters look like literal `$(env ROBOT_NAME)` strings at runtime. +- **`ROBOT_NAME` env var not set.** All `$(env ROBOT_NAME)` substitutions become an empty string, producing topic names like `//odometry`. `ROBOT_NAME` is normally resolved at container shell startup by `robot/docker/.bashrc` via `robot/docker/robot_name_map/resolve_robot_name.py` — see [configure-multi-robot](../configure-multi-robot). 
For ad-hoc overrides, pass `-e ROBOT_NAME=robot_X` to `docker exec`. +- **`` name collisions across layers.** Two child launch files that both define `` will silently fight. Always prefix: `local_odometry_in_topic`, `global_odometry_in_topic`, etc. +- **Missing `install(DIRECTORY launch …)` in CMakeLists.txt.** The launch file builds fine but `ros2 launch` cannot find it. Run `ls install//share//launch/` to verify after build. +- **Editing a launch file but not rebuilding.** Launch files are *installed* by `colcon build`. The source `launch/` directory is NOT what `ros2 launch` reads. Always rebuild after editing. +- **Forgetting `output="screen"`.** The node runs but its logs go to a file you have to hunt for. Use `output="screen"` on every node during development. +- **Wrong `` direction.** `from` is what the *node's source code* calls the topic; `to` is what it should resolve to at runtime. Swapping these is one of the most common silent failures. +- **Using `~/topic` without understanding it.** `~/foo` resolves to `/foo`. Useful for action server private namespaces (see `random_walk` and `takeoff_landing_planner` examples) but confusing if you don't expect it. +- **Pushing a namespace and then absolute-path remapping anyway.** `` only affects *relative* topic names. `` ignores the pushed namespace entirely. That's usually what you want, but be aware of the interaction. + +## Verification + +After writing or editing the launch file, run through this checklist: + +```bash +# 1. Rebuild +docker exec airstack-robot-desktop-1 bash -c "bws --packages-select " + +# 2. Confirm the file was installed +docker exec airstack-robot-desktop-1 bash -c "ls install//share//launch/" + +# 3. Dry-run launch to catch XML/syntax errors +docker exec airstack-robot-desktop-1 bash -c "sws && ros2 launch .launch.xml --print" + +# 4. Actually launch and watch logs +docker exec airstack-robot-desktop-1 bash -c "sws && ros2 launch .launch.xml" + +# 5. 
Verify the node and its topics resolved correctly +docker exec airstack-robot-desktop-1 bash -c "ros2 node list" +docker exec airstack-robot-desktop-1 bash -c "ros2 node info ///" +docker exec airstack-robot-desktop-1 bash -c "ros2 topic info //" +``` + +If `ros2 node info` shows a node subscribing to `/odometry` instead of `//odometry_conversion/odometry`, your remap is wrong or `ROBOT_NAME` is unset. + +## Skeleton Template + +Copy-paste this into `robot/ros_ws/src////launch/.launch.xml` and replace the `MY_*` / `my_*` tokens. + +```xml + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +``` + +## References + +- **Real launch files to study:** + - Module: `robot/ros_ws/src/interface/px4_interface/launch/px4_interface.launch.xml` + - Module: `robot/ros_ws/src/local/planners/droan_local_planner/launch/droan_local_planner.launch.xml` + - Layer bringup (global): `robot/ros_ws/src/global/global_bringup/launch/global.launch.xml` + - Layer bringup (local): `robot/ros_ws/src/local/local_bringup/launch/local.launch.xml` + - Template: `.agents/skills/add-ros2-package/assets/package_template/launch/template.launch.xml` + +- **ROS 2 docs:** + - [ROS 2 Launch Tutorials](https://docs.ros.org/en/jazzy/Tutorials/Intermediate/Launch/Launch-Main.html) + - [Using ROS 2 Launch For Large Projects](https://docs.ros.org/en/jazzy/Tutorials/Intermediate/Launch/Using-ROS2-Launch-For-Large-Projects.html) + - [Launch XML format reference](https://design.ros2.org/articles/roslaunch_xml.html) + +- **Related skills:** + - [add-ros2-package](../add-ros2-package) — full package creation flow that includes a launch file + - [integrate-module-into-layer](../integrate-module-into-layer) — wiring your launch file into a layer bringup + - [debug-module](../debug-module) — diagnosing launch and topic issues + - [test-in-simulation](../test-in-simulation) — verifying the launched stack end-to-end diff --git a/.agents/skills/write-mkdocs-documentation/SKILL.md 
b/.agents/skills/write-mkdocs-documentation/SKILL.md index 4fd7844b4..b46f5fde1 100644 --- a/.agents/skills/write-mkdocs-documentation/SKILL.md +++ b/.agents/skills/write-mkdocs-documentation/SKILL.md @@ -528,7 +528,7 @@ nav: - docs/development/index.md - Beginner Tutorials: - docs/development/beginner/key_concepts.md - - docs/development/beginner/environment_setup.md + - docs/development/beginner/development_environment.md - Intermediate Tutorials: - docs/development/intermediate/testing/index.md ``` diff --git a/.airstack/modules/config.sh b/.airstack/modules/config.sh index cff26995a..1c908756a 100755 --- a/.airstack/modules/config.sh +++ b/.airstack/modules/config.sh @@ -60,9 +60,15 @@ function cmd_config_nucleus { read -r -p "API Token: " API_TOKEN if [ ! -z "${API_TOKEN}" ]; then - sed "s/PASTE-YOUR-API-TOKEN/$API_TOKEN/g" "$OMNI_PASS_SOURCE" > "$OMNI_PASS_DESTINATION" + local escaped_api_token="${API_TOKEN//\\/\\\\}" + escaped_api_token="${escaped_api_token//&/\\&}" + escaped_api_token="${escaped_api_token//\//\\/}" + sed -e "s/^OMNI_USER=.*$/OMNI_USER='\$omni-api-token'/" \ + -e "s/^OMNI_PASS=.*$/OMNI_PASS=$escaped_api_token/" \ + "$OMNI_PASS_SOURCE" > "$OMNI_PASS_DESTINATION" log_info "Nucleus login configuration complete" else + cp "$OMNI_PASS_SOURCE" "$OMNI_PASS_DESTINATION" log_info "Skipping Nucleus login configuration" fi } diff --git a/.airstack/modules/osmo.sh b/.airstack/modules/osmo.sh new file mode 100755 index 000000000..053decbee --- /dev/null +++ b/.airstack/modules/osmo.sh @@ -0,0 +1,719 @@ +#!/usr/bin/env bash + +# osmo.sh — AirStack-on-OSMO convenience commands. +# +# Wraps `osmo workflow submit/port-forward/logs/cancel` for the +# osmo/workflows/airstack-dev.yaml workflow so a Mac/Windows student doesn't +# have to memorize the WebRTC port range or the entry-script path. +# +# This module is pure bash + the cross-platform `osmo` CLI — no Docker +# dependency. Safe to run on a laptop with no AirStack runtime. 
+# +# Most commands need a workflow id. `osmo:up` saves the id to +# $OSMO_STATE_FILE; the other commands read it from there. You can also +# override it for a single invocation by exporting AIRSTACK_OSMO_WF. + +# State directory and file: ~/.airstack/osmo-state stores the most recent +# workflow id submitted with `airstack osmo:up`. +OSMO_STATE_DIR="${HOME}/.airstack" +OSMO_STATE_FILE="${OSMO_STATE_DIR}/osmo-state" + +# WebRTC livestream ports — must match the ports published by the +# isaac-sim-livestream service in +# simulation/isaac-sim/docker/docker-compose.yaml AND the +# app.livestream.fixedHostPort setting pinned in the Pegasus launch script +# (simulation/isaac-sim/launch_scripts/example_one_px4_pegasus_launch_script.py). +# +# Two ports total: +# TCP 49100 — omni.kit.livestream.webrtc WebSocket signaling +# UDP 49099 — SRTP media (pinned; Kit 107 otherwise picks dynamically and +# escapes both the compose-published and CLI-forwarded ranges) +OSMO_WEBRTC_TCP="49100" +OSMO_WEBRTC_UDP="49099" + +# GCS Foxglove websocket: container 8765 → host 8766 (per +# gcs/docker/docker-compose.yaml). +OSMO_FOXGLOVE_PORT="8766:8766" + +# SSH port-forward: local 2200 → pod 22. +OSMO_SSH_PORT="2200:22" + +# Default `osmo workflow port-forward` connect-timeout (24h). +OSMO_PF_TIMEOUT="${OSMO_PF_TIMEOUT:-86400}" + +# Helper: ensure the osmo CLI is on PATH. +function _osmo_check_cli { + if ! command -v osmo >/dev/null 2>&1; then + log_error "osmo CLI not found on PATH. Install from https://github.com/NVIDIA/OSMO and run 'osmo login'." + return 1 + fi +} + +# Helper: strip leading/trailing whitespace + CR/NUL bytes from the +# variable named in $1. +# +# Why this exists: bracket-paste mode and cross-OS clipboards (RDP, VNC, +# Windows-side note apps) routinely smuggle invisible bytes around long +# pastes — Nucleus API tokens (JWT, ~1 KB) and SSH keys are the usual +# victims. 
Nucleus's auth endpoint silently `DENIES` a token that has +# one extra trailing byte, with no actionable error from the client side. +# Stripping defensively at prompt time saves an entire round-trip of +# "regenerate token → still denied → check auth-service log" debugging. +function _osmo_trim { + local var_name="$1" + local val="${!var_name}" + local original_len="${#val}" + val="${val//$'\r'/}" + val="${val//$'\0'/}" + val="${val#"${val%%[![:space:]]*}"}" + val="${val%"${val##*[![:space:]]}"}" + if [ "${#val}" -ne "$original_len" ]; then + log_warn "Stripped $((original_len - ${#val})) whitespace/control byte(s) from ${var_name}." + fi + printf -v "$var_name" '%s' "$val" +} + +# Helper: read a value with prompt; supports -s for silent (passwords). +# +# Visible prompts switch the TTY out of canonical mode for the duration of +# the read. Without this, macOS caps each input line at MAX_CANON = 1024 +# bytes (per ) and rings the terminal bell on Enter when +# the buffer overflows. Nucleus API tokens are JWTs ~950 bytes long, so +# `Nucleus API token: ` lands right at the cap. `stty -icanon` makes +# the kernel deliver bytes to bash as they're typed, with no line-buffer +# limit; bash's `read` still terminates on newline normally. +# +# We use a trap to guarantee the saved stty is restored if the user Ctrl-Cs +# mid-paste — otherwise the shell would be left in raw mode. +# +# After reading we always run _osmo_trim — see comment there. +function _osmo_prompt { + local var_name="$1" + local prompt_text="$2" + local silent="${3:-false}" + local saved_stty="" + + if [ "$silent" = "true" ]; then + # Passwords are short — canonical-mode cap is fine here. 
+ read -r -s -p "${prompt_text}: " "$var_name" + printf "\n" >&2 + else + if [ -t 0 ]; then + saved_stty="$(stty -g 2>/dev/null || true)" + if [ -n "$saved_stty" ]; then + trap 'stty "$saved_stty" 2>/dev/null; trap - INT' INT + stty -icanon 2>/dev/null + fi + fi + read -r -p "${prompt_text}: " "$var_name" + if [ -n "$saved_stty" ]; then + stty "$saved_stty" 2>/dev/null + trap - INT + fi + fi + + _osmo_trim "$var_name" + + if [ -z "${!var_name}" ]; then + log_error "Empty input for ${var_name}; aborting." + return 1 + fi +} + +# osmo:setup — interactively register the three OSMO credentials AirStack +# needs (airlab-docker-registry, airlab-docker-login, airlab-nucleus). +# Idempotent — re-running rotates the credentials. +function cmd_osmo_setup { + _osmo_check_cli || return 1 + + cat >&2 <<'EOF' + +This sets up the three per-user OSMO credentials AirStack-on-OSMO needs: + + 1. airlab-docker-registry (REGISTRY) — for OSMO to pull the workspace image + 2. airlab-docker-login (GENERIC) — for the inner dockerd to pull AirStack images + 3. airlab-nucleus (GENERIC) — for Isaac Sim Nucleus access + +You'll be asked for: + + - your Andrew ID (no @andrew.cmu.edu suffix) + - your AirLab Docker password (same as your Andrew password) + - your Nucleus API token (https://airlab-nucleus.andrew.cmu.edu/omni/web3/ + → right-click cloud → API Tokens). NOT your Andrew password. + +Values go directly to OSMO; nothing is written to disk locally. + +EOF + + local andrew_id andrew_password nucleus_token + _osmo_prompt andrew_id "Andrew ID" false || return 1 + _osmo_prompt andrew_password "AirLab Docker password (hidden)" true || return 1 + _osmo_prompt nucleus_token "Nucleus API token" false || return 1 + + # Sanity-check the Nucleus token shape. Nucleus issues RS256 JWTs: + # base64url(header).base64url(payload).base64url(signature), with the + # header always starting `eyJ` (base64url of `{"`). Catching a wrong + # paste here (e.g. 
Andrew password, or token without the trailing + # signature segment) saves the user from a silent `InternalCredentials + # .auth: DENIED` round-trip later on. We do not validate the signature. + case "$nucleus_token" in + eyJ*.*.*) ;; # looks like a 3-segment JWT + *) + log_error "That doesn't look like a Nucleus API token." + log_error " - Expected: a JWT of the form eyJ…… (~1 KB long)" + log_error " - Got: ${#nucleus_token} chars, prefix '$(printf '%s' "$nucleus_token" | head -c 8)…'" + log_error " Generate one at https://airlab-nucleus.andrew.cmu.edu/omni/web3/" + log_error " → right-click cloud icon → API Tokens → Create." + return 1 + ;; + esac + + local omni_server="${OMNI_SERVER:-omniverse://airlab-nucleus.andrew.cmu.edu/NVIDIA/Assets/Isaac/5.1}" + local airlab_registry="${AIRLAB_REGISTRY:-airlab-docker.andrew.cmu.edu}" + + # `osmo credential set` is NOT an upsert for GENERIC credentials — re-setting + # one that already exists fails with `400 duplicate key value violates unique + # constraint "credential_pkey"`. Delete first so re-running osmo:setup + # (e.g. to rotate a Nucleus token) is idempotent. The `|| true` swallows the + # "credential not found" case on a first-time run. + log_info "Refreshing airlab-docker-registry (REGISTRY)..." + osmo credential delete airlab-docker-registry >/dev/null 2>&1 || true + osmo credential set airlab-docker-registry \ + --type REGISTRY \ + --payload "registry=${airlab_registry}" \ + "username=${andrew_id}" \ + "auth=${andrew_password}" \ + || { log_error "osmo credential set airlab-docker-registry failed"; return 1; } + + log_info "Refreshing airlab-docker-login (GENERIC)..." + osmo credential delete airlab-docker-login >/dev/null 2>&1 || true + osmo credential set airlab-docker-login \ + --type GENERIC \ + --payload "username=${andrew_id}" \ + "password=${andrew_password}" \ + || { log_error "osmo credential set airlab-docker-login failed"; return 1; } + + log_info "Refreshing airlab-nucleus (GENERIC)..." 
+ osmo credential delete airlab-nucleus >/dev/null 2>&1 || true + osmo credential set airlab-nucleus \ + --type GENERIC \ + --payload "omni_user=${andrew_id}" \ + "omni_pass=${nucleus_token}" \ + "omni_server=${omni_server}" \ + || { log_error "osmo credential set airlab-nucleus failed"; return 1; } + + log_info "All three credentials registered. List them with: osmo credential list" + log_info "Next: airstack osmo:up [--pool POOL]" +} + +# Helper: pick the first existing SSH public key on the host. +function _osmo_pick_pubkey { + local candidates=( + "${HOME}/.ssh/id_ed25519.pub" + "${HOME}/.ssh/id_ecdsa.pub" + "${HOME}/.ssh/id_rsa.pub" + ) + for k in "${candidates[@]}"; do + if [ -f "$k" ]; then + echo "$k" + return 0 + fi + done + return 1 +} + +# Helper: get the active workflow id (env override first, then state file). +# +# The state file persists across shell sessions, so it can easily go stale +# (e.g. a previous airstack-dev-N is now FAILED/CANCELED). To avoid the +# confusing "Workflow airstack-dev-10 is not running!" 410 error from the +# downstream osmo command, this helper verifies the saved id is still in a +# live state (PENDING / RUNNING) before returning it. +function _osmo_wf_id { + local wf + if [ -n "${AIRSTACK_OSMO_WF:-}" ]; then + wf="${AIRSTACK_OSMO_WF}" + elif [ -f "${OSMO_STATE_FILE}" ]; then + wf="$(cat "${OSMO_STATE_FILE}")" + else + log_error "No workflow id found. Run 'airstack osmo:up' first, or export AIRSTACK_OSMO_WF=." + return 1 + fi + + # Validate the workflow is still alive (only when osmo CLI is available). + if command -v osmo >/dev/null 2>&1; then + local status + status="$(osmo workflow query "${wf}" 2>/dev/null | awk -F': +' '/^Status/ {print $2; exit}' | tr -d ' \r\n')" + case "${status}" in + PENDING|RUNNING|"") + # "" means we couldn't reach osmo; let the downstream + # command surface the real error rather than failing here. + ;; + *) + log_error "Saved workflow '${wf}' is ${status}, not running." 
+ log_warn "Run 'airstack osmo:up' to launch a fresh one, or:" + log_warn " rm ${OSMO_STATE_FILE}" + log_warn " export AIRSTACK_OSMO_WF=" + return 1 + ;; + esac + fi + + echo "${wf}" + return 0 +} + +# Helper: persist the workflow id. +function _osmo_save_wf_id { + mkdir -p "${OSMO_STATE_DIR}" + echo "$1" > "${OSMO_STATE_FILE}" + log_info "Saved workflow id '$1' to ${OSMO_STATE_FILE}" +} + +# Helper: best-effort detection of the user's current AirStack branch so +# `airstack osmo:up` can default --branch to whatever the user is editing +# locally. Returns the branch name on stdout, or empty if we shouldn't +# auto-pin (detached HEAD, not a git repo, etc.). +# +# Why default to the local branch: the pod's entrypoint clones AirStack +# fresh from GitHub on every workflow start (the pod fs is ephemeral, so +# nothing else makes sense). If we don't tell it which branch, it +# defaults to `main` — and any developer testing branch-only OSMO +# changes (compose services, entrypoint tweaks, workflow yaml edits) +# silently runs against stale `main` code instead of their work. +# Defaulting to the local branch makes "edit on laptop, push, osmo:up" +# the natural workflow. +function _osmo_local_branch { + if ! command -v git >/dev/null 2>&1; then + return 0 + fi + local b + b="$(git -C "${PROJECT_ROOT}" rev-parse --abbrev-ref HEAD 2>/dev/null)" || return 0 + case "$b" in + ""|HEAD) return 0 ;; # detached HEAD or empty + esac + echo "$b" +} + +# Helper: warn if the about-to-submit branch isn't safely pushed. The +# pod clones from GitHub, so unpushed commits / dirty working tree don't +# make it into the pod even if the user thinks they did. Catching this +# before submit avoids a 60-90s "wait for pod, then realize" round trip. 
+function _osmo_check_branch_pushed { + local branch="$1" + command -v git >/dev/null 2>&1 || return 0 + local repo="${PROJECT_ROOT}" + [ -d "${repo}/.git" ] || return 0 + + local local_sha upstream_sha + local_sha="$(git -C "$repo" rev-parse "${branch}" 2>/dev/null)" || return 0 + + # Look for a remote-tracking branch first (the explicit upstream + # set by `git push -u`); fall back to origin/. + upstream_sha="$(git -C "$repo" rev-parse "${branch}@{upstream}" 2>/dev/null)" + if [ -z "$upstream_sha" ]; then + upstream_sha="$(git -C "$repo" rev-parse "origin/${branch}" 2>/dev/null)" + fi + + if [ -z "$upstream_sha" ]; then + log_warn "Branch '${branch}' has no upstream on origin — the pod's clone will fail. Run: git push -u origin ${branch}" + return 0 + fi + + if [ "$local_sha" != "$upstream_sha" ]; then + local ahead behind + ahead="$(git -C "$repo" rev-list --count "${upstream_sha}..${local_sha}" 2>/dev/null)" + behind="$(git -C "$repo" rev-list --count "${local_sha}..${upstream_sha}" 2>/dev/null)" + if [ "${ahead:-0}" -gt 0 ]; then + log_warn "Local '${branch}' is ${ahead} commit(s) ahead of origin/${branch} — the pod will clone the older origin tip. Run: git push" + fi + if [ "${behind:-0}" -gt 0 ]; then + log_info "Local '${branch}' is ${behind} commit(s) behind origin/${branch} (pod will clone the newer origin tip)." + fi + fi + + if [ -n "$(git -C "$repo" status --porcelain 2>/dev/null)" ]; then + log_warn "Working tree has uncommitted changes — the pod will not see them. Commit + push first if you want the pod to pick them up." + fi +} + +# osmo:up — submit airstack-dev.yaml with the local pubkey injected. +# +# Usage: airstack osmo:up [--pool POOL] [--key PATH] [--branch BRANCH] +# +# --branch defaults to the local repo's current branch (or `main` if we +# can't detect one), and is passed through as AIRSTACK_BRANCH so the +# pod's entrypoint clones the matching code. Pass `--branch main` +# explicitly to override. 
+function cmd_osmo_up {
+    # Submit the airstack-dev workflow with the caller's SSH public key (and,
+    # when known, the branch to clone) injected as environment variables.
+    #
+    # Args:    [--pool POOL] [--key PATH] [--branch BRANCH]; anything else is
+    #          passed through to `osmo workflow submit` unchanged.
+    # Returns: 0 on a successful submit (even if the workflow id could not be
+    #          parsed), 1 on bad arguments, missing key/workflow file, or a
+    #          failed submit.
+    _osmo_check_cli || return 1
+
+    local pool="${OSMO_POOL:-}"
+    local pubkey_file=""
+    local branch=""
+    local branch_explicit=false
+    local extra_args=()
+
+    while [ $# -gt 0 ]; do
+        case "$1" in
+            --pool|--key|--branch)
+                # Guard the value: with only the flag left, `shift 2` fails
+                # WITHOUT shifting and this loop would spin forever.
+                if [ $# -lt 2 ]; then
+                    log_error "Option $1 requires a value."
+                    return 1
+                fi
+                case "$1" in
+                    --pool)   pool="$2" ;;
+                    --key)    pubkey_file="$2" ;;
+                    --branch) branch="$2"; branch_explicit=true ;;
+                esac
+                shift 2
+                ;;
+            *) extra_args+=("$1"); shift ;;
+        esac
+    done
+
+    if [ -z "$pubkey_file" ]; then
+        if ! pubkey_file="$(_osmo_pick_pubkey)"; then
+            log_error "No SSH public key found in ~/.ssh. Generate one with: ssh-keygen -t ed25519"
+            return 1
+        fi
+    fi
+    log_info "Using SSH public key: ${pubkey_file}"
+
+    # Read the key up front so a bad --key path fails here instead of
+    # silently submitting a workflow with an empty SSH_PUB_KEY.
+    local pubkey
+    if ! pubkey="$(cat "$pubkey_file" 2>/dev/null)" || [ -z "$pubkey" ]; then
+        log_error "SSH public key is missing, unreadable, or empty: ${pubkey_file}"
+        return 1
+    fi
+
+    local workflow_yaml="${PROJECT_ROOT}/osmo/workflows/airstack-dev.yaml"
+    if [ ! -f "$workflow_yaml" ]; then
+        log_error "Workflow file not found: ${workflow_yaml}"
+        return 1
+    fi
+
+    # Auto-pin --branch to the local checkout if the user didn't pass one.
+    if [ "$branch_explicit" = false ] && [ -z "$branch" ]; then
+        branch="$(_osmo_local_branch)"
+        if [ -n "$branch" ]; then
+            log_info "Auto-detected local branch '${branch}'; pod will clone from origin/${branch} (override with --branch main)."
+        else
+            log_info "Could not detect local branch (detached HEAD?); pod will clone from origin/main."
+        fi
+    fi
+    if [ -n "$branch" ]; then
+        _osmo_check_branch_pushed "$branch"
+    fi
+
+    local cmd=(osmo workflow submit "$workflow_yaml")
+    if [ -n "$pool" ]; then
+        cmd+=(--pool "$pool")
+    else
+        log_warn "No --pool provided and OSMO_POOL is unset; using your osmo profile's default pool."
+    fi
+    # IMPORTANT: `osmo workflow submit --set-env` is variadic. Passing two
+    # separate `--set-env A=1 --set-env B=2` silently drops the first one
+    # (only the last `--set-env` flag's values are kept). We collect all
+    # K=V pairs and pass them under a single `--set-env`.
+    local env_kvs=("SSH_PUB_KEY=${pubkey}")
+    if [ -n "$branch" ]; then
+        env_kvs+=("AIRSTACK_BRANCH=${branch}")
+    fi
+    cmd+=(--set-env "${env_kvs[@]}")
+    if [ ${#extra_args[@]} -gt 0 ]; then
+        cmd+=("${extra_args[@]}")
+    fi
+
+    log_info "Submitting: ${cmd[*]}"
+    local output
+    if ! output="$("${cmd[@]}" 2>&1)"; then
+        echo "$output" >&2
+        log_error "osmo workflow submit failed."
+        return 1
+    fi
+    echo "$output"
+
+    # Parse the workflow id out of the submit output. The cookbook examples
+    # show "Workflow ID - WORKFLOW_ID" formatted output (see OSMO
+    # submission.rst). Match that line.
+    local wf_id
+    wf_id="$(echo "$output" | awk -F'- ' '/^Workflow ID/ {print $2; exit}' | tr -d ' \r\n')"
+    if [ -z "$wf_id" ]; then
+        # The submit itself succeeded, so this is not an error — the user
+        # just has to record the id by hand.
+        log_warn "Could not parse workflow id from submit output. Set it manually:"
+        log_warn " echo WORKFLOW_ID > ${OSMO_STATE_FILE}"
+        return 0
+    fi
+    _osmo_save_wf_id "$wf_id"
+
+    log_info "Next steps:"
+    log_info " airstack osmo:logs # follow startup until 'sshd listening'"
+    log_info " airstack osmo:ide # port-forward sshd + open VS Code"
+    log_info " airstack osmo:webrtc # forward Isaac Sim WebRTC ports"
+    log_info " airstack osmo:foxglove # forward GCS Foxglove websocket"
+    log_info " airstack osmo:down # cancel the workflow"
+}
+
+# osmo:logs — follow the workspace task logs.
+#
+# Despite the `osmo workflow logs --help` output advertising only `-n
+# LAST_N_LINES` (no `--follow`), the CLI in fact streams the tail and keeps
+# the connection open as new lines arrive — i.e. it already behaves like
+# `tail -f`. We just exec it in the foreground so the user sees output
+# immediately and can Ctrl+C to stop. (An earlier implementation wrapped
+# this in `out=$(osmo workflow logs ...)`; command substitution waits for
+# the process to exit, which never happened, so nothing was ever printed.)
+function cmd_osmo_logs { + _osmo_check_cli || return 1 + local wf; wf="$(_osmo_wf_id)" || return 1 + + local task="${OSMO_LOGS_TASK:-workspace}" + local lines="${OSMO_LOGS_TAIL:-500}" + + log_info "Following ${task} logs for ${wf} (last ${lines} lines, then live; Ctrl+C to stop)" + + # Filter stderr for the same OSMOUserError-when-workflow-dies case + # the port-forward path hits — same noisy asyncio Traceback + + # "Task exception was never retrieved" header. _osmo_pf_filter + # collapses it into one clean log line. + osmo workflow logs "${wf}" -t "${task}" -n "${lines}" \ + 2> >(_osmo_pf_filter "${wf}") +} + +# osmo:ide — port-forward sshd + (optionally) launch VS Code/Cursor on the +# `airstack-osmo` host. Runs the port-forward in the foreground so closing +# the terminal closes the tunnel. +# +# Usage: airstack osmo:ide [--no-open] [code|cursor] +function cmd_osmo_ide { + _osmo_check_cli || return 1 + local wf; wf="$(_osmo_wf_id)" || return 1 + + local open_ide=true + local ide_cmd="" + while [ $# -gt 0 ]; do + case "$1" in + --no-open) open_ide=false; shift ;; + code|cursor) ide_cmd="$1"; shift ;; + *) log_warn "Ignoring unknown osmo:ide arg: $1"; shift ;; + esac + done + + if [ -z "$ide_cmd" ]; then + if command -v cursor >/dev/null 2>&1; then + ide_cmd="cursor" + elif command -v code >/dev/null 2>&1; then + ide_cmd="code" + else + log_warn "Neither 'cursor' nor 'code' found on PATH; will only port-forward (open the IDE manually and Connect to Host airstack-osmo)." + open_ide=false + fi + fi + + log_info "Make sure ~/.ssh/config has a 'Host airstack-osmo' entry pointing at localhost:2200, User root." + + # Local TCP port the user's IDE will connect to (the local side of the + # `--port LOCAL:REMOTE` mapping). + local local_port="${OSMO_SSH_PORT%%:*}" + + # Every fresh OSMO pod ships a new sshd host key. 
If the user's + # ~/.ssh/known_hosts still has an entry for [localhost]:${local_port} + # from a previous workflow, ssh aborts with "Host key for [localhost] + # :${local_port} has changed and you have requested strict checking", + # which the IDE surfaces as a generic "could not connect" error. + # + # The recommended ~/.ssh/config block for `airstack-osmo` uses + # `UserKnownHostsFile /dev/null`, which sidesteps this entirely — but + # users who set up before that change still have a stale entry on + # disk. Scrub it defensively on every osmo:ide invocation. ssh-keygen + # -R is idempotent: a no-op if the entry doesn't exist. + if command -v ssh-keygen >/dev/null 2>&1; then + ssh-keygen -R "[localhost]:${local_port}" >/dev/null 2>&1 || true + fi + + # Reuse an existing forward if one is already listening (the user might + # have run this from a second terminal, or osmo:foxglove already opened + # a multi-port forward). Otherwise spawn one in the background and wait + # for it to bind before launching the IDE — this avoids the race where + # Cursor/VS Code tries to SSH before the tunnel exists and dies with + # "connect to host localhost port 2200: Connection refused". + local pf_pid="" + if nc -z localhost "$local_port" 2>/dev/null; then + log_info "Port ${local_port} is already listening; reusing existing port-forward." + else + log_info "osmo workflow port-forward ${wf} workspace --port ${OSMO_SSH_PORT} --connect-timeout ${OSMO_PF_TIMEOUT}" + osmo workflow port-forward "$wf" workspace --port "$OSMO_SSH_PORT" --connect-timeout "$OSMO_PF_TIMEOUT" \ + > "${OSMO_STATE_DIR}/ssh-pf.log" 2>&1 & + pf_pid=$! + # Wait up to 30s for the tunnel to start accepting connections. + local waited=0 + until nc -z localhost "$local_port" 2>/dev/null; do + sleep 1; waited=$((waited+1)) + if [ "$waited" -ge 30 ]; then + log_error "Timed out waiting for port-forward on :${local_port} after ${waited}s." 
+ log_error " port-forward log: ${OSMO_STATE_DIR}/ssh-pf.log" + kill "$pf_pid" 2>/dev/null + return 1 + fi + if ! kill -0 "$pf_pid" 2>/dev/null; then + log_error "port-forward exited early. Tail:" + tail -10 "${OSMO_STATE_DIR}/ssh-pf.log" >&2 + return 1 + fi + done + log_info "Port-forward established on localhost:${local_port} (pid ${pf_pid})." + fi + + if [ "$open_ide" = true ]; then + # vscode-remote URI launches the IDE pre-attached to the remote host. + local uri="vscode-remote://ssh-remote+airstack-osmo/root/AirStack" + log_info "Launching ${ide_cmd} → ${uri}" + ( "$ide_cmd" --folder-uri "$uri" >/dev/null 2>&1 || \ + "$ide_cmd" "$uri" >/dev/null 2>&1 || \ + log_warn "Could not launch ${ide_cmd} automatically; open it and pick airstack-osmo from Remote-SSH manually." ) & + fi + + if [ -n "$pf_pid" ]; then + log_info "Leave this terminal running for the length of your session (Ctrl+C to disconnect)." + # Forward Ctrl+C to the port-forward and clean up. + trap 'kill "$pf_pid" 2>/dev/null; exit 0' INT TERM + wait "$pf_pid" + else + log_info "Existing port-forward owns the tunnel; this command will exit immediately." + log_info "Stop the tunnel with: pkill -f 'osmo workflow port-forward' or airstack osmo:down" + fi +} + +# Helper: filter `osmo workflow port-forward` stderr through awk to +# suppress the asyncio traceback that erupts whenever the workflow gets +# canceled mid-flight (e.g. via osmo:down in another shell, or because +# OSMO timed it out). The CLI raises OSMOUserError("Workflow X is not +# running!") from inside an asyncio Task, which then prints "Task +# exception was never retrieved" + a multi-line Traceback that obscures +# the actual one-line cause. We translate that into a single clean log +# line and drop everything else. 
+function _osmo_pf_filter {
+    # Reads the osmo CLI's stderr on stdin and writes the cleaned-up stream
+    # to this shell's stderr.
+    #
+    # Args: $1  workflow id, used in the rewritten "no longer running" line.
+    #
+    # `skipping` is set by any traceback header and cleared by the next
+    # blank line; while it is set, lines that are not an OSMOUserError are
+    # dropped.
+    local wf="$1"
+    awk -v WF="$wf" '
+        /^Task exception was never retrieved/ { skipping=1; next }
+        /^future:/ { skipping=1; next }
+        /^Traceback \(most recent call last\):/ { skipping=1; next }
+        # Python indents traceback frame lines; accept any leading
+        # whitespace so frames are dropped even outside a skipping run.
+        /^[[:space:]]+File "/ { next }
+        /^src\.lib\.utils\.osmo_errors\.OSMOUserError/ {
+            sub(/^src\.lib\.utils\.osmo_errors\.OSMOUserError: */, "")
+            printf "\033[0;31m[ERROR]\033[0m %s (run `airstack osmo:up` to start a new workflow)\n", $0
+            next
+        }
+        /OSMOUserError: Workflow .* is not running!/ {
+            printf "\033[0;31m[ERROR]\033[0m Workflow %s is no longer running (run `airstack osmo:up` to start a new one).\n", WF
+            next
+        }
+        skipping && /^$/ { skipping=0; next }
+        skipping { next }
+        { print }
+    ' >&2
+}
+
+# Helper: run `osmo workflow port-forward` with the noise filter
+# attached. Returns the underlying exit code so callers can decide
+# whether to retry / fail. Args after the helper name are passed to
+# `osmo workflow port-forward` verbatim; the first one must be the
+# workflow id, because it is also handed to the filter.
+function _osmo_run_port_forward {
+    osmo workflow port-forward "$@" 2> >(_osmo_pf_filter "$1")
+}
+
+# osmo:webrtc — forward both Isaac Sim WebRTC port ranges (TCP in this
+# terminal, spawn UDP in the background). Cleans up the UDP child on
+# exit (Ctrl+C, foreground TCP failure, or the workflow disappearing
+# mid-stream) so we don't leak a port-forward into the user's process
+# table.
+function cmd_osmo_webrtc {
+    _osmo_check_cli || return 1
+    local wf; wf="$(_osmo_wf_id)" || return 1
+
+    # The state dir only exists once a workflow id has been saved; create it
+    # so the UDP log redirect cannot fail when the id came from
+    # AIRSTACK_OSMO_WF (a failed redirect means the UDP forward never runs).
+    mkdir -p "${OSMO_STATE_DIR}"
+
+    log_info "Spawning UDP port-forward in background: ${OSMO_WEBRTC_UDP}"
+    nohup osmo workflow port-forward "$wf" workspace \
+        --port "$OSMO_WEBRTC_UDP" --udp \
+        --connect-timeout "$OSMO_PF_TIMEOUT" \
+        > "${OSMO_STATE_DIR}/webrtc-udp.log" 2>&1 &
+    local udp_pid=$!
+    log_info " UDP log: ${OSMO_STATE_DIR}/webrtc-udp.log (pid ${udp_pid})"
+
+    # Tear the UDP fork down when this command ends, by any path.
+    # Without this, hitting Ctrl+C on the TCP foreground (or the
+    # workflow being canceled, which surfaces as the foreground exiting
+    # non-zero) leaves the UDP `osmo workflow port-forward` running
+    # against a dead workflow until the user notices and pkill's it.
+    # The pid is expanded into the trap text now, because the local
+    # variable is out of scope by the time an EXIT trap runs.
+    trap '
+        if kill -0 "'"${udp_pid}"'" 2>/dev/null; then
+            kill "'"${udp_pid}"'" 2>/dev/null
+            wait "'"${udp_pid}"'" 2>/dev/null
+        fi
+        trap - EXIT INT TERM
+    ' EXIT INT TERM
+
+    log_info "Foreground TCP port-forward: ${OSMO_WEBRTC_TCP}"
+    log_info "Open the Omniverse Streaming Client / WebRTC client at http://localhost"
+    _osmo_run_port_forward "$wf" workspace \
+        --port "$OSMO_WEBRTC_TCP" \
+        --connect-timeout "$OSMO_PF_TIMEOUT"
+}
+
+# osmo:foxglove — install the AirStack Foxglove extensions into the local
+# Foxglove Desktop user-extensions dir, then forward the GCS Foxglove
+# websocket.
+#
+# The extension install is the same script the GCS container runs on
+# startup — gcs/foxglove_extensions/install.py — invoked with env-var
+# overrides that point at the local laptop dirs. Default destination on
+# Linux/macOS is ~/.foxglove-studio/extensions (Foxglove's canonical user
+# extensions path; the macOS rebrand still reads from here). Override
+# with OSMO_FOXGLOVE_EXT_DIR, or skip the install entirely with
+# OSMO_FOXGLOVE_SKIP_EXTENSIONS=1 (e.g. when using app.foxglove.dev
+# which doesn't load local extensions anyway).
+function cmd_osmo_foxglove {
+    # Best-effort extension install, then a foreground port-forward of the
+    # Foxglove websocket. Returns 1 if the CLI / workflow id is unavailable,
+    # otherwise the exit status of the port-forward.
+    _osmo_check_cli || return 1
+    local wf; wf="$(_osmo_wf_id)" || return 1
+
+    local ext_src="${PROJECT_ROOT}/gcs/foxglove_extensions"
+    local ext_dst="${OSMO_FOXGLOVE_EXT_DIR:-${HOME}/.foxglove-studio/extensions}"
+
+    if [ "${OSMO_FOXGLOVE_SKIP_EXTENSIONS:-0}" != "1" ] && [ -d "${ext_src}" ]; then
+        if command -v python3 >/dev/null 2>&1; then
+            log_info "Installing Foxglove extensions to ${ext_dst}"
+            # A failed install only costs custom panels, so warn and carry
+            # on to the port-forward instead of aborting.
+            FOXGLOVE_EXT_SRC="${ext_src}" FOXGLOVE_EXT_DST="${ext_dst}" \
+                python3 "${ext_src}/install.py" \
+                || log_warn "Foxglove extension install failed; panels like 'Robot Tasks' may show as 'Unknown panel type' in Foxglove"
+        else
+            log_warn "python3 not found on PATH — skipping Foxglove extension install."
+            log_warn " Custom panels (Robot Tasks, Waypoint Editor, Polygon Editor) will show as 'Unknown panel type'."
+            log_warn " Install python3 (e.g. 'brew install python') or copy ${ext_src}/* manually to ${ext_dst}."
+        fi
+    elif [ "${OSMO_FOXGLOVE_SKIP_EXTENSIONS:-0}" = "1" ]; then
+        log_info "Skipping Foxglove extension install (OSMO_FOXGLOVE_SKIP_EXTENSIONS=1)."
+    fi
+
+    # Local side of the `--port LOCAL:REMOTE` mapping, so the connection hint
+    # stays correct when OSMO_FOXGLOVE_PORT is overridden.
+    local local_port="${OSMO_FOXGLOVE_PORT%%:*}"
+
+    log_info "osmo workflow port-forward ${wf} workspace --port ${OSMO_FOXGLOVE_PORT} --connect-timeout ${OSMO_PF_TIMEOUT}"
+    log_info "Then in Foxglove Desktop: Open connection → ws://localhost:${local_port}"
+    log_info " Layouts → Import from file → ${ext_src}/airstack_default.json"
+    log_info " (Restart Foxglove Desktop once if newly-installed panels still show as 'Unknown panel type'.)"
+    _osmo_run_port_forward "$wf" workspace \
+        --port "$OSMO_FOXGLOVE_PORT" \
+        --connect-timeout "$OSMO_PF_TIMEOUT"
+}
+
+# osmo:down — cancel the active workflow. Reminds you to push first.
+#
+# Returns 1 (and keeps the saved workflow id) if the cancel request fails,
+# so a still-running workflow is not orphaned from the CLI's point of view.
+function cmd_osmo_down {
+    _osmo_check_cli || return 1
+    local wf; wf="$(_osmo_wf_id)" || return 1
+
+    log_warn "About to cancel workflow '${wf}'."
+    log_warn "Anything not pushed to git in /root/AirStack inside the pod will be LOST."
+    log_warn "Hit Ctrl-C in the next 5 seconds to abort."
+    sleep 5
+    if ! osmo workflow cancel "$wf"; then
+        log_error "osmo workflow cancel failed for '${wf}'; keeping ${OSMO_STATE_FILE}."
+        return 1
+    fi
+    rm -f "${OSMO_STATE_FILE}"
+}
+
+# Register commands from this module: maps each `osmo:*` subcommand name to
+# its handler function and to its one-line help text.
+function register_osmo_commands {
+    COMMANDS["osmo:setup"]="cmd_osmo_setup"
+    COMMANDS["osmo:up"]="cmd_osmo_up"
+    COMMANDS["osmo:logs"]="cmd_osmo_logs"
+    COMMANDS["osmo:ide"]="cmd_osmo_ide"
+    COMMANDS["osmo:webrtc"]="cmd_osmo_webrtc"
+    COMMANDS["osmo:foxglove"]="cmd_osmo_foxglove"
+    COMMANDS["osmo:down"]="cmd_osmo_down"
+
+    COMMAND_HELP["osmo:setup"]="One-time per-user OSMO credential setup (airlab-docker-registry, airlab-docker-login, airlab-nucleus)"
+    COMMAND_HELP["osmo:up"]="Submit osmo/workflows/airstack-dev.yaml with your SSH pubkey injected (--pool POOL, --key PATH, --branch BRANCH)"
+    COMMAND_HELP["osmo:logs"]="Follow the workspace task logs (osmo workflow logs -t workspace -n 500; OSMO_LOGS_TASK / OSMO_LOGS_TAIL override)"
+    COMMAND_HELP["osmo:ide"]="Port-forward sshd (2200:22) and open VS Code/Cursor on Host airstack-osmo"
+    COMMAND_HELP["osmo:webrtc"]="Port-forward Isaac Sim WebRTC ranges (TCP foreground + UDP background)"
+    COMMAND_HELP["osmo:foxglove"]="Install AirStack Foxglove extensions locally, then port-forward GCS Foxglove websocket (8766:8766). Override target dir with OSMO_FOXGLOVE_EXT_DIR; skip install with OSMO_FOXGLOVE_SKIP_EXTENSIONS=1."
+    COMMAND_HELP["osmo:down"]="Cancel the active workflow (push to git before running this)"
+}
diff --git a/.env b/.env index 1edaa22f7..770ca77e6 100644 --- a/.env +++ b/.env @@ -11,7 +11,8 @@ PROJECT_NAME="airstack" # If you've run ./airstack.sh setup, then this will auto-generate from the git commit hash every time a change is made # to a Dockerfile or docker-compose.yaml file. Otherwise this can also be set explicitly to make a release version. -VERSION="0.18.0-alpha.6" +# auto-generated from git commit hash +VERSION="0.19.0-alpha.4" # Choose "dev" or "prebuilt". "dev" is for mounted code that must be built live.
"prebuilt" is for built ros_ws baked into the image DOCKER_IMAGE_BUILD_MODE="dev" # Where to push and pull images from. Can replace with your docker hub username if using docker hub. @@ -34,6 +35,7 @@ ISAAC_SIM_USE_STANDALONE="true" # "true" or "false" # Script name (must be in /AirStack/simulation/isaac-sim/launch_scripts/) ISAAC_SIM_SCRIPT_NAME="example_one_px4_pegasus_launch_script.py" PLAY_SIM_ON_START="false" + # =============================================== # ================= MS-AIRSIM ===================== @@ -47,6 +49,8 @@ ROBOT_NAME_MAP_CONFIG_FILE="default_robot_name_map.yaml" # Determines how to se URDF_FILE="robot_descriptions/iris/urdf/iris_with_sensors.pegasus.robot.urdf" +DEBUG_RVIZ="false" # "true" or "false". If true, launches RViz alongside the robot via desktop_bringup/robot.launch.xml. + # offboard API streaming out. this is so that ports don't conflict for multi-agent FCU communication. OFFBOARD_BASE_PORT=14540 -ONBOARD_BASE_PORT=14580 +ONBOARD_BASE_PORT=14580 \ No newline at end of file diff --git a/.github/orchestrator/README.md b/.github/orchestrator/README.md new file mode 100644 index 000000000..c10da3383 --- /dev/null +++ b/.github/orchestrator/README.md @@ -0,0 +1,288 @@ +# AirStack CI Orchestrator + +This describes how to use a self-hosted OpenStack VM to run GitHub Actions jobs on truly ephemeral workers. The orchestrator is a Python service that continuously polls GitHub for queued workflow jobs, spawns a fresh OpenStack instance for each one with a single-use JIT runner token, and reaps (deletes) the instance when the job completes. This allows us to run CI workloads on GPU-equipped VMs without sharing any state between runs or exposing long-lived credentials on the worker. + +The orchestrator VM is the only host that holds the GitHub PAT and the OpenStack credential; the workers are destroyed after a single job. 
+ +## Architecture + +``` +┌─────────────────────────────────────────────────────────────┐ +│ Orchestrator VM (airstack-ci-cd-orchestrator) │ +│ │ +│ airstack-orchestrator.service → orchestrator.py │ +│ spawn loop (every 15s): │ +│ • GET /repos//actions/runs?status=queued │ +│ • POST /repos//actions/runners/generate-jitconfig│ +│ • openstack server create (image, flavor, user_data) │ +│ • record (job_id → server_id) in state.json │ +│ reap loop (every 30s): │ +│ • job completed → openstack server delete │ +│ • job age > N min → force delete (straggler) │ +│ • owned but not in state → orphan reap │ +│ │ +│ /etc/airstack-orchestrator/ │ +│ config.yaml │ +│ github-pat │ +│ /home/orchestrator/.config/openstack/clouds.yaml │ +│ /var/lib/airstack-orchestrator/state.json │ +└─────────┬─────────────────────────────────┬─────────────────┘ + │ Nova / Neutron API │ GitHub REST API + ▼ ▼ +┌──────────────────────────────────┐ ┌──────────────────────┐ +│ Ephemeral worker (per job) │ │ GitHub Actions │ +│ Image: Ubuntu-24.04-GPU-Headless│ │ workflow_job queue │ +│ cloud-init: │ └──────────────────────┘ +│ install docker + nv toolkit │ +│ download GH runner │ +│ run.sh --jitconfig │ +│ shutdown -h +1 │ +└──────────────────────────────────┘ +``` + +Key properties: + +- **Truly ephemeral**: every job runs on a clean VM. No Docker layer cache pollution, no leftover networks, no carry-over from prior runs. +- **PAT isolation**: the GitHub PAT lives only on the orchestrator. Workers receive a single-use [JIT runner config](https://docs.github.com/en/rest/actions/self-hosted-runners?apiVersion=2022-11-28#create-configuration-for-a-just-in-time-runner-for-a-repository) — a base64 token bound to one runner registration, valid only for a short window. +- **Application-credential auth**: the orchestrator authenticates to OpenStack with an application credential (revocable, scoped, no password), not the user's `openrc.sh`. 
+- **Crash-safe reaping**: every server we spawn is tagged with `airstack-role=ephemeral-runner`. The reap loop force-deletes any owned server not present in `state.json`, so a crashed orchestrator can't leak instances. + +## Prerequisites +- An OpenStack instance already set up for the orchestrator VM. The orchestrator itself is lightweight and doesn't need a GPU. 1 vCPU, 2GB RAM, and 20GB disk are sufficient for the orchestrator service. Make sure you can ssh into it and that it has outbound internet access. +- An OpenStack flavor with GPU passthrough and enough disk to run Docker + the tests. The orchestrator spawns workers from this flavor, so it must have a GPU and sufficient disk (or `boot_volume_size_gb` must be set) to run the workloads. It's common for GPU flavors to have `disk=0`, which means the flavor itself defines no root disk size — in that case, you must set `boot_volume_size_gb` to a value large enough for the OS + Docker images + test assets (e.g., 40GB), so the worker boots from a Cinder volume of that size. If your OpenStack setup supports it, you can also boot from a Cinder volume sourced from an image; in that case, pre-bake Docker and the NVIDIA toolkit into the image to speed up boot time. + +## One-time setup + +### 1. Create OpenStack application credential + +On your local workstation (not the orchestrator VM): + +```bash +source ~/.airlabcloud/openrc.sh +openstack application credential create airstack-orchestrator \ + --description "AirStack CI orchestrator — spawns ephemeral test runners" +``` + +The output prints `id` and `secret`. Build a `clouds.yaml`: + +```yaml +clouds: + airstack: + auth_type: v3applicationcredential + auth: + auth_url: https://airlab-cloud.andrew.cmu.edu:5000/v3/ + application_credential_id: YOUR_CREDENTIAL_ID + application_credential_secret: YOUR_CREDENTIAL_SECRET + region_name: Airlab + interface: public + identity_api_version: 3 +``` + +### 2. 
Stage credentials on the orchestrator VM + +```bash +# clouds.yaml: install for the orchestrator user (created in step 3) +scp clouds.yaml ubuntu@:/tmp/clouds.yaml + +# GitHub PAT: needs `Actions: read/write` and `Administration: read/write` +# (fine-grained) or classic `repo` scope. +scp ~/.airlabcloud/airstack-github-pat.txt \ + ubuntu@:/tmp/github-pat +``` + +### 3. Run setup.sh + +On the orchestrator VM: + +```bash +git clone https://github.com/castacks/AirStack.git /tmp/airstack +sudo bash /tmp/airstack/.github/orchestrator/setup.sh +``` + +`setup.sh` creates the `orchestrator` system user, builds the Python venv, copies `orchestrator.py` and `cloud-init.yaml.j2` into `/opt/airstack-orchestrator/`, scaffolds `/etc/airstack-orchestrator/`, installs the systemd unit, and consumes `/tmp/github-pat`. + +You still need to put the `clouds.yaml` in place under the orchestrator user's home: + +```bash +sudo install -d -o orchestrator -g orchestrator -m 0700 \ + /home/orchestrator/.config/openstack +sudo install -o orchestrator -g orchestrator -m 0600 \ + /tmp/clouds.yaml /home/orchestrator/.config/openstack/clouds.yaml +sudo shred -u /tmp/clouds.yaml +``` + +### 4. 
Fill in `/etc/airstack-orchestrator/config.yaml` + +Edit the placeholders the example ships with: + +| Field | What goes here | How to find it | +|------|---------------|----------------| +| `flavor_name` | OpenStack flavor with GPU + enough disk | `openstack flavor list` | +| `network_name` | Network the workers attach to | `openstack network list` | +| `keypair_name` | SSH keypair for break-glass access | `openstack keypair list` | +| `security_group` | Outbound 443 must be allowed | `openstack security group list` | +| `availability_zone` | Optional AZ for the spawned instance; leave empty to let Nova pick | `openstack availability zone list` | +| `boot_volume_size_gb` | Set >0 if your flavor has `disk=0` (common for GPU flavors) — boots from a Cinder volume of this size sourced from `image_id`; leave 0 for direct image-boot | `openstack flavor show ` (check disk field) | +| `floating_ips` | Pre-allocated FIP pool, rotated through sequentially — each spawn picks the first free one. `max_concurrent` is capped at `len(pool)`. Leave empty to skip FIP attachment | `openstack floating ip list` | +| `repo` | `owner/name` of the repo to poll | from GitHub URL | +| `runner_version` | Version tag from [actions/runner releases](https://github.com/actions/runner/releases) | check before each major upgrade | + +### 5. Start the service + +```bash +sudo systemctl enable --now airstack-orchestrator.service +journalctl -u airstack-orchestrator.service -f +``` + +You should see `orchestrator started: repo=... labels=... max_concurrent=N` and then periodic poll activity. + +## End-to-end verification + +```bash +# Trigger a fast build-only run. 
+gh workflow run system-tests.yml -f marks=build_docker + +# Within ~30s, a server should appear: +openstack server list --metadata airstack-role=ephemeral-runner +# or if your OpenStack setup doesn't support metadata queries: +openstack server list --name '^ephemeral-' + +# Watch GitHub → Actions → Runners — the ephemeral runner should appear, +# pick up the job, then disappear. + +# Within ~30s of job completion, the server should be gone: +openstack server list --metadata airstack-role=ephemeral-runner +openstack server list --name '^ephemeral-' +``` + +## Operational notes + +- **State file**: `/var/lib/airstack-orchestrator/state.json` is the in-flight job tracker. Wiping it triggers an orphan sweep on the next reap iteration — owned servers will be force-deleted. Don't wipe it while jobs are mid-flight unless that's what you want. +- **Stuck instance**: any server older than `max_job_minutes` (default 90) is force-deleted regardless of GitHub job status. Bump this if liveliness/autonomy runs grow longer than ~75 minutes. +- **PAT rotation**: `sudo install -o root -g orchestrator -m 0640 /tmp/new-pat /etc/airstack-orchestrator/github-pat && sudo systemctl restart airstack-orchestrator.service`. +- **Pause spawning** (e.g. for maintenance): `sudo systemctl stop airstack-orchestrator.service`. Already-spawned workers will still complete their jobs and self-shutdown; on restart, the reap loop deletes them. +- **Logs**: `journalctl -u airstack-orchestrator.service -f`. Cloud-init logs from individual workers are visible only via `openstack console log show ` while the worker is running. + +## Debugging a failed job + +When a GitHub workflow run fails or stalls, the failure can be in any of four places: the orchestrator (didn't spawn), cloud-init (didn't bootstrap), the GH Actions runner (didn't register or crashed), or the workflow steps themselves. Each has a different inspection path. + +### 1. 
Find which worker ran the job + +`state.json` is the authoritative job ↔ server ↔ floating-IP map: + +```bash +sudo jq -r '.jobs | to_entries[] | "\(.key)\t\(.value.server_id)\t\(.value.floating_ip)\t\(.value.runner_name)"' \ + /var/lib/airstack-orchestrator/state.json +``` + +Pick the row for your failing `job_id` (visible in the GitHub Actions URL). Save the values: + +```bash +JOB_ID=73286176852 # from the GitHub UI +SERVER=$(sudo jq -r ".jobs[\"$JOB_ID\"].server_id" /var/lib/airstack-orchestrator/state.json) +FIP=$( sudo jq -r ".jobs[\"$JOB_ID\"].floating_ip" /var/lib/airstack-orchestrator/state.json) +``` + +If the job isn't in `state.json`, the orchestrator never spawned for it — see step 2 below. + +### 2. Did the orchestrator spawn at all? + +```bash +sudo journalctl -u airstack-orchestrator.service --since "30 min ago" --no-pager +``` + +What you want to see for a healthy spawn: + +```text +spawned server for job () +attached floating IP to server (job ) +``` + +Common things that block a spawn (and how to spot them): + +| Log line / symptom | What it means | Fix | +|---|---|---| +| `find_queued_jobs failed: 401 ...` | PAT expired / wrong scope | Rotate the PAT (see Operational notes) | +| `spawn failed for job ...: Block Device Mapping is Invalid` | Flavor has `disk=0` and `boot_volume_size_gb` is 0 | Set `boot_volume_size_gb > 0` | +| `no free floating IP in pool` | All FIPs in `floating_ips` are already in use | Wait for an in-flight job to complete, or expand the pool | +| `floating_ips configured but not found` | Pool addresses don't exist in the project | Double-check `openstack floating ip list` | +| Job is queued in GitHub but no `spawned` log | Runner labels in the workflow's `runs-on` don't match `runner_labels` in config | Make them match | + +### 3. SSH into a running worker + +If the worker is `ACTIVE`, the floating IP is attached and you can connect directly. 
The keypair was injected during spawn — use the matching private key: + +```bash +ssh -i <keypair>.pem ubuntu@"$FIP" +``` + +If your workstation can't reach the FIP subnet, jump through the orchestrator (which is on the same network): + +```bash +ssh -J ubuntu@<orchestrator-ip> -i <keypair>.pem ubuntu@"$FIP" +``` + +### 4. SSH into a SHUTOFF worker + +Workers shut themselves down after `run.sh` exits (whether the job succeeded, failed, or the runner crashed). The orchestrator's reap loop deletes a server once GitHub reports the job `completed` (or the job has been purged, or the server outlives `max_job_minutes`), so a SHUTOFF worker is preserved only while GitHub still lists the job as queued or in progress — typically after a runner crash or a stall. + +```bash +# Optional but safer — keep the orchestrator from reaping mid-session. +sudo systemctl stop airstack-orchestrator.service + +openstack server start "$SERVER" +# Wait ~30s, then SSH using the FIP from state.json. +ssh -i <keypair>.pem ubuntu@"$FIP" +``` + +When done, delete the worker manually and resume the orchestrator: + +```bash +openstack server delete "$SERVER" +sudo jq "del(.jobs[\"$JOB_ID\"])" /var/lib/airstack-orchestrator/state.json \ + | sudo tee /var/lib/airstack-orchestrator/state.json.new >/dev/null +sudo mv /var/lib/airstack-orchestrator/state.json.new /var/lib/airstack-orchestrator/state.json +sudo systemctl start airstack-orchestrator.service +``` + +### 5. What to read once you're on the worker + +```bash +# Combined boot + cloud-init output. Most useful single file: shows every +# line our airstack-runner-bootstrap.sh printed, including run.sh's exit. +sudo less /var/log/cloud-init-output.log +sudo tail -300 /var/log/cloud-init-output.log + +# Cloud-init's structured log — quick way to surface errors. +sudo grep -E 'WARN|ERROR|FAIL' /var/log/cloud-init.log + +# GitHub Actions runner diagnostics. The Worker_*.log corresponds to the +# actual job execution; Runner_*.log covers registration and dispatch. 
+ls -lt /home/ubuntu/actions-runner/_diag/ +sudo tail -300 /home/ubuntu/actions-runner/_diag/Runner_*.log +sudo tail -300 /home/ubuntu/actions-runner/_diag/Worker_*.log + +# Sanity-check Docker came up cleanly — a frequent failure point. +sudo systemctl status docker +docker info 2>&1 | head +``` + +### 6. Console log fallback + +Some flavors on this cloud don't expose the serial console (`openstack console log show` returns *Guest does not have a console available*). For those, the SSH path above is the only option. Where it does work, the console log persists across SHUTOFF and is faster than restarting the VM: + +```bash +openstack console log show "$SERVER" | tail -200 +``` + +### 7. Common failure patterns at the worker + +| Symptom in `cloud-init-output.log` (near end) | Cause | Fix | +|---|---|---| +| `Could not connect to api.github.com` / DNS errors | Security group blocking egress, or no NAT for the network | Allow outbound 443; if behind NAT, ensure FIP networking covers egress | +| `Bad credentials` / `Invalid configuration ... 
runnerEvent` | JIT config TTL elapsed before `run.sh` started — bootstrap took too long | Pre-bake Docker + nvidia-container-toolkit into the image to shrink bootstrap | +| `nvidia-ctk: command not found` or NVIDIA driver mismatch | Image's driver doesn't match the toolkit version | Use a different image, or pin a compatible toolkit version | +| `apt-get update` fails | Image's apt sources are unreachable from this network | Check network/security-group; or pre-bake packages into the image | +| Runner registered, then `pytest` failed | A normal test failure | Read the GitHub Actions log — that's the canonical view of the workflow output | +| `No space left on device` | `boot_volume_size_gb` too small for Docker images + sim assets | Bump `boot_volume_size_gb` | diff --git a/.github/orchestrator/airstack-orchestrator.service b/.github/orchestrator/airstack-orchestrator.service new file mode 100644 index 000000000..7123232eb --- /dev/null +++ b/.github/orchestrator/airstack-orchestrator.service @@ -0,0 +1,40 @@ +[Unit] +Description=AirStack CI Orchestrator (spawns ephemeral OpenStack runners) +Documentation=https://github.com/castacks/AirStack/tree/main/.github/orchestrator +After=network-online.target +Wants=network-online.target + +[Service] +Type=simple +User=orchestrator +Group=orchestrator +WorkingDirectory=/opt/airstack-orchestrator + +# Application credential lives in the orchestrator user's home so openstacksdk +# finds it via the default cloud-config search path. 
+Environment=HOME=/home/orchestrator +Environment=OS_CLIENT_CONFIG_FILE=/home/orchestrator/.config/openstack/clouds.yaml + +ExecStart=/opt/airstack-orchestrator/venv/bin/python \ + /opt/airstack-orchestrator/orchestrator.py \ + --config /etc/airstack-orchestrator/config.yaml \ + --pat /etc/airstack-orchestrator/github-pat \ + --state /var/lib/airstack-orchestrator/state.json \ + --template /opt/airstack-orchestrator/cloud-init.yaml.j2 + +Restart=always +RestartSec=10 + +# Allow draining loops on stop (SIGTERM handled by orchestrator.py). +TimeoutStopSec=30 +KillSignal=SIGTERM + +# Hardening +NoNewPrivileges=true +ProtectSystem=strict +ProtectHome=read-only +ReadWritePaths=/var/lib/airstack-orchestrator +PrivateTmp=true + +[Install] +WantedBy=multi-user.target diff --git a/.github/orchestrator/cloud-init.yaml.j2 b/.github/orchestrator/cloud-init.yaml.j2 new file mode 100644 index 000000000..921417c18 --- /dev/null +++ b/.github/orchestrator/cloud-init.yaml.j2 @@ -0,0 +1,71 @@ +#cloud-config +# Rendered per-spawn by orchestrator.py with two Jinja variables: +# encoded_jit_config - single-use base64 JIT config from GitHub +# runner_version - GitHub Actions runner version (e.g. 2.334.0) +# +# The base image (Ubuntu-24.04-GPU-Headless) already has NVIDIA drivers. +# This cloud-init adds Docker (with the compose plugin), nvidia-container-toolkit, +# downloads the GitHub Actions runner, registers it with the JIT config, runs +# exactly one job (the JIT config + --ephemeral makes the runner exit after one +# job), and shuts the VM down. The orchestrator then deletes the server. + +package_update: true +package_upgrade: false +packages: + - jq + - curl + - ca-certificates + - gnupg + +write_files: + - path: /usr/local/bin/airstack-runner-bootstrap.sh + permissions: "0755" + owner: root:root + content: | + #!/usr/bin/env bash + set -euxo pipefail + + # Install Docker (with compose plugin) from Docker's official channel. 
+ # get.docker.com handles apt repo setup + nvidia-container-toolkit-compatible + # docker-ce, plus the docker-compose-plugin we need for `airstack up`. + curl -fsSL https://get.docker.com | sh + + # nvidia-container-toolkit is required for GPU containers (liveliness / + # autonomy tests). The base image has the NVIDIA *drivers* but we still + # need the container runtime hooks here. + distribution=$(. /etc/os-release; echo "$ID$VERSION_ID") + curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey \ + | gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg + curl -fsSL "https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list" \ + | sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' \ + > /etc/apt/sources.list.d/nvidia-container-toolkit.list + apt-get update + apt-get install -y nvidia-container-toolkit + nvidia-ctk runtime configure --runtime=docker + systemctl restart docker + + usermod -aG docker ubuntu + + # GitHub Actions runner. + RUNNER_VERSION="{{ runner_version }}" + RUNNER_DIR=/home/ubuntu/actions-runner + mkdir -p "$RUNNER_DIR" + cd "$RUNNER_DIR" + curl -fsSL -o runner.tar.gz \ + "https://github.com/actions/runner/releases/download/v${RUNNER_VERSION}/actions-runner-linux-x64-${RUNNER_VERSION}.tar.gz" + tar xzf runner.tar.gz + rm runner.tar.gz + chown -R ubuntu:ubuntu "$RUNNER_DIR" + + # Run exactly one job under the ubuntu user. The JIT config is single-use + # and ephemeral, so run.sh exits after one job completes. + sudo -u ubuntu --preserve-env=HOME -H bash -c \ + "cd '$RUNNER_DIR' && ./run.sh --jitconfig '{{ encoded_jit_config }}'" \ + || echo "runner exited non-zero (job failure or runner error)" + + # Backstop: power down. The orchestrator's reap loop is the authoritative + # deleter — it sees the GitHub job complete and calls Nova delete. 
+ shutdown -h +1 + +runcmd: + - /usr/local/bin/airstack-runner-bootstrap.sh diff --git a/.github/orchestrator/config.example.yaml b/.github/orchestrator/config.example.yaml new file mode 100644 index 000000000..4a47bbe1f --- /dev/null +++ b/.github/orchestrator/config.example.yaml @@ -0,0 +1,87 @@ +# AirStack CI orchestrator configuration. +# Copy to /etc/airstack-orchestrator/config.yaml and fill in placeholders. + +# --- OpenStack target --- + +# Cloud profile name in ~/.config/openstack/clouds.yaml. +openstack_cloud: airstack + +# Ubuntu-24.04-Desktop (confirmed available on airlab-cloud). +image_id: 2ebb9061-8995-4238-a3cc-e230a3e863aa + +# OpenStack flavor with GPU + enough disk for Docker + sim images. +# Look up with: openstack flavor list +flavor_name: "gpu.rtxpro5000.1" + +# OpenStack network the ephemeral instance attaches to. Must allow outbound +# 443 to api.github.com (no inbound is required: the runner makes an outbound +# long-poll connection to GitHub). +network_name: "airstack.AirLab.Apps_group_network_gates" + +# OpenStack keypair injected into the instance for break-glass SSH access. +# The orchestrator never SSHes into workers itself. +keypair_name: "airstack-ci-cd" + +# Security group applied to spawned instances. Outbound 443 must be allowed. +security_group: "default" + +# OpenStack availability zone to spawn instances in (e.g. nova, gpu-zone-1). +# Leave empty to let Nova pick. +availability_zone: "gates" + +# If the chosen flavor has disk=0 (common for GPU flavors), Nova rejects +# direct image-boot with "Block Device Mapping is Invalid: You specified more +# local devices than the limit allows". Set this to >0 to boot from a Cinder +# volume of that size sourced from image_id (deleted on termination). Leave +# at 0 to boot directly from the image (only works for non-zero-disk flavors). +boot_volume_size_gb: 300 + +# Pre-allocated pool of floating IPs to rotate through for SSH access to +# workers. 
The orchestrator picks the first free IP from this list, in order, +# for each new spawn. When the worker is destroyed the IP auto-disassociates +# and returns to the pool. If non-empty, max_concurrent is capped at len(pool) +# so the orchestrator never spawns a worker it can't address. +# Allocate via: openstack floating ip create +# Leave empty to skip floating-IP attachment entirely. +floating_ips: [] +# Example: +# floating_ips: +# - 172.19.220.131 +# - 172.19.220.171 +# - 172.19.220.89 + +# --- GitHub --- + +# owner/name of the repo whose queued workflow_jobs to pick up. +repo: "castacks/AirStack" + +# Labels the orchestrator polls for. A queued workflow_job whose `labels` +# array is a superset of this list gets a server spawned for it. +runner_labels: + - self-hosted + - airstack-ephemeral + +# GitHub Actions runner version (must exist as a release tag at +# https://github.com/actions/runner/releases). +runner_version: "2.334.0" + +# --- Limits --- + +# Maximum simultaneous in-flight ephemeral instances. +max_concurrent: 3 + +# Hard ceiling for a single job. Past this age the reaper force-deletes the +# server even if GitHub still reports the job as in-progress. Must comfortably +# exceed the longest expected job (autonomy/liveliness runs). +max_job_minutes: 2880 # 48 hours + +# --- Polling intervals (seconds) --- + +spawn_poll_interval_s: 15 +reap_poll_interval_s: 30 + +# How long to wait for a freshly-created server to reach ACTIVE before +# treating the spawn as failed. If Nova flips the server to ERROR within this +# window the orchestrator logs the full fault (code/message/details/host/AZ) +# and deletes the server so the next iteration can retry cleanly. +server_active_timeout_s: 300 diff --git a/.github/orchestrator/orchestrator.py b/.github/orchestrator/orchestrator.py new file mode 100644 index 000000000..3e65e906f --- /dev/null +++ b/.github/orchestrator/orchestrator.py @@ -0,0 +1,698 @@ +#!/usr/bin/env python3 +"""AirStack CI orchestrator. 
+ +Polls the GitHub API for queued workflow_jobs whose labels match this +orchestrator's runner_labels, and spawns truly ephemeral OpenStack instances +to execute them. Each ephemeral instance receives a single-use GitHub JIT +runner config via cloud-init; the GitHub PAT never leaves this orchestrator. + +Two cooperating loops: + - spawn loop: discover queued jobs, spawn one Nova server per job + - reap loop: delete servers whose jobs have completed, plus stragglers + older than max_job_minutes and orphans not in state.json + +State persists in /var/lib/airstack-orchestrator/state.json so the +orchestrator can survive restarts without leaking instances. +""" + +from __future__ import annotations + +import argparse +import base64 +import json +import logging +import os +import signal +import sys +import threading +import time +from datetime import datetime, timezone +from pathlib import Path +from typing import Any + +import openstack +import requests +import yaml +from jinja2 import Template + +DEFAULT_CONFIG_PATH = "/etc/airstack-orchestrator/config.yaml" +DEFAULT_PAT_PATH = "/etc/airstack-orchestrator/github-pat" +DEFAULT_STATE_PATH = "/var/lib/airstack-orchestrator/state.json" +DEFAULT_TEMPLATE_PATH = "/opt/airstack-orchestrator/cloud-init.yaml.j2" + +# Metadata key/value applied to every Nova server we spawn. Used by the +# orphan reaper to identify servers we own even when state.json is missing. 
+ROLE_META_KEY = "airstack-role" +ROLE_META_VAL = "ephemeral-runner" +JOB_META_KEY = "airstack-job-id" + +GITHUB_API = "https://api.github.com" + +log = logging.getLogger("orchestrator") + + +def load_yaml(path: str) -> dict: + with open(path) as f: + return yaml.safe_load(f) + + +def load_pat(path: str) -> str: + with open(path) as f: + return f.read().strip() + + +def load_state(path: str) -> dict: + if not os.path.exists(path): + return {"jobs": {}} + with open(path) as f: + return json.load(f) + + +def save_state(path: str, state: dict) -> None: + Path(path).parent.mkdir(parents=True, exist_ok=True) + tmp = path + ".tmp" + with open(tmp, "w") as f: + json.dump(state, f, indent=2, sort_keys=True) + os.replace(tmp, path) + + +def gh_request(method: str, path: str, pat: str, **kwargs: Any) -> Any: + url = f"{GITHUB_API}{path}" + headers = kwargs.pop("headers", {}) + headers.update( + { + "Authorization": f"Bearer {pat}", + "Accept": "application/vnd.github+json", + "X-GitHub-Api-Version": "2022-11-28", + } + ) + r = requests.request(method, url, headers=headers, timeout=30, **kwargs) + r.raise_for_status() + if not r.text: + return None + return r.json() + + +def find_queued_jobs(repo: str, runner_labels: list[str], pat: str) -> list[dict]: + """Return queued workflow_jobs whose labels include all runner_labels.""" + runs = gh_request( + "GET", f"/repos/{repo}/actions/runs?status=queued&per_page=20", pat + ) + label_set = set(runner_labels) + matches: list[dict] = [] + for run in runs.get("workflow_runs", []): + jobs = gh_request("GET", f"/repos/{repo}/actions/runs/{run['id']}/jobs", pat) + for job in jobs.get("jobs", []): + if job.get("status") != "queued": + continue + if not label_set.issubset(set(job.get("labels", []))): + continue + if job.get("runner_id"): + continue + matches.append( + { + "job_id": str(job["id"]), + "run_id": run["id"], + "name": job["name"], + "labels": job["labels"], + } + ) + return matches + + +def mint_jit_config( + repo: str, 
runner_name: str, runner_labels: list[str], pat: str, + runner_group_id: int = 1, +) -> str: + body = { + "name": runner_name, + "runner_group_id": runner_group_id, + "labels": runner_labels, + } + resp = gh_request( + "POST", + f"/repos/{repo}/actions/runners/generate-jitconfig", + pat, + json=body, + ) + return resp["encoded_jit_config"] + + +def get_job_status(repo: str, job_id: str, pat: str) -> dict | None: + """Return the job dict, or None if 404 (job purged).""" + url = f"{GITHUB_API}/repos/{repo}/actions/jobs/{job_id}" + r = requests.get( + url, + headers={ + "Authorization": f"Bearer {pat}", + "Accept": "application/vnd.github+json", + "X-GitHub-Api-Version": "2022-11-28", + }, + timeout=30, + ) + if r.status_code == 404: + return None + r.raise_for_status() + return r.json() + + +def render_cloud_init(template_path: str, encoded_jit_config: str, + runner_version: str) -> str: + with open(template_path) as f: + tmpl = Template(f.read()) + return tmpl.render( + encoded_jit_config=encoded_jit_config, + runner_version=runner_version, + ) + + +def spawn_server( + conn: openstack.connection.Connection, + config: dict, + name: str, + job_id: str, + user_data: str, +) -> str: + flavor = conn.compute.find_flavor(config["flavor_name"], ignore_missing=False) + network = conn.network.find_network(config["network_name"], ignore_missing=False) + create_kwargs = dict( + name=name, + flavor_id=flavor.id, + networks=[{"uuid": network.id}], + key_name=config["keypair_name"], + security_groups=[{"name": config["security_group"]}], + user_data=base64.b64encode(user_data.encode()).decode(), + metadata={ + ROLE_META_KEY: ROLE_META_VAL, + JOB_META_KEY: job_id, + }, + ) + + # Flavors with disk=0 (typical for GPU flavors on this cloud) cannot boot + # directly from an image — Nova rejects with "Block Device Mapping is + # Invalid: You specified more local devices than the limit allows". 
When + # boot_volume_size_gb is set, boot from a Cinder volume sourced from the + # image and delete it on termination. Otherwise fall back to direct image + # boot (works only when the flavor has a non-zero root disk). + boot_volume_size_gb = int(config.get("boot_volume_size_gb") or 0) + if boot_volume_size_gb > 0: + create_kwargs["block_device_mapping"] = [ + { + "uuid": config["image_id"], + "source_type": "image", + "destination_type": "volume", + "boot_index": 0, + "volume_size": boot_volume_size_gb, + "delete_on_termination": True, + } + ] + else: + create_kwargs["image_id"] = config["image_id"] + + az = config.get("availability_zone") + if az: + create_kwargs["availability_zone"] = az + server = conn.compute.create_server(**create_kwargs) + return server.id + + +def delete_server(conn: openstack.connection.Connection, server_id: str) -> None: + try: + conn.compute.delete_server(server_id, ignore_missing=True, force=True) + except Exception as e: + log.warning("delete_server(%s) failed: %s", server_id, e) + + +def list_owned_servers(conn: openstack.connection.Connection) -> list[Any]: + """List all Nova servers that carry our role metadata.""" + owned = [] + for s in conn.compute.servers(details=True): + meta = getattr(s, "metadata", None) or {} + if meta.get(ROLE_META_KEY) == ROLE_META_VAL: + owned.append(s) + return owned + + +def find_free_floating_ip( + conn: openstack.connection.Connection, pool: list[str] +) -> Any: + """Return the FloatingIP resource for the first address in `pool` that is + not currently associated with any port. Returns None if all are in use. + + Iterates `pool` in order so attachments rotate through it sequentially. + Logs a warning for any pool member that doesn't exist in this project. 
+ """ + if not pool: + return None + pool_set = set(pool) + fips_by_addr: dict[str, Any] = {} + for fip in conn.network.ips(): + if fip.floating_ip_address in pool_set: + fips_by_addr[fip.floating_ip_address] = fip + missing = pool_set - fips_by_addr.keys() + if missing: + log.warning( + "floating_ips configured but not found in this project: %s", + sorted(missing), + ) + for addr in pool: + fip = fips_by_addr.get(addr) + if fip is not None and not fip.port_id: + return fip + return None + + +def check_flavor_capacity( + conn: openstack.connection.Connection, + flavor_name: str, +) -> tuple[bool, str]: + """Pre-flight: ask Nova's placement API whether any host can satisfy this + flavor's resource request right now. + + Returns (ok, reason). When ok=False the orchestrator should defer the + spawn iteration; reason is a one-line human-readable explanation + (e.g. "no host can satisfy {'VCPU': 8, 'MEMORY_MB': 32768, 'VGPU': 1}"). + + If the placement API can't be queried for any reason we return + (True, "") and let Nova make the call. The + pre-flight is a fast-path optimization, not a gate — Nova still has the + final say at create_server time (and ERROR-status fallback handles + anything we miss). + """ + try: + flavor = conn.compute.find_flavor(flavor_name, ignore_missing=False) + except Exception as e: + return True, f"flavor lookup failed: {e}" + + # Standard resources every Nova flavor expresses. + resources: dict[str, int] = {} + if getattr(flavor, "vcpus", 0): + resources["VCPU"] = int(flavor.vcpus) + if getattr(flavor, "ram", 0): + resources["MEMORY_MB"] = int(flavor.ram) + if getattr(flavor, "disk", 0): + resources["DISK_GB"] = int(flavor.disk) + + # Custom / specialized resources (VGPU, PCI_*, CUSTOM_*) come from the + # flavor's extra_specs as `resources:=`. This is how Nova + # itself learns to ask placement for GPU capacity. 
+ extra = getattr(flavor, "extra_specs", {}) or {} + for k, v in extra.items(): + if not k.startswith("resources:"): + continue + rc = k.split(":", 1)[1] + try: + resources[rc] = int(v) + except (TypeError, ValueError): + pass + + if not resources: + return True, "flavor expresses no resources — skipping placement check" + + try: + result = conn.placement.allocation_candidates( + resources=resources, limit=1, + ) + if hasattr(result, "allocation_requests"): + candidates = list(result.allocation_requests or []) + else: + candidates = list(result) + except Exception as e: + return True, f"placement query failed ({type(e).__name__}: {e})" + + if candidates: + return True, "" + return False, f"no host can satisfy {resources}" + + +def wait_for_server_active( + conn: openstack.connection.Connection, + server_id: str, + timeout_s: int = 300, + poll_interval_s: float = 3.0, +) -> Any: + """Poll Nova until the server is ACTIVE. Raise with full context if it + enters ERROR or never reaches ACTIVE in time. + + Nova surfaces the actual reason for an ERROR via the `fault` attribute + (message + code + details), so we log it verbatim. We also include + task_state / vm_state / power_state because Nova sometimes leaves the + fault empty and these tell you whether the failure was at scheduling, + networking, or block-device-mapping time. + """ + deadline = time.monotonic() + timeout_s + last_status = "?" 
+ last_task = None + while time.monotonic() < deadline: + s = conn.compute.get_server(server_id) + status = getattr(s, "status", "UNKNOWN") or "UNKNOWN" + task = ( + getattr(s, "task_state", None) + or getattr(s, "OS-EXT-STS:task_state", None) + ) + if status != last_status or task != last_task: + log.info( + "server %s status=%s task_state=%s", server_id, status, task, + ) + last_status, last_task = status, task + + if status == "ACTIVE": + return s + + if status == "ERROR": + fault = getattr(s, "fault", None) or {} + vm_state = ( + getattr(s, "vm_state", None) + or getattr(s, "OS-EXT-STS:vm_state", None) + ) + power_state = ( + getattr(s, "power_state", None) + or getattr(s, "OS-EXT-STS:power_state", None) + ) + host = getattr(s, "compute_host", None) or getattr( + s, "OS-EXT-SRV-ATTR:host", None + ) + az = getattr(s, "availability_zone", None) or getattr( + s, "OS-EXT-AZ:availability_zone", None + ) + raise RuntimeError( + "server " + + str(server_id) + + " entered ERROR: " + + f"fault.code={fault.get('code')!r} " + + f"fault.message={fault.get('message')!r} " + + f"fault.details={fault.get('details')!r} " + + f"task_state={task!r} vm_state={vm_state!r} " + + f"power_state={power_state!r} host={host!r} az={az!r}" + ) + + time.sleep(poll_interval_s) + + raise RuntimeError( + f"server {server_id} did not reach ACTIVE within {timeout_s}s " + f"(last status={last_status!r} task_state={last_task!r})" + ) + + +def attach_floating_ip( + conn: openstack.connection.Connection, server_id: str, fip: Any +) -> str: + """Wait for the server to have a network port, then associate `fip`. 
+ Returns the floating IP address.""" + for _ in range(60): # ~120s + ports = list(conn.network.ports(device_id=server_id)) + if ports: + break + time.sleep(2) + else: + raise RuntimeError(f"server {server_id} got no network port within 120s") + conn.network.update_ip(fip, port_id=ports[0].id) + return fip.floating_ip_address + + +def now_utc_iso() -> str: + return datetime.now(timezone.utc).isoformat() + + +def parse_iso(s: str) -> datetime: + return datetime.fromisoformat(s) + + +class Orchestrator: + def __init__(self, config: dict, pat: str, state_path: str, template_path: str): + self.config = config + self.pat = pat + self.state_path = state_path + self.template_path = template_path + self.conn = openstack.connect(cloud=config.get("openstack_cloud", "airstack")) + self.repo = config["repo"] + self.runner_labels = config["runner_labels"] + self.runner_version = config["runner_version"] + self.max_concurrent = int(config.get("max_concurrent", 3)) + self.floating_ips: list[str] = list(config.get("floating_ips") or []) + # Cap spawns to FIP pool size so we never queue jobs we can't address. + self.effective_max_concurrent = self.max_concurrent + if self.floating_ips: + self.effective_max_concurrent = min( + self.max_concurrent, len(self.floating_ips) + ) + self.max_job_minutes = int(config.get("max_job_minutes", 90)) + self.spawn_interval = int(config.get("spawn_poll_interval_s", 15)) + self.reap_interval = int(config.get("reap_poll_interval_s", 30)) + self.stop_evt = threading.Event() + + def stop(self, *_: Any) -> None: + log.info("stop signal received; draining loops") + self.stop_evt.set() + + def spawn_once(self) -> None: + state = load_state(self.state_path) + active = len(state["jobs"]) + if active >= self.effective_max_concurrent: + return + try: + queued = find_queued_jobs(self.repo, self.runner_labels, self.pat) + except Exception as e: + log.warning("find_queued_jobs failed: %s", e) + return + + # Pre-flight capacity check via placement API. 
Every queued job uses + # the same flavor, so we check once per iteration. When OpenStack is + # out of GPUs / vCPU / RAM we defer the whole iteration — better than + # burning JIT tokens on creates that Nova will flip to ERROR. The + # next iteration retries automatically. + if queued: + ok, reason = check_flavor_capacity( + self.conn, self.config["flavor_name"] + ) + if not ok: + log.warning( + "deferring spawn — OpenStack capacity unavailable: %s. " + "Will retry in %ds.", + reason, self.spawn_interval, + ) + return + elif reason: + # Soft-skip path: placement check couldn't run (e.g. older + # Nova). Surface why so it's debuggable, then proceed. + log.debug("placement pre-flight: %s", reason) + + for job in queued: + if active >= self.effective_max_concurrent: + break + job_id = job["job_id"] + if job_id in state["jobs"]: + continue + + # Pre-check FIP availability before minting a JIT token so we + # don't burn one when there's nowhere to attach the worker. + reserved_fip = None + if self.floating_ips: + reserved_fip = find_free_floating_ip(self.conn, self.floating_ips) + if reserved_fip is None: + log.warning( + "no free floating IP in pool (%d configured); " + "deferring spawns until one frees up", + len(self.floating_ips), + ) + break + + ts = int(time.time()) + runner_name = f"ephemeral-{job_id}-{ts}" + server_id: str | None = None + try: + jit = mint_jit_config( + self.repo, runner_name, self.runner_labels, self.pat + ) + user_data = render_cloud_init( + self.template_path, jit, self.runner_version + ) + server_id = spawn_server( + self.conn, self.config, runner_name, job_id, user_data + ) + # Don't move on until Nova reports ACTIVE. If it transitions + # to ERROR, this raises with the Nova fault details so the + # operator can see *why* the spawn failed (quota, scheduling, + # block-device-mapping, networking, etc.). 
+ wait_for_server_active( + self.conn, + server_id, + timeout_s=int(self.config.get("server_active_timeout_s", 300)), + ) + except Exception as e: + # Tag capacity-related Nova faults so log-grepping for + # "capacity unavailable" finds both the pre-flight defer and + # the post-create fallback (e.g. PCI passthrough that + # placement doesn't track). + msg = str(e).lower() + capacity_markers = ( + "no valid host", + "insufficient", + "quotaexceeded", + "out of resource", + "no host can satisfy", + "no allocation candidates", + ) + if any(m in msg for m in capacity_markers): + log.warning( + "spawn failed for job %s — OpenStack capacity " + "unavailable (post-create): %s. Will retry in %ds.", + job_id, e, self.spawn_interval, + ) + else: + log.exception("spawn failed for job %s: %s", job_id, e) + if server_id: + log.warning( + "deleting failed server %s to release its volume / FIP", + server_id, + ) + delete_server(self.conn, server_id) + continue + + floating_ip_addr: str | None = None + if reserved_fip is not None: + try: + floating_ip_addr = attach_floating_ip( + self.conn, server_id, reserved_fip + ) + log.info( + "attached floating IP %s to server %s (job %s)", + floating_ip_addr, server_id, job_id, + ) + except Exception as e: + log.exception( + "FIP attach failed for server %s; deleting to avoid " + "leaking a worker without external access: %s", + server_id, e, + ) + delete_server(self.conn, server_id) + continue + + state["jobs"][job_id] = { + "run_id": job["run_id"], + "server_id": server_id, + "runner_name": runner_name, + "spawned_at": now_utc_iso(), + "name": job["name"], + "floating_ip": floating_ip_addr, + } + save_state(self.state_path, state) + active += 1 + log.info( + "spawned server %s for job %s (%s)", server_id, job_id, job["name"] + ) + + def reap_once(self) -> None: + state = load_state(self.state_path) + now = datetime.now(timezone.utc) + + # 1. Delete servers for completed jobs. 
+ for job_id in list(state["jobs"].keys()): + entry = state["jobs"][job_id] + try: + job = get_job_status(self.repo, job_id, self.pat) + except Exception as e: + log.warning("get_job_status(%s) failed: %s", job_id, e) + continue + if job is None or job.get("status") == "completed": + log.info("reaping server %s (job %s done)", entry["server_id"], job_id) + delete_server(self.conn, entry["server_id"]) + del state["jobs"][job_id] + continue + + # 2. Force-reap stragglers older than max_job_minutes. + spawned = parse_iso(entry["spawned_at"]) + age_min = (now - spawned).total_seconds() / 60.0 + if age_min > self.max_job_minutes: + log.warning( + "force-reaping server %s (job %s age %.1fm > %dm)", + entry["server_id"], job_id, age_min, self.max_job_minutes, + ) + delete_server(self.conn, entry["server_id"]) + del state["jobs"][job_id] + + save_state(self.state_path, state) + + # 3. Orphan sweep: any server we own that isn't in state and isn't + # in the brief just-spawned window. Catches state.json wipes and + # crashes between spawn and save_state. + try: + owned = list_owned_servers(self.conn) + except Exception as e: + log.warning("list_owned_servers failed: %s", e) + return + tracked_ids = {e["server_id"] for e in state["jobs"].values()} + for s in owned: + if s.id in tracked_ids: + continue + created = getattr(s, "created_at", None) + if created: + try: + age_min = (now - parse_iso(created.replace("Z", "+00:00"))).total_seconds() / 60.0 + except Exception: + age_min = self.max_job_minutes + 1 + else: + age_min = self.max_job_minutes + 1 + # Only reap orphans that have lived past one spawn interval + # (to avoid racing our own freshly-created server). 
+ if age_min < 2: + continue + log.warning( + "orphan-reaping server %s (not in state, age %.1fm)", s.id, age_min + ) + delete_server(self.conn, s.id) + + def run(self) -> None: + log.info( + "orchestrator started: repo=%s labels=%s max_concurrent=%d " + "(effective=%d, floating_ip_pool=%d)", + self.repo, self.runner_labels, self.max_concurrent, + self.effective_max_concurrent, len(self.floating_ips), + ) + last_spawn = 0.0 + last_reap = 0.0 + while not self.stop_evt.is_set(): + now = time.monotonic() + if now - last_spawn >= self.spawn_interval: + try: + self.spawn_once() + except Exception: + log.exception("spawn loop iteration failed") + last_spawn = now + if now - last_reap >= self.reap_interval: + try: + self.reap_once() + except Exception: + log.exception("reap loop iteration failed") + last_reap = now + self.stop_evt.wait(timeout=1.0) + log.info("orchestrator stopped") + + +def main() -> int: + parser = argparse.ArgumentParser() + parser.add_argument("--config", default=DEFAULT_CONFIG_PATH) + parser.add_argument("--pat", default=DEFAULT_PAT_PATH) + parser.add_argument("--state", default=DEFAULT_STATE_PATH) + parser.add_argument("--template", default=DEFAULT_TEMPLATE_PATH) + parser.add_argument("--log-level", default="INFO") + args = parser.parse_args() + + logging.basicConfig( + level=getattr(logging, args.log_level.upper()), + format="%(asctime)s %(levelname)s %(name)s: %(message)s", + stream=sys.stdout, + ) + + config = load_yaml(args.config) + pat = load_pat(args.pat) + + orch = Orchestrator(config, pat, args.state, args.template) + signal.signal(signal.SIGINT, orch.stop) + signal.signal(signal.SIGTERM, orch.stop) + orch.run() + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/.github/orchestrator/requirements.txt b/.github/orchestrator/requirements.txt new file mode 100644 index 000000000..b69702b59 --- /dev/null +++ b/.github/orchestrator/requirements.txt @@ -0,0 +1,4 @@ +openstacksdk>=3.0,<5 +requests>=2.31 +PyYAML>=6.0 
+Jinja2>=3.1 diff --git a/.github/orchestrator/setup.sh b/.github/orchestrator/setup.sh new file mode 100755 index 000000000..4803b4a33 --- /dev/null +++ b/.github/orchestrator/setup.sh @@ -0,0 +1,84 @@ +#!/usr/bin/env bash +# One-time orchestrator-VM setup. Run as root on the airstack-ci-cd-orchestrator +# OpenStack instance after cloning the repo. +# +# Pre-reqs (do these *before* running this script): +# 1. ~/.config/openstack/clouds.yaml staged for the orchestrator user +# (application credential — see .github/orchestrator/README.md). +# 2. /tmp/github-pat exists with the GitHub PAT contents. +# 3. This repo cloned somewhere readable (this script copies code out of +# its containing directory). + +set -euo pipefail + +INSTALL_DIR=/opt/airstack-orchestrator +CONFIG_DIR=/etc/airstack-orchestrator +STATE_DIR=/var/lib/airstack-orchestrator +USER_NAME=orchestrator + +REPO_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +if [[ $EUID -ne 0 ]]; then + echo "ERROR: setup.sh must run as root" >&2 + exit 1 +fi + +echo "==> Creating orchestrator user" +if ! 
id "$USER_NAME" >/dev/null 2>&1; then + useradd --system --create-home --shell /usr/sbin/nologin "$USER_NAME" +fi + +echo "==> Installing system packages" +apt-get update +apt-get install -y python3 python3-venv python3-pip + +echo "==> Creating directories" +install -d -o "$USER_NAME" -g "$USER_NAME" -m 0750 "$INSTALL_DIR" +install -d -o root -g "$USER_NAME" -m 0750 "$CONFIG_DIR" +install -d -o "$USER_NAME" -g "$USER_NAME" -m 0750 "$STATE_DIR" + +echo "==> Copying orchestrator files to $INSTALL_DIR" +install -o "$USER_NAME" -g "$USER_NAME" -m 0755 \ + "$REPO_DIR/orchestrator.py" "$INSTALL_DIR/orchestrator.py" +install -o "$USER_NAME" -g "$USER_NAME" -m 0644 \ + "$REPO_DIR/cloud-init.yaml.j2" "$INSTALL_DIR/cloud-init.yaml.j2" + +echo "==> Building Python venv" +sudo -u "$USER_NAME" python3 -m venv "$INSTALL_DIR/venv" +sudo -u "$USER_NAME" "$INSTALL_DIR/venv/bin/pip" install --upgrade pip +sudo -u "$USER_NAME" "$INSTALL_DIR/venv/bin/pip" install -r "$REPO_DIR/requirements.txt" + +echo "==> Staging config (if not present)" +if [[ ! -f "$CONFIG_DIR/config.yaml" ]]; then + install -o root -g "$USER_NAME" -m 0640 \ + "$REPO_DIR/config.example.yaml" "$CONFIG_DIR/config.yaml" + echo " config.yaml installed from example — edit before starting service" +fi + +echo "==> Installing GitHub PAT (from /tmp/github-pat)" +if [[ ! -f /tmp/github-pat ]]; then + echo "ERROR: /tmp/github-pat not found. scp it over before running setup." >&2 + exit 1 +fi +install -o root -g "$USER_NAME" -m 0640 /tmp/github-pat "$CONFIG_DIR/github-pat" +shred -u /tmp/github-pat + +echo "==> Verifying clouds.yaml" +CLOUDS_YAML="/home/$USER_NAME/.config/openstack/clouds.yaml" +if [[ ! -f "$CLOUDS_YAML" ]]; then + echo "WARNING: $CLOUDS_YAML missing." >&2 + echo " Create it (application credential) before starting the service." 
>&2 +fi + +echo "==> Installing systemd unit" +install -o root -g root -m 0644 \ + "$REPO_DIR/airstack-orchestrator.service" \ + /etc/systemd/system/airstack-orchestrator.service +systemctl daemon-reload + +echo +echo "Setup complete. Next steps:" +echo " 1. Edit $CONFIG_DIR/config.yaml — fill flavor/network/keypair/security_group." +echo " 2. Verify $CLOUDS_YAML exists with the application credential." +echo " 3. systemctl enable --now airstack-orchestrator.service" +echo " 4. journalctl -u airstack-orchestrator.service -f" diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index bd565c0bb..973f0a7d3 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -1,18 +1,53 @@ -# What does this pull request do? +# What features did you add and/or bugs did you address? +- Which GitHub issue does this address? -Which issue number does this address? +- Additional description if not fully described in the GitHub issue -Add videos and images if possible. +- Please add videos and images to demonstrate the feature. Please upload videos to somewhere persistent (e.g. YouTube or Vimeo) for archival purposes. # How did you implement it? +- Algorithm details, design decisions, engineering notes, and any other relevant information about the implementation should be included -# Testing -**How do you run the tests?** +# How do you run and use it? +- What commands and button presses do you use to manually launch the stack to use your new feature? -**What do the tests do?** +- Write a detailed procedure with EXACT BASH COMMANDS so that another maintainer can replicate and understand the benefits of your feature, and reproduce the videos and images you added above. -**What are the expected results of the tests?** -# Did you update the docs (and where)? +# Testing with PyTest + -(FYI Docs are updated via mkdocs.yml and markdown files under `docs/`. It should render at localhost:8000 when you run `docker compose up docs`.) 
+- What pytests did you add to ensure the feature is reliable and robust? What metrics are used? + +- What's the exact command to run the pytests that test your feature? i.e. `airstack test -m ...` + +- What are the expected results of the tests? What should a maintainer look at to understand whether the test succeeded? + + + + +# Documentation + +- Was mkdocs.yml updated? (y/n) + +- Do the docs have sufficient scope such that a newcomer can easily reproduce and use your feature? + +- Is there sufficient visual media? + + + +# Versioning +- Did you make sure to bump the [version number](https://github.com/castacks/AirStack/blob/main/.env#L15) in the `.env` file according to [semantic versioning](https://semver.org/)? + + diff --git a/.github/runners/airstack-runner.service b/.github/runners/airstack-runner.service deleted file mode 100644 index 89cfe4d40..000000000 --- a/.github/runners/airstack-runner.service +++ /dev/null @@ -1,26 +0,0 @@ -[Unit] -Description=AirStack GitHub Actions Runner (ephemeral) -Documentation=https://docs.github.com/en/actions/hosting-your-own-runners -After=network-online.target docker.service -Wants=network-online.target - -[Service] -User=runner -Group=runner -WorkingDirectory=/opt/actions-runner - -# Place runtime configuration here (REPO_URL, RUNNER_LABELS, etc.). -# Each line: KEY=value (no quotes needed, no export keyword) -EnvironmentFile=/etc/github-runner-env - -ExecStart=/opt/actions-runner/register-runner.sh - -# Restart unconditionally so the loop survives transient API errors or reboots. -Restart=always -RestartSec=5 - -# Give the runner enough time to finish a long job before systemd kills it. 
-TimeoutStopSec=120 - -[Install] -WantedBy=multi-user.target diff --git a/.github/runners/register-runner.sh b/.github/runners/register-runner.sh deleted file mode 100644 index 1b5e0f3ac..000000000 --- a/.github/runners/register-runner.sh +++ /dev/null @@ -1,96 +0,0 @@ -#!/usr/bin/env bash -# AirStack ephemeral GitHub Actions runner loop. -# -# Registers a fresh runner, executes exactly one job, then loops to re-register. -# The --ephemeral flag tells the GitHub API to remove the runner after one job, -# preventing stale registrations and cross-job state pollution. -# -# Setup (one-time, on the OpenStack VM): -# 1. Create a non-root runner user: -# sudo useradd -m -s /bin/bash runner -# sudo usermod -aG docker runner -# -# 2. Download and unpack the GitHub Actions runner into RUNNER_DIR: -# sudo mkdir -p /opt/actions-runner -# cd /opt/actions-runner -# # Get the latest runner URL from: -# # https://github.com/actions/runner/releases -# curl -Lo actions-runner.tar.gz -# tar xzf actions-runner.tar.gz -# sudo chown -R runner:runner /opt/actions-runner -# -# 3. Store a GitHub PAT (repo scope for private repos, public_repo for public): -# echo "ghp_YOUR_TOKEN_HERE" | sudo tee /etc/github-runner-pat -# sudo chmod 600 /etc/github-runner-pat -# sudo chown runner:runner /etc/github-runner-pat -# -# 4. Copy this script into the runner directory and make it executable: -# sudo cp register-runner.sh /opt/actions-runner/register-runner.sh -# sudo chown runner:runner /opt/actions-runner/register-runner.sh -# sudo chmod +x /opt/actions-runner/register-runner.sh -# -# 5. Install the systemd unit (see airstack-runner.service) and enable it: -# sudo cp airstack-runner.service /etc/systemd/system/ -# sudo systemctl daemon-reload -# sudo systemctl enable --now airstack-runner.service -# -# Configuration: set these in /etc/github-runner-env (loaded by the systemd unit) -# or export them before running this script manually. 
- -set -euo pipefail - -REPO_URL="${REPO_URL:-https://github.com/YOUR_ORG/AirStack}" -# Derived from REPO_URL for the registration token API call, e.g. "YOUR_ORG/AirStack" -REPO_PATH="${REPO_PATH:-$(echo "$REPO_URL" | sed 's|https://github.com/||')}" -RUNNER_DIR="${RUNNER_DIR:-/opt/actions-runner}" -PAT_FILE="${PAT_FILE:-/etc/github-runner-pat}" -RUNNER_LABELS="${RUNNER_LABELS:-self-hosted,airstack,gpu}" -RUNNER_GROUP="${RUNNER_GROUP:-Default}" - -if [ ! -f "$PAT_FILE" ]; then - echo "ERROR: PAT file not found at $PAT_FILE" >&2 - exit 1 -fi - -echo "Starting ephemeral runner loop for $REPO_URL" - -while true; do - echo "[$(date -u +%FT%TZ)] Requesting registration token..." - - TOKEN=$(curl -sf -X POST \ - -H "Authorization: token $(cat "$PAT_FILE")" \ - -H "Accept: application/vnd.github+json" \ - "https://api.github.com/repos/${REPO_PATH}/actions/runners/registration-token" \ - | jq -r .token) - - if [ -z "$TOKEN" ] || [ "$TOKEN" = "null" ]; then - echo "ERROR: Failed to obtain registration token. Check PAT and repo path." >&2 - sleep 30 - continue - fi - - echo "[$(date -u +%FT%TZ)] Configuring runner (ephemeral)..." - - # --ephemeral: runner de-registers itself after completing one job. - # --replace: allows re-registration with the same name after a restart. - # Runner name encodes hostname + PID so parallel instances are unique. - "$RUNNER_DIR/config.sh" \ - --url "$REPO_URL" \ - --token "$TOKEN" \ - --name "openstack-$(hostname -s)-$$" \ - --labels "$RUNNER_LABELS" \ - --runnergroup "$RUNNER_GROUP" \ - --ephemeral \ - --unattended \ - --replace - - echo "[$(date -u +%FT%TZ)] Runner configured. Waiting for a job..." - - # run.sh blocks until the job completes, then returns (ephemeral runner exits cleanly). - "$RUNNER_DIR/run.sh" || true - - echo "[$(date -u +%FT%TZ)] Job finished. Re-registering..." - - # Brief pause to avoid hammering the API if config.sh / run.sh fail immediately. 
- sleep 2 -done diff --git a/.github/workflows/docker-build.yml b/.github/workflows/docker-build.yml index d6bb659d8..add5f5953 100644 --- a/.github/workflows/docker-build.yml +++ b/.github/workflows/docker-build.yml @@ -5,6 +5,16 @@ on: branches: [ main, develop ] # Adjust branches as needed paths: - '.env' + workflow_dispatch: + inputs: + compose_profiles: + description: 'Docker Compose profiles to build (comma-separated)' + required: false + default: 'desktop,isaac-sim,ms-airsim' + type: string + +env: + DEFAULT_PROFILES: 'desktop,isaac-sim,ms-airsim' jobs: check-docker-tag-change: @@ -16,6 +26,7 @@ jobs: uses: actions/checkout@v4 with: fetch-depth: 2 # Fetch current and previous commit + submodules: recursive - name: Check if VERSION changed id: check-changes @@ -40,20 +51,29 @@ jobs: docker-build: needs: check-docker-tag-change - if: needs.check-docker-tag-change.outputs.tag-changed == 'true' - runs-on: ubuntu-latest + if: github.event_name == 'workflow_dispatch' || needs.check-docker-tag-change.outputs.tag-changed == 'true' + runs-on: [self-hosted, airstack-ephemeral] + timeout-minutes: 120 + permissions: + contents: read + id-token: write # Required for keyless cosign signing via GitHub OIDC steps: - name: Checkout code uses: actions/checkout@v4 + with: + submodules: recursive - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 + - name: Install Cosign + uses: sigstore/cosign-installer@v3 + - name: Log in to Docker Registry uses: docker/login-action@v3 with: - registry: ${{ secrets.DOCKER_REGISTRY_URL }} # e.g., your-registry.com or leave empty for Docker Hub - username: ${{ secrets.DOCKER_REGISTRY_USERNAME }} + registry: ${{ vars.DOCKER_REGISTRY_URL }} # e.g., your-registry.com or leave empty for Docker Hub + username: ${{ vars.DOCKER_REGISTRY_USERNAME }} password: ${{ secrets.DOCKER_REGISTRY_PASSWORD }} - name: Verify .env file and extract tag @@ -63,6 +83,12 @@ jobs: echo "Error: .env file not found" exit 1 fi + + - name: Create dummy 
omni_pass.env + run: | + # Some compose files expect this file to exist, even if it is empty. + mkdir -p simulation/isaac-sim/docker + : > simulation/isaac-sim/docker/omni_pass.env # Display the current VERSION for debugging DOCKER_TAG=$(grep "^VERSION=" .env | cut -d '=' -f2- | tr -d '"' | tr -d "'") @@ -80,6 +106,13 @@ jobs: source .env set +a # Stop exporting + # Always override COMPOSE_PROFILES for all trigger types + if [ "${{ github.event_name }}" == "workflow_dispatch" ]; then + export COMPOSE_PROFILES="${{ github.event.inputs.compose_profiles || env.DEFAULT_PROFILES }}" + else + export COMPOSE_PROFILES="${{ env.DEFAULT_PROFILES }}" + fi + docker compose build - name: Run Docker Compose Push @@ -89,8 +122,71 @@ jobs: source .env set +a # Stop exporting + # Always override COMPOSE_PROFILES for all trigger types + if [ "${{ github.event_name }}" == "workflow_dispatch" ]; then + export COMPOSE_PROFILES="${{ github.event.inputs.compose_profiles || env.DEFAULT_PROFILES }}" + else + export COMPOSE_PROFILES="${{ env.DEFAULT_PROFILES }}" + fi + docker compose push + - name: Sign pushed images with Cosign (keyless) + env: + COSIGN_YES: "true" + run: | + set -a + source .env + set +a + + if [ "${{ github.event_name }}" == "workflow_dispatch" ]; then + export COMPOSE_PROFILES="${{ github.event.inputs.compose_profiles || env.DEFAULT_PROFILES }}" + else + export COMPOSE_PROFILES="${{ env.DEFAULT_PROFILES }}" + fi + + IMAGES=$(docker compose config --images | sort -u) + if [ -z "$IMAGES" ]; then + echo "No images resolved from compose config; nothing to sign." 
+ exit 1 + fi + + for IMG in $IMAGES; do + DIGEST=$(docker buildx imagetools inspect "$IMG" --format '{{.Manifest.Digest}}') + if [ -z "$DIGEST" ]; then + echo "Error: could not resolve digest for $IMG" + exit 1 + fi + REPO="${IMG%:*}" + REF="${REPO}@${DIGEST}" + echo "Signing $REF" + cosign sign "$REF" + done + + - name: Verify Cosign signatures + run: | + set -a + source .env + set +a + + if [ "${{ github.event_name }}" == "workflow_dispatch" ]; then + export COMPOSE_PROFILES="${{ github.event.inputs.compose_profiles || env.DEFAULT_PROFILES }}" + else + export COMPOSE_PROFILES="${{ env.DEFAULT_PROFILES }}" + fi + + IMAGES=$(docker compose config --images | sort -u) + for IMG in $IMAGES; do + DIGEST=$(docker buildx imagetools inspect "$IMG" --format '{{.Manifest.Digest}}') + REPO="${IMG%:*}" + REF="${REPO}@${DIGEST}" + echo "Verifying $REF" + cosign verify "$REF" \ + --certificate-identity-regexp "^https://github\\.com/${{ github.repository }}/\\.github/workflows/docker-build\\.yml@" \ + --certificate-oidc-issuer "https://token.actions.githubusercontent.com" \ + > /dev/null + done + - name: Optional - Run Docker Compose Up (uncomment if needed) run: | # Uncomment the following lines if you also want to start the services @@ -101,7 +197,7 @@ jobs: notify: needs: [check-docker-tag-change, docker-build] - if: always() && needs.check-docker-tag-change.outputs.tag-changed == 'true' + if: always() && (github.event_name == 'workflow_dispatch' || needs.check-docker-tag-change.outputs.tag-changed == 'true') runs-on: ubuntu-latest steps: - name: Notify build and push result diff --git a/.github/workflows/enforce-branch-targets.yaml b/.github/workflows/enforce-branch-targets.yaml new file mode 100644 index 000000000..bf156658d --- /dev/null +++ b/.github/workflows/enforce-branch-targets.yaml @@ -0,0 +1,30 @@ +name: Enforce Branch Targets + +on: + pull_request: + branches: [main, develop] + +jobs: + check-target: + runs-on: ubuntu-latest + steps: + - name: Enforce PR target 
rules + run: | + BASE="${{ github.base_ref }}" + HEAD="${{ github.head_ref }}" + + if [[ "$BASE" == "main" ]]; then + if [[ "$HEAD" != hotfix/* && "$HEAD" != "develop" ]]; then + echo "❌ Only hotfix/* branches or develop can be PR'd to main. Got: $HEAD" + exit 1 + fi + fi + + if [[ "$BASE" == "develop" ]]; then + if [[ "$HEAD" == hotfix/* ]]; then + echo "❌ Hotfix branches should target main, not develop. Got: $HEAD" + exit 1 + fi + fi + + echo "✅ Branch target looks good." \ No newline at end of file diff --git a/.github/workflows/integration-tests.yml b/.github/workflows/integration-tests.yml deleted file mode 100644 index b90643c75..000000000 --- a/.github/workflows/integration-tests.yml +++ /dev/null @@ -1,221 +0,0 @@ -name: Integration Tests - -on: - pull_request: - branches: [main, develop] - workflow_dispatch: - inputs: - marks: - description: "pytest marks expression (e.g. liveliness, build_docker, 'liveliness or build_docker')" - default: liveliness - required: false - sim: - description: "Sim targets, comma-separated: msairsim, isaacsim" - default: msairsim - required: false - num_robots: - description: "Robot counts, comma-separated (e.g. 1,3)" - default: "1" - required: false - stress_iterations: - description: "Iterations per (sim, num_robots) config" - default: "1" - required: false - stable_duration: - description: "Seconds for test_stable polling window" - default: "120" - required: false - baseline_run_id: - description: "Run ID to use as baseline for metric comparison (blank = latest successful run on main)" - default: "" - required: false - -jobs: - run-tests: - name: Run Tests - runs-on: [self-hosted, airstack, gpu] - # Only run on PRs from the same repo (not forks) to prevent arbitrary code - # execution on the self-hosted runner from untrusted contributors. 
- if: > - github.event_name == 'workflow_dispatch' || - github.event.pull_request.head.repo.full_name == github.repository - timeout-minutes: 120 - steps: - - name: Checkout - uses: actions/checkout@v4 - - - name: Install test dependencies - run: pip3 install -r tests/requirements.txt - - - name: Ensure airstack.sh is executable - run: chmod +x airstack.sh - - - name: Run tests - env: - AIRSTACK_ROOT: ${{ github.workspace }} - DISPLAY: "" - run: | - if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then - MARKS="${{ inputs.marks }}" - SIM="${{ inputs.sim }}" - NUM_ROBOTS="${{ inputs.num_robots }}" - ITERATIONS="${{ inputs.stress_iterations }}" - STABLE="${{ inputs.stable_duration }}" - else - MARKS="build_docker or build_packages" - SIM="msairsim" - NUM_ROBOTS="1" - ITERATIONS="1" - STABLE="120" - fi - - pytest tests/ \ - -m "$MARKS" \ - --sim "$SIM" \ - --num-robots "$NUM_ROBOTS" \ - --stress-iterations "$ITERATIONS" \ - --stable-duration "$STABLE" \ - -v - - - name: Upload test results - uses: actions/upload-artifact@v4 - if: always() - with: - name: test-results-${{ github.sha }}-${{ github.run_id }} - path: tests/results/ - retention-days: 90 - - report: - name: Metrics Report - runs-on: ubuntu-latest - needs: run-tests - if: always() - permissions: - pull-requests: write - - steps: - - name: Checkout - uses: actions/checkout@v4 - - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: "3.12" - - - name: Install report dependencies - run: pip install tabulate - - - name: Download current test results - uses: actions/download-artifact@v4 - with: - name: test-results-${{ github.sha }}-${{ github.run_id }} - path: current-results/ - - # PR mode: fetch latest artifact from the base branch (e.g. 
develop or main) - - name: Download baseline results (PR) - if: github.event_name == 'pull_request' - uses: dawidd6/action-download-artifact@v6 - continue-on-error: true - with: - workflow: integration-tests.yml - branch: ${{ github.base_ref }} - name_is_regexp: true - name: "test-results-.*" - path: baseline-results/ - if_no_artifact_found: warn - - # Manual dispatch with explicit baseline run ID - - name: Download baseline results (manual, explicit run ID) - if: > - github.event_name == 'workflow_dispatch' && - inputs.baseline_run_id != '' - uses: actions/download-artifact@v4 - continue-on-error: true - with: - run-id: ${{ inputs.baseline_run_id }} - name_is_regexp: true - name: "test-results-.*" - path: baseline-results/ - - # Manual dispatch without explicit baseline: fetch latest from main - - name: Download baseline results (manual, latest main) - if: > - github.event_name == 'workflow_dispatch' && - inputs.baseline_run_id == '' - uses: dawidd6/action-download-artifact@v6 - continue-on-error: true - with: - workflow: integration-tests.yml - branch: main - name_is_regexp: true - name: "test-results-.*" - path: baseline-results/ - if_no_artifact_found: warn - - - name: Locate result directories - id: dirs - run: | - CURRENT=$(ls current-results/ 2>/dev/null | sort -r | head -1) - echo "current=current-results/$CURRENT" >> "$GITHUB_OUTPUT" - - BASELINE=$(ls baseline-results/ 2>/dev/null | sort -r | head -1) - if [ -n "$BASELINE" ]; then - echo "baseline=baseline-results/$BASELINE" >> "$GITHUB_OUTPUT" - else - echo "baseline=" >> "$GITHUB_OUTPUT" - fi - - - name: Generate metrics report - id: report - continue-on-error: true - run: | - CURRENT="${{ steps.dirs.outputs.current }}" - BASELINE="${{ steps.dirs.outputs.baseline }}" - - if [ -n "$BASELINE" ]; then - python tests/parse_metrics.py \ - --current "$CURRENT" \ - --baseline "$BASELINE" \ - --output report.md - else - python tests/parse_metrics.py \ - --current "$CURRENT" \ - --output report.md - fi - - - 
name: Post PR comment - if: github.event_name == 'pull_request' - uses: actions/github-script@v7 - with: - script: | - const fs = require('fs'); - let body; - try { - body = fs.readFileSync('report.md', 'utf8'); - } catch { - body = '_No metrics report generated._'; - } - const header = `## Test Metrics — \`${{ github.sha }}\`\n\n`; - await github.rest.issues.createComment({ - issue_number: context.issue.number, - owner: context.repo.owner, - repo: context.repo.repo, - body: header + body, - }); - - - name: Write job summary - if: always() - run: | - if [ -f report.md ]; then - echo "## Test Metrics — \`${{ github.sha }}\`" >> "$GITHUB_STEP_SUMMARY" - echo "" >> "$GITHUB_STEP_SUMMARY" - cat report.md >> "$GITHUB_STEP_SUMMARY" - else - echo "_No metrics report generated._" >> "$GITHUB_STEP_SUMMARY" - fi - - - name: Fail on regression - if: steps.report.outcome == 'failure' - run: | - echo "::error::Metric regression detected — see the report above for details." - exit 1 diff --git a/.github/workflows/sync-develop-from-main.yaml b/.github/workflows/sync-develop-from-main.yaml new file mode 100644 index 000000000..e875d8a43 --- /dev/null +++ b/.github/workflows/sync-develop-from-main.yaml @@ -0,0 +1,119 @@ +name: Sync main → develop + +on: + push: + branches: [main] + workflow_dispatch: + +jobs: + sync: + runs-on: ubuntu-latest + permissions: + contents: write + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + ref: develop + token: ${{ secrets.SYNC_PAT }} + + - name: Configure git + run: | + git config user.name "github-actions[bot]" + git config user.email "41898282+github-actions[bot]@users.noreply.github.com" + + - name: Merge main into develop + run: | + git fetch origin main + + # Merge main. If the only conflict is .env (VERSION line), keep + # develop's side — we overwrite it in the next step anyway. Any + # other conflict means a human needs to resolve manually. + if ! 
git merge origin/main --no-edit; then + CONFLICTS=$(git diff --name-only --diff-filter=U) + OTHER=$(echo "$CONFLICTS" | grep -v '^\.env$' || true) + if [ -n "$OTHER" ]; then + echo "::error::Non-.env merge conflicts; resolve sync manually." + echo "$CONFLICTS" + exit 1 + fi + git checkout --ours .env + git add .env + git -c core.editor=true commit --no-edit + fi + + # If develop already contained everything main has, nothing to do. + if git diff --quiet origin/develop..HEAD; then + echo "develop already up to date with main; nothing to sync." + echo "SKIP=true" >> $GITHUB_ENV + fi + + - name: Bump develop VERSION + if: env.SKIP != 'true' + run: | + python3 << 'PYEOF' + import os, re, subprocess, sys + + SEMVER_RE = re.compile(r'^(\d+)\.(\d+)\.(\d+)(?:-(alpha|beta|rc)\.(\d+))?$') + + def parse(v): + m = SEMVER_RE.fullmatch(v) + if not m: + sys.exit(f"::error::Invalid VERSION format: {v!r}") + return int(m[1]), int(m[2]), int(m[3]), m[4], int(m[5]) if m[5] else None + + def extract(content): + for line in content.splitlines(): + m = re.match(r'^VERSION\s*=\s*["\']?([^"\'#\s]+)', line) + if m: + return m.group(1) + return None + + main_env = subprocess.check_output(['git', 'show', 'origin/main:.env'], text=True) + main_ver = extract(main_env) + with open('.env') as f: + dev_ver = extract(f.read()) + + mM, mm, mp, _, _ = parse(main_ver) + dM, dn, dp, dpre, dpn = parse(dev_ver) + + # If main's x.y.z has caught up to (or passed) develop's base, + # develop just released — roll forward to the next minor's alpha.0. + # Otherwise main is a hotfix behind develop; preserve develop's + # pre-release channel and bump the counter. + if (mM, mm, mp) >= (dM, dn, dp): + new_ver = f"{mM}.{mm + 1}.0-alpha.0" + reason = "main caught up to develop's base — rolling to next minor" + else: + if dpre is None: + sys.exit( + f"::error::develop VERSION {dev_ver!r} has no pre-release " + "tag; cannot auto-bump. Bump develop manually." 
+ ) + new_ver = f"{dM}.{dn}.{dp}-{dpre}.{dpn + 1}" + reason = f"hotfix sync — bumping {dpre} counter" + + print(f"main: {main_ver}") + print(f"develop: {dev_ver}") + print(f"new: {new_ver} ({reason})") + + with open('.env') as f: + content = f.read() + new_content = re.sub( + r'^(VERSION\s*=\s*["\']?)[^"\'#\s]+', + lambda m: m.group(1) + new_ver, + content, count=1, flags=re.MULTILINE, + ) + with open('.env', 'w') as f: + f.write(new_content) + + with open(os.environ['GITHUB_ENV'], 'a') as f: + f.write(f"NEW_VERSION={new_ver}\n") + PYEOF + + git add .env + git commit -m "Bump VERSION to ${NEW_VERSION} after sync from main" + + - name: Push to develop + if: env.SKIP != 'true' + run: git push origin develop diff --git a/.github/workflows/system-tests.yml b/.github/workflows/system-tests.yml new file mode 100644 index 000000000..3ddebda58 --- /dev/null +++ b/.github/workflows/system-tests.yml @@ -0,0 +1,529 @@ +name: System Tests + +on: + pull_request: + types: [opened] + issue_comment: + types: [created] + workflow_dispatch: + inputs: + marks: + description: "pytest marks expression (e.g. 'build_docker', 'liveliness', 'takeoff_hover_land'). \ + Use 'or' to combine marks: 'liveliness or takeoff_hover_land'. Leave blank to run all marks. \ + Note: 'build_packages' is automatically prepended whenever any marks are specified, \ + to ensure code is built before launch tests run." + default: "liveliness or takeoff_hover_land" + required: false + sim: + description: "Sim targets, comma-separated: msairsim,isaacsim" + default: msairsim,isaacsim + required: false + num_robots: + description: "Robot counts, comma-separated (e.g. 
1,3)" + default: "1" + required: false + stress_iterations: + description: "Iterations per (sim, num_robots) config" + default: "1" + required: false + stable_duration: + description: "Seconds for test_stable polling window" + default: "120" + required: false + baseline_run_id: + description: "Run ID to use as baseline for metric comparison (blank = latest successful run on main)" + default: "" + required: false + +jobs: + run-tests: + name: Run Tests + runs-on: [self-hosted, airstack-ephemeral] + # Triggers: + # - workflow_dispatch (manual) + # - PR opened from the same repo (not a fork) — same-repo guard + # prevents arbitrary code execution on the self-hosted runner from + # untrusted contributors. + # - PR comment starting with `/pytest` from a user with write access + # (OWNER/MEMBER/COLLABORATOR). issue_comment fires for both issues + # and PRs; `issue.pull_request` disambiguates. The author_association + # gate is what keeps random commenters from running code on the + # self-hosted runner. + if: | + github.event_name == 'workflow_dispatch' || + (github.event_name == 'pull_request' && + github.event.pull_request.head.repo.full_name == github.repository) || + (github.event_name == 'issue_comment' && + github.event.issue.pull_request != null && + startsWith(github.event.comment.body, '/pytest') && + contains(fromJSON('["OWNER","MEMBER","COLLABORATOR"]'), github.event.comment.author_association)) + timeout-minutes: 120 + # Adding any `permissions:` entry disables GITHUB_TOKEN's defaults, so + # every scope used here has to be re-granted explicitly: + # checks:write — create/update the Check Run on the PR head + # contents:read — actions/checkout + # pull-requests:write — post the acknowledgment comment on the PR. 
+ # Even though the endpoint is /issues/{n}/comments, comments on + # PRs are gated by the pull-requests permission, not issues — the + # `x-accepted-github-permissions` header lists both as alternatives + # but only pull-requests:write actually works for PR comments. + permissions: + checks: write + contents: read + pull-requests: write + # Mirror the registry password into env so step-level `if:` expressions + # can check whether registry-cache mode is available — `secrets.*` itself + # is not addressable from `if:` expressions. + env: + DOCKER_REGISTRY_PASSWORD: ${{ secrets.DOCKER_REGISTRY_PASSWORD }} + steps: + # Uses actions/github-script (Node, bundled with the runner) instead + # of `gh` so we don't depend on system tools — the ephemeral + # self-hosted runner doesn't have gh/jq installed. + - name: Resolve PR head + if: github.event_name == 'issue_comment' + id: pr + uses: actions/github-script@v7 + with: + script: | + const pr = await github.rest.pulls.get({ + owner: context.repo.owner, + repo: context.repo.repo, + pull_number: context.issue.number, + }); + // Even though the commenter has write access, the PR's code + // lives on the head repo. If that's a fork, we'd be running + // untrusted code on the self-hosted runner. + const headRepo = pr.data.head.repo.full_name; + const expected = `${context.repo.owner}/${context.repo.repo}`; + if (headRepo !== expected) { + core.setFailed(`PR #${context.issue.number} is from a fork (${headRepo}); /pytest is not supported for forks.`); + return; + } + core.setOutput('head_sha', pr.data.head.sha); + core.setOutput('base_ref', pr.data.base.ref); + + # Parsed up-front (before checkout) so the acknowledgment comment below + # can echo the resolved args. This step only reads env vars, so it + # doesn't need the working tree. 
+ - name: Parse pytest args + id: parse + env: + GH_EVENT_NAME: ${{ github.event_name }} + COMMENT_BODY: ${{ github.event.comment.body }} + INPUT_MARKS: ${{ inputs.marks }} + INPUT_SIM: ${{ inputs.sim }} + INPUT_NUM_ROBOTS: ${{ inputs.num_robots }} + INPUT_ITERATIONS: ${{ inputs.stress_iterations }} + INPUT_STABLE: ${{ inputs.stable_duration }} + run: | + python3 <<'PYEOF' + # Resolve the pytest argv for the triggering event and publish + # pytest_args / sim / skip_image_prep as step outputs. + import os, shlex, sys + + event = os.environ['GH_EVENT_NAME'] + if event == 'workflow_dispatch': + args = [] + if (m := os.environ.get('INPUT_MARKS', '').strip()): + args.extend(['-m', m]) + if (s := os.environ.get('INPUT_SIM', '').strip()): + args.extend(['--sim', s]) + if (n := os.environ.get('INPUT_NUM_ROBOTS', '').strip()): + args.extend(['--num-robots', n]) + if (it := os.environ.get('INPUT_ITERATIONS', '').strip()): + args.extend(['--stress-iterations', it]) + if (st := os.environ.get('INPUT_STABLE', '').strip()): + args.extend(['--stable-duration', st]) + elif event == 'pull_request': + # PR-opened auto-run uses pytest's conftest defaults — same as + # /pytest with no args. + args = [] + else: + body = os.environ.get('COMMENT_BODY', '') + # Only the first line is parsed — everything below it is + # treated as freeform comment text (notes, context, etc.). + first_line = (body.splitlines()[0] if body else '').strip() + if not first_line.startswith('/pytest'): + print('::error::Comment does not start with /pytest', file=sys.stderr) + sys.exit(1) + args_line = first_line[len('/pytest'):].strip() + try: + args = shlex.split(args_line) + except ValueError as e: + print(f'::error::Could not parse pytest args from comment: {e}', file=sys.stderr) + sys.exit(1) + + # Pull out --sim and -m so the image-prep step can scope profiles + # and decide whether to skip (build_docker tests rebuild themselves). + # When --sim isn't given we mirror conftest's default so prep covers + # whatever pytest will actually exercise.
+ sim = 'msairsim,isaacsim' + marks = '' + marks_idx = -1 + for i, a in enumerate(args): + if a == '--sim' and i + 1 < len(args): + sim = args[i + 1] + elif a.startswith('--sim='): + # argparse also accepts the attached `--sim=VALUE` spelling; + # without this branch prep would fall back to the default profiles. + sim = a[len('--sim='):] + elif a == '-m' and i + 1 < len(args): + marks = args[i + 1] + marks_idx = i + 1 + + # When the user specified any marks, prepend build_packages so code + # is built before launch tests try to use it. Skipped when no marks + # are given (pytest runs everything including build_packages) and + # when build_packages is already in the expression. + if marks and 'build_packages' not in marks: + marks = f'build_packages or {marks}' + args[marks_idx] = marks + + # NOTE(review): substring match — an expression such as + # `not build_docker` also sets skip_prep; confirm that is intended. + skip_prep = 'build_docker' in marks + quoted = ' '.join(shlex.quote(a) for a in args) + + with open(os.environ['GITHUB_OUTPUT'], 'a') as f: + f.write(f'pytest_args={quoted}\n') + f.write(f'sim={sim}\n') + f.write(f'skip_image_prep={"true" if skip_prep else "false"}\n') + + print(f'Resolved pytest args: {quoted or "(none — pytest defaults)"}') + print(f'Resolved sim profile: {sim}') + print(f'Skip image prep: {skip_prep}') + PYEOF + + # Reply on the PR thread so the commenter sees their /pytest was + # picked up and can confirm we parsed the args correctly. The + # workflow_dispatch path skips this (no PR to comment on); the + # pull_request-opened path skips it too (the PR Checks tab is + # already showing the native run).
+ - name: Post acknowledgment comment + if: github.event_name == 'issue_comment' + uses: actions/github-script@v7 + with: + script: | + const args = ${{ toJSON(steps.parse.outputs.pytest_args) }}; + const cmd = `pytest tests/ ${args}`.trim(); + const runUrl = `${context.serverUrl}/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`; + const note = `Note: \`build_packages\` is automatically prepended whenever any marks are specified, to ensure code is built before launch tests run.`; + await github.rest.issues.createComment({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.issue.number, + body: `Running \`${cmd}\` — [view run](${runUrl}). Status will appear as a check on this PR.\n\n${note}`, + }); + + # Pin a Check Run on the PR's head SHA so the run shows up in the + # PR's "Checks" tab while it executes — issue_comment-triggered runs + # are otherwise associated with the default branch and don't surface + # on the PR. Finalized at end of job with the actual conclusion. + - name: Open in-progress check on PR head + if: github.event_name == 'issue_comment' + id: check_create + uses: actions/github-script@v7 + with: + script: | + const res = await github.rest.checks.create({ + owner: context.repo.owner, + repo: context.repo.repo, + name: 'System Tests', + head_sha: '${{ steps.pr.outputs.head_sha }}', + status: 'in_progress', + details_url: `${context.serverUrl}/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`, + }); + core.setOutput('id', res.data.id); + + - name: Checkout + uses: actions/checkout@v4 + with: + # For issue_comment we must check out the PR head explicitly — + # GITHUB_SHA points at the default branch for that event. For + # workflow_dispatch and pull_request, empty string lets checkout + # use its default (the dispatched ref / the PR merge commit). 
+ ref: ${{ github.event_name == 'issue_comment' && steps.pr.outputs.head_sha || '' }} + submodules: recursive + + - name: Create Isaac Sim omni_pass.env + run: | + mkdir -p simulation/isaac-sim/docker + cat > simulation/isaac-sim/docker/omni_pass.env <<'EOF' + OMNI_USER=guest + OMNI_PASS=guest + OMNI_SERVER="omniverse://airlab-nucleus.andrew.cmu.edu/NVIDIA/Assets/Isaac/5.1" + ACCEPT_EULA=Y + OMNI_ENV_PRIVACY_CONSENT=Y + EOF + + - name: Install test dependencies + # Ubuntu 24.04 marks the system Python as externally-managed (PEP 668), + # so `pip install` outside a venv is rejected. Use a venv and prepend + # its bin/ to $GITHUB_PATH so subsequent steps pick up `pytest` + # automatically. + run: | + sudo apt-get update -qq + sudo apt-get install -y --no-install-recommends python3-venv + python3 -m venv .venv + echo "$GITHUB_WORKSPACE/.venv/bin" >> "$GITHUB_PATH" + .venv/bin/pip install --upgrade pip + .venv/bin/pip install -r tests/requirements.txt + + # Optional registry-cache mode. When the secrets/vars are present we log + # in to the internal Docker registry; the next step then sets + # AIRSTACK_REGISTRY_CACHE=1 so airstack.sh pre-pulls + uses BuildKit + # inline cache (build_docker tests get layer-reuse speedup) and pre-pulls + # before `airstack up` (other tests skip the implicit rebuild). When + # secrets are absent both steps are skipped and behavior is unchanged. 
+ - name: Log in to internal Docker registry + id: docker_login + if: ${{ vars.DOCKER_REGISTRY_URL != '' && env.DOCKER_REGISTRY_PASSWORD != '' }} + uses: docker/login-action@v3 + with: + registry: ${{ vars.DOCKER_REGISTRY_URL }} + username: ${{ vars.DOCKER_REGISTRY_USERNAME }} + password: ${{ secrets.DOCKER_REGISTRY_PASSWORD }} + + - name: Enable registry-cache mode + if: ${{ steps.docker_login.outcome == 'success' }} + run: echo "AIRSTACK_REGISTRY_CACHE=1" >> "$GITHUB_ENV" + + - name: Ensure airstack.sh is executable + run: chmod +x airstack.sh + + # The ephemeral runner starts with no local images. `airstack_env` in + # tests/conftest.py fails fast if compose images are missing, so prep + # them here. Profile-gated services (ms-airsim, isaac-sim) are skipped + # by compose unless their profile is active, so we mirror the fixture's + # profile selection from the parsed --sim. Pull-only by default; fall + # back to a full build only if the registry doesn't have everything + # (e.g. new branch with no published image yet). Skipped when the + # marks expression contains build_docker — those tests build per-service + # themselves. + - name: Ensure Docker images present + if: ${{ steps.parse.outputs.skip_image_prep != 'true' }} + env: + AIRSTACK_ROOT: ${{ github.workspace }} + SIM_INPUT: ${{ steps.parse.outputs.sim }} + run: | + profiles=desktop + [[ ",$SIM_INPUT," == *,msairsim,* ]] && profiles="$profiles,ms-airsim" + [[ ",$SIM_INPUT," == *,isaacsim,* ]] && profiles="$profiles,isaac-sim" + export COMPOSE_PROFILES="$profiles" + echo "Pulling images for COMPOSE_PROFILES=$COMPOSE_PROFILES" + + # Pull from registry; tolerate per-image failures so we can detect + # what's still missing afterwards instead of aborting on the first + # gap. `--progress=quiet` suppresses per-layer progress; errors + # still surface on stderr. 
+ ./airstack.sh --progress=quiet image-pull --ignore-pull-failures || true + + missing=() + while IFS= read -r img; do + [[ -z "$img" ]] && continue + if ! docker image inspect "$img" --format '{{.Id}}' >/dev/null 2>&1; then + missing+=("$img") + fi + done < <(docker compose -f docker-compose.yaml config --images) + + if (( ${#missing[@]} > 0 )); then + echo "Pull did not produce these images; falling back to build:" + printf ' - %s\n' "${missing[@]}" + ./airstack.sh --progress=quiet image-build + else + echo "All required images present after pull — skipping build." + fi + + - name: Run tests + env: + AIRSTACK_ROOT: ${{ github.workspace }} + DISPLAY: "" + PYTEST_ARGS: ${{ steps.parse.outputs.pytest_args }} + run: | + # Re-split the shell-quoted args from the parse step so we forward + # them to pytest as a proper argv list (preserving values like + # `-m 'a or b'`). Empty PYTEST_ARGS yields an empty array, so + # pytest falls back to its conftest defaults. + mapfile -t ARGS < <(python3 -c "import os, shlex; print('\n'.join(shlex.split(os.environ['PYTEST_ARGS'])))") + pytest tests/ \ + "${ARGS[@]}" \ + -v -s \ + --log-cli-level=INFO \ + --log-cli-format='%(asctime)s [%(levelname)s] %(name)s: %(message)s' \ + --log-cli-date-format='%H:%M:%S' + + - name: Upload test results + uses: actions/upload-artifact@v4 + if: always() + with: + name: test-results-${{ github.sha }}-${{ github.run_id }} + path: tests/results/ + retention-days: 90 + + # Close out the Check Run with the job's final conclusion. The + # `steps.check_create.outputs.id` guard skips this when the open + # step didn't run (workflow_dispatch) or failed before producing + # an id. 
+ - name: Finalize check on PR head + if: always() && github.event_name == 'issue_comment' && steps.check_create.outputs.id + uses: actions/github-script@v7 + with: + script: | + await github.rest.checks.update({ + owner: context.repo.owner, + repo: context.repo.repo, + check_run_id: ${{ steps.check_create.outputs.id }}, + status: 'completed', + conclusion: '${{ job.status }}', + details_url: `${context.serverUrl}/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`, + }); + + report: + name: Metrics Report + runs-on: ubuntu-latest + needs: run-tests + # Skip when run-tests was skipped (e.g., comment didn't match `/pytest`) + # so we don't post empty-report comments on every PR comment. + if: always() && needs.run-tests.result != 'skipped' + permissions: + pull-requests: write + + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + submodules: recursive + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.12" + + - name: Install report dependencies + run: pip install tabulate + + - name: Resolve PR base branch + if: github.event_name == 'issue_comment' || github.event_name == 'pull_request' + id: pr_ctx + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + if [ "${{ github.event_name }}" = "pull_request" ]; then + echo "base_ref=${{ github.base_ref }}" >> "$GITHUB_OUTPUT" + else + BASE=$(gh api repos/${{ github.repository }}/pulls/${{ github.event.issue.number }} --jq .base.ref) + echo "base_ref=$BASE" >> "$GITHUB_OUTPUT" + fi + + - name: Download current test results + uses: actions/download-artifact@v4 + with: + name: test-results-${{ github.sha }}-${{ github.run_id }} + path: current-results/ + + # PR mode (opened or comment-triggered): fetch latest artifact from + # the PR's base branch (e.g. develop or main). 
+ - name: Download baseline results (PR) + if: github.event_name == 'issue_comment' || github.event_name == 'pull_request' + uses: dawidd6/action-download-artifact@v6 + continue-on-error: true + with: + workflow: system-tests.yml + branch: ${{ steps.pr_ctx.outputs.base_ref }} + name_is_regexp: true + name: "test-results-.*" + path: baseline-results/ + if_no_artifact_found: warn + + # Manual dispatch with explicit baseline run ID. Uses the same + # dawidd6 downloader as the other baseline steps: `name_is_regexp` + # and lookup by `run_id` are inputs of that action, whereas + # actions/download-artifact@v4 has no regexp mode. + - name: Download baseline results (manual, explicit run ID) + if: > + github.event_name == 'workflow_dispatch' && + inputs.baseline_run_id != '' + uses: dawidd6/action-download-artifact@v6 + continue-on-error: true + with: + run_id: ${{ inputs.baseline_run_id }} + name_is_regexp: true + name: "test-results-.*" + path: baseline-results/ + if_no_artifact_found: warn + + # Manual dispatch without explicit baseline: fetch latest from main + - name: Download baseline results (manual, latest main) + if: > + github.event_name == 'workflow_dispatch' && + inputs.baseline_run_id == '' + uses: dawidd6/action-download-artifact@v6 + continue-on-error: true + with: + workflow: system-tests.yml + branch: main + name_is_regexp: true + name: "test-results-.*" + path: baseline-results/ + if_no_artifact_found: warn + + - name: Locate result directories + id: dirs + # Find the dir holding results.xml. Nesting depth differs by downloader: + # actions/download-artifact@v4 (single name) extracts straight into the + # path, while dawidd6/action-download-artifact@v6 with name_is_regexp + # wraps each artifact in a subdir named after it. `find` handles both.
+ run: | + CURRENT_XML=$(find current-results/ -name results.xml 2>/dev/null | sort -r | head -1) + [ -n "$CURRENT_XML" ] && echo "current=$(dirname "$CURRENT_XML")" >> "$GITHUB_OUTPUT" + + BASELINE_XML=$(find baseline-results/ -name results.xml 2>/dev/null | sort -r | head -1) + if [ -n "$BASELINE_XML" ]; then + echo "baseline=$(dirname "$BASELINE_XML")" >> "$GITHUB_OUTPUT" + else + echo "baseline=" >> "$GITHUB_OUTPUT" + fi + + - name: Generate metrics report + id: report + continue-on-error: true + run: | + CURRENT="${{ steps.dirs.outputs.current }}" + BASELINE="${{ steps.dirs.outputs.baseline }}" + + if [ -n "$BASELINE" ]; then + python tests/parse_metrics.py \ + --current "$CURRENT" \ + --baseline "$BASELINE" \ + --output report.md + else + python tests/parse_metrics.py \ + --current "$CURRENT" \ + --output report.md + fi + + - name: Post PR comment + if: github.event_name == 'issue_comment' || github.event_name == 'pull_request' + uses: actions/github-script@v7 + with: + script: | + const fs = require('fs'); + let body; + try { + body = fs.readFileSync('report.md', 'utf8'); + } catch { + body = '_No metrics report generated._'; + } + const header = `## Test Metrics — \`${{ github.sha }}\`\n\n`; + await github.rest.issues.createComment({ + issue_number: context.issue.number, + owner: context.repo.owner, + repo: context.repo.repo, + body: header + body, + }); + + - name: Write job summary + if: always() + run: | + if [ -f report.md ]; then + echo "## Test Metrics — \`${{ github.sha }}\`" >> "$GITHUB_STEP_SUMMARY" + echo "" >> "$GITHUB_STEP_SUMMARY" + cat report.md >> "$GITHUB_STEP_SUMMARY" + else + echo "_No metrics report generated._" >> "$GITHUB_STEP_SUMMARY" + fi + + - name: Fail on regression + if: steps.report.outcome == 'failure' + run: | + echo "::error::Metric regression detected — see the report above for details." 
+ exit 1 diff --git a/.gitignore b/.gitignore index afd16bcf4..4868b5c74 100644 --- a/.gitignore +++ b/.gitignore @@ -76,9 +76,28 @@ simulation/isaac-sim/launch_scripts/prepare_scene.py # Generated Microsoft AirSim (legacy) config simulation/ms-airsim/config/settings.json +# scenes and raven-specific launch scripts +scenes/ +simulation/isaac-sim/launch_scripts/AbandonedFactory_Launch.py +simulation/isaac-sim/launch_scripts/ConstructionSite_Launch.py +simulation/isaac-sim/launch_scripts/FireAcademy_Launch.py +simulation/isaac-sim/launch_scripts/RetroNeighborhood_Launch.py + +# Persisted waypoint/polygon editor saves (host-side mount target) +gcs/saves/* +!gcs/saves/.gitkeep +gcs/foxglove_extensions/airstack_layout_custom.json # Downloaded UE4 scene binaries (fetched via assets/scenes/fetch_scene.sh) simulation/ms-airsim/assets/scenes/* !simulation/ms-airsim/assets/scenes/fetch_scene.sh # Test results tests/results/ + +# Local-only — embedded sibling repo, not part of this branch +common/rayfronts/ + +# Docker build cache (root-owned subdirs cause permission warnings on `git add`) +robot/docker/cache/ +.DS_Store +gcs/.DS_Store diff --git a/.gitmodules b/.gitmodules index 4314795a4..8722432a8 100644 --- a/.gitmodules +++ b/.gitmodules @@ -17,3 +17,4 @@ [submodule "common/ros_packages/gui/rviz/rviz_polygon_selection_tool"] path = common/ros_packages/gui/rviz/rviz_polygon_selection_tool url = https://github.com/swri-robotics/rviz_polygon_selection_tool.git + diff --git a/AGENTS.md b/AGENTS.md index c3852bfad..f9eed977c 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -40,6 +40,10 @@ AirStack/ ├── common/ # Shared packages & utilities ├── docs/ # MkDocs documentation ├── mkdocs.yml # MkDocs config file +├── tests/ # System tests (pytest) + metrics reporting +├── .github/ +│ ├── workflows/ # GitHub Actions CI (system-tests, docker-build, etc.) 
+│ └── orchestrator/ # OpenStack-backed ephemeral self-hosted runners └── .agents/skills/ # Detailed workflow guides for agents ``` @@ -82,11 +86,19 @@ For detailed step-by-step instructions, refer to the **`.agents/skills/`** direc | [add-ros2-package](.agents/skills/add-ros2-package) | Creating a new algorithm module package | | [add-task-executor](.agents/skills/add-task-executor) | Implementing a task executor as a ROS 2 action server | | [integrate-module-into-layer](.agents/skills/integrate-module-into-layer) | Adding module to layer bringup | +| [write-launch-file](.agents/skills/write-launch-file) | Authoring ROS 2 launch files with AirStack conventions (ROBOT_NAME namespacing, topic remapping, allow_substs) | | [write-isaac-sim-scene](.agents/skills/write-isaac-sim-scene) | Creating custom simulation scenes | +| [visualize-in-foxglove](.agents/skills/visualize-in-foxglove) | Adding topic visualization to Foxglove/GCS | +| [attach-gossip-payload](.agents/skills/attach-gossip-payload) | Broadcasting custom ROS messages to peers via PeerProfile gossip payloads | | [debug-module](.agents/skills/debug-module) | Autonomous debugging of ROS 2 modules | | [update-documentation](.agents/skills/update-documentation) | Documenting new modules and updating mkdocs | -| [test-in-simulation](.agents/skills/test-in-simulation) | End-to-end simulation testing | +| [test-in-simulation](.agents/skills/test-in-simulation) | End-to-end simulation testing of a module | +| [run-system-tests](.agents/skills/run-system-tests) | Running the pytest system test harness (marks, MetricsRecorder, /pytest PR trigger) | | [add-behavior-tree-node](.agents/skills/add-behavior-tree-node) | Creating behavior tree nodes | +| [use-airstack-cli](.agents/skills/use-airstack-cli) | Using the `airstack` CLI and the non-interactive `docker exec` pattern | +| [configure-multi-robot](.agents/skills/configure-multi-robot) | Setting up multiple robots, ROBOT_NAME namespacing, and ROS_DOMAIN_ID isolation 
| +| [bump-version-and-release](.agents/skills/bump-version-and-release) | Bumping `.env` VERSION and CHANGELOG before merge to clear the version-check gate | +| [capture-discovered-knowledge](.agents/skills/capture-discovered-knowledge) | After long context-discovery / surprising findings, persist to AGENTS.md or a new skill so the next agent doesn't redo the work | **Agent Workflow Example:** 1. Study reference implementation for module type @@ -188,6 +200,35 @@ docker exec airstack-robot-desktop-1 bash -c "ros2 topic echo --onc - End-to-end autonomy stack testing - Real sensor simulation - Multi-robot scenarios + - Implemented in [`tests/`](tests/) — see below + +### System Test Suite (`tests/`) + +Pytest-based system tests live at the repo root in [`tests/`](tests/). They bring up the full Docker stack (sim + robot + GCS) and verify container health, ROS 2 node presence, compute usage, sensor topic streams (``sensors`` mark), and end-to-end flight behavior. + +| File | Mark | What it tests | Hardware | +|------|------|---------------|----------| +| [`tests/test_build_docker.py`](tests/test_build_docker.py) | `build_docker` | Docker image builds (robot-desktop, gcs, isaac-sim, ms-airsim) | Docker | +| [`tests/test_build_packages.py`](tests/test_build_packages.py) | `build_packages` | `colcon build` inside each container | Docker | +| [`tests/test_liveliness.py`](tests/test_liveliness.py) | `liveliness` | Stack bring-up: containers, ``/clock`` readiness, tmux, sentinel ROS 2 nodes, compute, infra-only stability poll | Docker, GPU, sim license | +| [`tests/test_sensors.py`](tests/test_sensors.py) | `sensors` | Topic Hz (Isaac: batched sim + robot ``ros2 topic hz``; filtered LiDAR ``echo-once`` + validation script), RTF, sensor stability time-series | Docker, GPU, sim license | +| [`tests/test_takeoff_hover_land.py`](tests/test_takeoff_hover_land.py) | `takeoff_hover_land` | 4-phase flight chain (PX4 ready → takeoff → hover → land) per (sim, num_robots, iter, 
velocity) | Docker, GPU, sim license | + +Shared fixtures, the `airstack_env` parametrized fixture, and `MetricsRecorder` live in [`tests/conftest.py`](tests/conftest.py). Each run produces a timestamped directory under `tests/results/<timestamp>/` with `results.xml`, `metrics.json`, and per-test logs. [`tests/parse_metrics.py`](tests/parse_metrics.py) generates a markdown report (single-run or diff-vs-baseline; exits 1 on regression). + +**Run via the CLI** (containerized runner — no local Python needed): + +```bash +airstack test -m "build_docker or build_packages" -v +airstack test -m liveliness --sim msairsim --num-robots 1 --stress-iterations 1 -v +airstack test -m sensors --sim isaacsim --num-robots 1 --stress-iterations 1 -v +airstack test -m takeoff_hover_land --sim msairsim --takeoff-velocities 0.5,1,2 -v +``` + +Full reference: [`tests/README.md`](tests/README.md) — including **liveliness vs +sensors** (infra vs topic streams), **class-scoped `airstack_env`** (two bring-ups +when you select both marks with `and`), and **Isaac Sim** batching of +`ros2 topic hz` plus LiDAR `echo-once` / `ENABLE_LIDAR` for pytest. ### Autonomous Debugging Approach When a module doesn't work: @@ -201,7 +242,31 @@ When a module doesn't work: See detailed debugging workflow: [.agents/skills/debug_module](.agents/skills/debug_module) -**Note:** Full testing infrastructure is a work in progress. Focus on integration tests and simulation validation for now.
+## CI/CD + +GitHub Actions workflows live in [`.github/workflows/`](.github/workflows/): + +| Workflow | Trigger | Purpose | +|----------|---------|---------| +| [`system-tests.yml`](.github/workflows/system-tests.yml) | PR opened, `/pytest` PR comment (write-access only), or `workflow_dispatch` | Runs the `tests/` suite on an ephemeral GPU runner; posts metrics report (with regression diff vs base branch / `main`) as a PR comment and to the job summary | +| [`docker-build.yml`](.github/workflows/docker-build.yml) | Push to `main`/`develop` that changes `.env` (`VERSION=`), or manual dispatch | Builds, pushes, and cosign-signs all compose images on the ephemeral runner | +| [`check-version-increment.yml`](.github/workflows/check-version-increment.yml) | Pull request | Validates `.env` `VERSION=` is valid semver and strictly greater than the base branch | +| `deploy_docs_from_{main,develop,release}.yaml` | Push to the matching branch (`docs/**`, `mkdocs.yml`, `*.md`) | Publishes versioned MkDocs site via `mike` | + +**`/pytest` PR comments** trigger `system-tests.yml` for users with write access (OWNER/MEMBER/COLLABORATOR), pulling args from the first line of the comment (e.g. `/pytest -m liveliness --sim msairsim`). Fork PRs are blocked — same-repo only — to keep arbitrary code off the self-hosted runner. + +### Ephemeral Runner Orchestrator + +GPU-required jobs (`runs-on: [self-hosted, airstack-ephemeral]`) execute on **OpenStack VMs spawned per-job and destroyed on completion**. 
The orchestrator service code lives in [`.github/orchestrator/`](.github/orchestrator/): + +- [`orchestrator.py`](.github/orchestrator/orchestrator.py) — Python service: spawn loop polls GitHub for queued jobs matching configured runner labels, mints single-use JIT runner tokens, creates an OpenStack server with cloud-init bootstrap; reap loop deletes the server when the job completes (or after `max_job_minutes`) +- [`cloud-init.yaml.j2`](.github/orchestrator/cloud-init.yaml.j2) — bootstraps Docker + nvidia-container-toolkit + GH Actions runner on the worker, registers with the JIT token, runs one job, then `shutdown -h` +- [`config.example.yaml`](.github/orchestrator/config.example.yaml) — flavor / network / keypair / floating-IP pool / runner labels / repo +- [`airstack-orchestrator.service`](.github/orchestrator/airstack-orchestrator.service) + [`setup.sh`](.github/orchestrator/setup.sh) — systemd unit and one-time installer + +**Why ephemeral:** clean Docker cache per run, no leaked containers, GitHub PAT and OpenStack credentials only on the orchestrator host (workers receive a single-use JIT token bound to one runner registration). State map at `/var/lib/airstack-orchestrator/state.json`; logs via `journalctl -u airstack-orchestrator.service -f`. + +**Setup, debugging a failed job, and SSH-into-worker procedures:** [`.github/orchestrator/README.md`](.github/orchestrator/README.md) (also exposed as [`tests/ci-cd-orchestrator.md`](tests/ci-cd-orchestrator.md) symlink for the docs site). 
## Documentation Requirements @@ -270,11 +335,27 @@ Each major component has its own Docker container: **Configuration:** - Main compose file: `docker-compose.yaml` (includes all component compose files) -- Environment variables: `.env` file (Docker image tags, launch config) -- Robot configuration: Environment variables set in `robot/docker/.env` +- Environment variables: top-level [`.env`](.env) (image tags, `VERSION`, `NUM_ROBOTS`, `ROBOT_NAME_MAP_CONFIG_FILE`, `ISAAC_SIM_SCRIPT_NAME`, `AUTONOMY_ROLE`, etc.) +- Per-container shell init: [`robot/docker/.bashrc`](robot/docker/.bashrc) — resolves `ROBOT_NAME` and `ROS_DOMAIN_ID` at startup (see Multi-Robot Configuration below) **Networking:** Custom bridge network (172.31.0.0/24) for inter-container communication. +## Multi-Robot Configuration + +Multi-robot is implemented via Docker Compose **replicas**, not multiple namespaces in one container. Setting `NUM_ROBOTS=3` in [`.env`](.env) spawns three separate containers (`airstack-robot-desktop-1`, `-2`, `-3`) via `deploy.replicas: ${NUM_ROBOTS:-1}` in [`robot/docker/docker-compose.yaml`](robot/docker/docker-compose.yaml). + +`ROBOT_NAME` is **not** set directly in `.env`. Each container computes it at startup: [`robot/docker/.bashrc`](robot/docker/.bashrc) reads `ROBOT_NAME_SOURCE` (`container_name` or `hostname`) and runs [`resolve_robot_name.py`](robot/docker/robot_name_map/resolve_robot_name.py) against the mapping in [`robot/docker/robot_name_map/`](robot/docker/robot_name_map/) (default: [`default_robot_name_map.yaml`](robot/docker/robot_name_map/default_robot_name_map.yaml)). The resolver exports both `ROBOT_NAME` and `ROS_DOMAIN_ID` — robot N gets domain N by default, so each robot is on its own DDS partition. 
+ +The autonomy bringup variant is selected by `AUTONOMY_ROLE` (`full` | `onboard` | `offboard`), dispatched in [`robot/ros_ws/src/autonomy_bringup/launch/robot.launch.xml`](robot/ros_ws/src/autonomy_bringup/launch/robot.launch.xml): + +- **full** — every autonomy module runs on this machine (sim/dev desktop, autonomous Jetson) +- **onboard** — lite modules only (interface, sensors, perception, local planning, behavior); pairs with **offboard** +- **offboard** — global planning only; runs on GCS paired with onboard robots + +For Isaac Sim, the default `ISAAC_SIM_SCRIPT_NAME=example_one_px4_pegasus_launch_script.py` only spawns a single drone. Multi-robot Isaac Sim requires `ISAAC_SIM_SCRIPT_NAME=example_multi_px4_pegasus_launch_script.py` (the system test harness sets this automatically when `--num-robots > 1`). + +**Full workflow:** [.agents/skills/configure-multi-robot](.agents/skills/configure-multi-robot) + ## Critical Pitfalls to Avoid Common mistakes when adding modules: diff --git a/airstack.sh b/airstack.sh index 9ff7fdcf3..78b475c07 100755 --- a/airstack.sh +++ b/airstack.sh @@ -5,6 +5,47 @@ # This script provides a unified interface for common development tasks # in the AirStack project, including setup, installation, and container management. +# Re-exec under bash 4+ if necessary. macOS ships bash 3.2 which can't handle +# `declare -A` (associative arrays) used throughout this script. Searches for +# a newer bash via $AIRSTACK_BASH, then common Homebrew install paths, then +# any `bash` on PATH that reports version >= 4. Sets AIRSTACK_REEXEC_BASH=1 +# to guard against infinite re-exec loops. 
+if [ -z "${AIRSTACK_REEXEC_BASH:-}" ] && [ "${BASH_VERSINFO[0]:-0}" -lt 4 ]; then + _airstack_candidates=( + "${AIRSTACK_BASH:-}" + /opt/homebrew/bin/bash # Apple Silicon Homebrew + /usr/local/bin/bash # Intel Homebrew + /opt/local/bin/bash # MacPorts + ) + if command -v bash5 >/dev/null 2>&1; then + _airstack_candidates+=("$(command -v bash5)") + fi + # Add any `bash` on PATH whose version is >= 4 (other than the one we just + # got here from, which is < 4 by the if-check above). + for _alt in $(command -v -a bash 2>/dev/null); do + _airstack_candidates+=("$_alt") + done + + for _airstack_alt_bash in "${_airstack_candidates[@]}"; do + [ -z "$_airstack_alt_bash" ] && continue + [ -x "$_airstack_alt_bash" ] || continue + # Probe BASH_VERSINFO[0] without sourcing the script. + if "$_airstack_alt_bash" -c '[ "${BASH_VERSINFO[0]:-0}" -ge 4 ]' 2>/dev/null; then + export AIRSTACK_REEXEC_BASH=1 + exec "$_airstack_alt_bash" "$0" "$@" + fi + done + + cat >&2 <<'EOF' +[ERROR] airstack.sh requires bash 4 or newer (your bash is 3.x). + macOS ships bash 3.2 by default; install a modern bash with: + brew install bash + Or set AIRSTACK_BASH=/path/to/bash >= 4 before invoking this script. 
+EOF + exit 1 +fi +unset AIRSTACK_REEXEC_BASH + set -e # Script directory @@ -218,8 +259,8 @@ function print_command_help { echo " airstack test -m liveliness --sim msairsim --num-robots 1 \\" echo " --stress-iterations 1 --stable-duration 60 -v" echo "" - echo " # Autonomy run — takeoff/hover/land at 0.5, 1, and 2 m/s" - echo " airstack test -m autonomy --sim msairsim --num-robots 1 \\" + echo " # Takeoff/hover/land run — at 0.5, 1, and 2 m/s" + echo " airstack test -m takeoff_hover_land --sim msairsim --num-robots 1 \\" echo " --stress-iterations 1 --takeoff-velocities 0.5,1,2 -v" ;; docs) @@ -347,7 +388,7 @@ function find_container { read -p "Your selection: " selection <&2 if [ "$selection" = "q" ]; then - log_info "Operation cancelled" >&2 + log_info "Operation canceled" >&2 return 1 elif [[ "$selection" =~ ^[0-9]+$ ]] && [ "$selection" -gt 0 ] && [ "$selection" -le "$match_count" ]; then # Extract just the container name from the selected line @@ -365,7 +406,7 @@ function find_container { read -p "Try again (or 'q' to quit): " selection <&2 if [ "$selection" = "q" ]; then - log_info "Operation cancelled" >&2 + log_info "Operation canceled" >&2 return 1 elif [[ "$selection" =~ ^[0-9]+$ ]] && [ "$selection" -gt 0 ] && [ "$selection" -le "$match_count" ]; then container_name=$(echo "$matches" | sed -n "${selection}p" | awk '{print $1}') @@ -751,6 +792,15 @@ function cmd_up { # Add xhost + to allow GUI applications xhost + &> /dev/null || true + # Registry-cache mode (CI / opt-in): pull existing images first so `up` + # uses the registry copy as-is and skips the implicit rebuild path. No-op + # when AIRSTACK_REGISTRY_CACHE is unset. + if [[ "${AIRSTACK_REGISTRY_CACHE:-}" == "1" ]]; then + log_info "AIRSTACK_REGISTRY_CACHE=1 → pulling images before up..." 
+ run_docker_compose -f "$PROJECT_ROOT/docker-compose.yaml" "${global_args[@]}" pull --ignore-pull-failures "${subcmd_args[@]}" || \ + log_warn "Pre-up pull encountered failures; continuing with whatever is local" + fi + log_info "Starting services..." run_docker_compose -f "$PROJECT_ROOT/docker-compose.yaml" "${global_args[@]}" up "${subcmd_args[@]}" -d log_info "Services brought up successfully" @@ -763,8 +813,26 @@ function cmd_image_build { local subcmd_args=() classify_compose_args global_args subcmd_args "$@" - log_info "Building services..." - run_docker_compose -f "$PROJECT_ROOT/docker-compose.yaml" "${global_args[@]}" build "${subcmd_args[@]}" + # Registry-cache mode (CI / opt-in): pre-pull existing images to seed the + # local cache, build with BUILDKIT_INLINE_CACHE=1 so the resulting image + # carries layer-cache metadata, and push so the next run benefits. The + # cache_from declarations in each component compose file make BuildKit + # actually reuse the pulled layers. No-op when the env var is unset. + if [[ "${AIRSTACK_REGISTRY_CACHE:-}" == "1" ]]; then + log_info "AIRSTACK_REGISTRY_CACHE=1 → pulling for cache seed..." + run_docker_compose -f "$PROJECT_ROOT/docker-compose.yaml" "${global_args[@]}" pull --ignore-pull-failures "${subcmd_args[@]}" || \ + log_warn "Pre-build pull encountered failures; continuing without cache seed" + + log_info "Building services with BUILDKIT_INLINE_CACHE=1..." + run_docker_compose -f "$PROJECT_ROOT/docker-compose.yaml" "${global_args[@]}" build --build-arg BUILDKIT_INLINE_CACHE=1 "${subcmd_args[@]}" + + log_info "Pushing built images for next-run cache..." + run_docker_compose -f "$PROJECT_ROOT/docker-compose.yaml" "${global_args[@]}" push --ignore-push-failures "${subcmd_args[@]}" || \ + log_warn "Post-build push encountered failures; future runs may not benefit from cache" + else + log_info "Building services..." 
+ run_docker_compose -f "$PROJECT_ROOT/docker-compose.yaml" "${global_args[@]}" build "${subcmd_args[@]}" + fi log_info "Build completed successfully" } diff --git a/common/ros_packages/coordination/README.md b/common/ros_packages/coordination/README.md new file mode 100644 index 000000000..57a1c28fb --- /dev/null +++ b/common/ros_packages/coordination/README.md @@ -0,0 +1,76 @@ +# Coordination + +Multi-robot coordination layer for AirStack. Implements a gossip protocol over a shared DDS domain so drones can share state and payloads without a central broker. + +## Architecture + +``` +Robot (domain N) Shared gossip domain (99) +┌─────────────────────┐ ┌────────────────────────┐ +│ gossip_node │ │ │ +│ ├─ PeerProfile │──DDS Router──▶│ /gossip/peers │◀──▶ GCS +│ │ ├─ GPS/heading │ │ │ +│ │ ├─ waypoint │ └────────────────────────┘ +│ │ └─ payloads[] │ +│ └─ /{robot_name}/coordination/peer_registry ◀── per-robot snapshot +└─────────────────────┘ +``` + +Every robot publishes its own `PeerProfile` at 1 Hz on a steady (monotonic) clock and receives profiles from all peers via the shared domain. + +## Packages + +### `coordination_msgs` +Wire-format message definitions: +- `PeerProfile.msg` — robot identity, GPS, heading, waypoint, and a typed payload array +- `PeerProfilePayload.msg` — a single serialized ROS message (`payload_type` string + `payload_data` bytes) + +### `coordination_bringup` +Runtime nodes and configuration: +- **`gossip_node.py`** — publishes own profile, receives peer profiles, maintains registry +- **`peer_profile.py`** — Python helper class for serializing/deserializing `PeerProfile` and its payloads +- **`frame_utils.py`** — GPS ↔ ENU coordinate conversion helpers +- **`config/gossip_payloads.yaml`** — declares which local topics to attach as payloads (config-driven, no code changes) + +## Message Deduplication + +Each message is identified by `(robot_name, stamp.sec, stamp.nanosec)`, where the stamp is set at publish time by the originating robot.
Receivers maintain a seen-set (50 entries, FIFO eviction) and drop already-processed IDs. + +Every drone receives a message at least once — this is expected. The seen-set prevents re-processing, not initial fan-out. + +**Relay fields** (`source`, `relay_hops`) exist in the wire format for future multi-hop use but relay logic is not yet active. + +## Adding a Payload + +Edit `coordination_bringup/config/gossip_payloads.yaml`: + +```yaml +payload_topics: + - topic: "/{robot_name}/your/topic" + type: "your_msgs/msg/YourType" +``` + +`{robot_name}` is substituted at runtime. Topics that haven't published yet are silently skipped. + +See [`.agents/skills/attach-gossip-payload`](../../../.agents/skills/attach-gossip-payload/SKILL.md) for the full workflow including GCS visualization. + +## Topics + +| Topic | Direction | QoS | Purpose | +|-------|-----------|-----|---------| +| `/gossip/peers` | pub/sub | BEST_EFFORT | Shared profile bus across all robots and GCS | +| `/{robot_name}/coordination/peer_registry` | pub | RELIABLE, TRANSIENT_LOCAL | Snapshot of all known peers (latest-wins) | + +## Key Parameters (`gossip_node`) + +| Parameter | Default | Description | +|-----------|---------|-------------| +| `robot_name` | — | Robot identifier, also used as topic namespace | +| `publish_rate` | `1.0` | Hz, wall-clock (fires even when sim time is paused) | +| `gossip_domain` | `99` | Shared DDS domain for the gossip bus | + +## Future Plans + +- **Payload version hashing** — hash `payload_data` bytes and skip re-sending unchanged payloads (e.g. static voxel maps). Reduces gossip bandwidth by up to 90% for slow-changing payloads like PointCloud2 maps. + +- **OLSR Multipoint Relay (MPR)** — when relay forwarding (`SOURCE_RELAYED`, `relay_hops`) is activated, use OLSR MPR selection to elect the minimal set of relay nodes that cover all 2-hop neighbors. Prevents O(n²) message explosion from naive flooding in partial-mesh / long-range deployments. 
diff --git a/common/ros_packages/coordination/coordination_bringup/config/gcs_gossip_dds_router.yaml b/common/ros_packages/coordination/coordination_bringup/config/gcs_gossip_dds_router.yaml new file mode 100644 index 000000000..0b9b80a00 --- /dev/null +++ b/common/ros_packages/coordination/coordination_bringup/config/gcs_gossip_dds_router.yaml @@ -0,0 +1,17 @@ +# GCS-side gossip DDS Router configuration +# Bridges /gossip/peers between the GCS domain (0) and the shared gossip +# domain (99) so the GCS can receive PeerProfile broadcasts from all robots. +# +# This runs in the GCS container. Each robot runs its own gossip_dds_router +# (robot domain ↔ domain 99). This is the GCS counterpart. + +participants: + - name: "gcs" + kind: "local" + domain: 0 # GCS ROS_DOMAIN_ID + - name: "gossip_bus" + kind: "local" + domain: 99 # shared gossip domain + +allowlist: + - name: "rt/gossip/peers" diff --git a/common/ros_packages/coordination/coordination_bringup/config/gossip_dds_router.yaml b/common/ros_packages/coordination/coordination_bringup/config/gossip_dds_router.yaml new file mode 100644 index 000000000..4da27561d --- /dev/null +++ b/common/ros_packages/coordination/coordination_bringup/config/gossip_dds_router.yaml @@ -0,0 +1,18 @@ +# Gossip DDS Router configuration +# Bridges /gossip/peers between the robot's own ROS_DOMAIN_ID and the +# shared gossip domain so all robots can see each other's PeerProfile +# broadcasts without any per-robot enumeration. +# +# Every robot runs this identical config. Adding a new robot only +# requires incrementing NUM_ROBOTS – no changes here needed. 
+ +participants: + - name: "robot" + kind: "local" + domain: $(env ROS_DOMAIN_ID) + - name: "gossip_bus" + kind: "local" + domain: $(var gossip_domain) + +allowlist: + - name: "rt/gossip/peers" diff --git a/common/ros_packages/coordination/coordination_bringup/config/gossip_payloads.yaml b/common/ros_packages/coordination/coordination_bringup/config/gossip_payloads.yaml new file mode 100644 index 000000000..2db96985a --- /dev/null +++ b/common/ros_packages/coordination/coordination_bringup/config/gossip_payloads.yaml @@ -0,0 +1,31 @@ +# ───────────────────────────────────────────────────────────────────────────── +# Gossip Payload Topics +# ───────────────────────────────────────────────────────────────────────────── +# List ROS topics here to have them automatically included as typed payloads +# in this robot's PeerProfile and distributed to all peers via gossip. +# +# Each entry requires: +# topic: topic name — use {robot_name} as a placeholder for the robot namespace +# type: fully-qualified ROS message type (package/msg/Type) +# +# If a topic has never published, the payload is silently skipped — the node +# will not crash or block waiting for it. +# +# ── How to add your own payload ────────────────────────────────────────────── +# 1. Add an entry below with the topic + type. +# 2. On the receiving end, call peer_profile.get_payload("your_msgs/msg/YourType") to get the +# deserialized message, or peer_profile.payload_types() to list what's there.
+# +# Example (receiving side in Python): +# from coordination_bringup.peer_profile import PeerProfile +# profile = PeerProfile.from_ros_msg(peer_msg) +# rays = profile.get_payload("visualization_msgs/msg/MarkerArray") +# if rays is not None: +# # use rays as a visualization_msgs/msg/MarkerArray +# ───────────────────────────────────────────────────────────────────────────── + +payload_topics: + + # ── Add payloads below ───────────────────────────────────────────────────── + # - topic: "/{robot_name}/your/topic" + # type: "your_msgs/msg/YourType" diff --git a/common/ros_packages/coordination/coordination_bringup/coordination_bringup/__init__.py b/common/ros_packages/coordination/coordination_bringup/coordination_bringup/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/common/ros_packages/coordination/coordination_bringup/coordination_bringup/frame_utils.py b/common/ros_packages/coordination/coordination_bringup/coordination_bringup/frame_utils.py new file mode 100644 index 000000000..f2e77619f --- /dev/null +++ b/common/ros_packages/coordination/coordination_bringup/coordination_bringup/frame_utils.py @@ -0,0 +1,97 @@ +"""Coordinate frame utilities shared between gossip_node (robot) and gcs_visualizer (GCS).""" + +import copy +import math +import struct + +# Default world origin — Lisbon (matches gcs_utils.py and gps_utils.py) +DEFAULT_ORIGIN_LAT = 38.736832 +DEFAULT_ORIGIN_LON = -9.137977 +DEFAULT_ORIGIN_ALT = 90.0 + + +def gps_to_enu(lat, lon, alt, + origin_lat=DEFAULT_ORIGIN_LAT, + origin_lon=DEFAULT_ORIGIN_LON, + origin_alt=DEFAULT_ORIGIN_ALT): + """Convert GPS lat/lon/alt to ENU meters relative to the world origin.""" + x = (lon - origin_lon) * 111320.0 * math.cos(math.radians(origin_lat)) + y = (lat - origin_lat) * 111320.0 + z = alt - origin_alt + return x, y, z + + +def heading_to_quat(heading_deg): + """Compass heading (degrees CW from North) → ENU yaw quaternion (x,y,z,w). + + ENU: yaw=0 → East (+x). 
heading=0 (North) → yaw=90° → q=(0,0,sin45,cos45). + """ + yaw_enu = math.radians(90.0 - heading_deg) + return (0.0, 0.0, math.sin(yaw_enu / 2.0), math.cos(yaw_enu / 2.0)) + + +def rotate_vector(v, q): + """Rotate vector v=(vx,vy,vz) by quaternion q=(x,y,z,w). Returns (x,y,z).""" + vx, vy, vz = v + qx, qy, qz, qw = q + cx = qy * vz - qz * vy + cy = qz * vx - qx * vz + cz = qx * vy - qy * vx + return ( + vx + 2.0 * (qw * cx + qy * cz - qz * cy), + vy + 2.0 * (qw * cy + qz * cx - qx * cz), + vz + 2.0 * (qw * cz + qx * cy - qy * cx), + ) + + +def transform_marker_array(marker_array, bx, by, bz, q=(0.0, 0.0, 0.0, 1.0)): + """Deep-copy a MarkerArray and transform all points[]: p_map = R(q)*p + (bx,by,bz). + + Transforms points[] only, not pose.position — LINE_STRIP/POINTS markers store + geometry in points[] with an identity pose, so translating pose.position would + double-apply the offset. Sets frame_id='map'. Returns a new MarkerArray. + """ + from visualization_msgs.msg import MarkerArray as MA + out = MA() + for orig in marker_array.markers: + m = copy.deepcopy(orig) + m.header.frame_id = 'map' + for pt in m.points: + rx, ry, rz = rotate_vector((pt.x, pt.y, pt.z), q) + pt.x = rx + bx + pt.y = ry + by + pt.z = rz + bz + out.markers.append(m) + return out + + +def transform_point_cloud2(cloud, bx, by, bz, q=(0.0, 0.0, 0.0, 1.0)): + """Return a copy of PointCloud2 with all xyz transformed: p_map = R(q)*p + (bx,by,bz). + + Sets frame_id='map'. Reads field offsets from the message header. 
+ """ + field_offsets = {f.name: f.offset for f in cloud.fields if f.name in ('x', 'y', 'z')} + if not all(k in field_offsets for k in ('x', 'y', 'z')): + return cloud + + ox, oy, oz = field_offsets['x'], field_offsets['y'], field_offsets['z'] + ps = cloud.point_step + n_points = cloud.width * cloud.height + if ps == 0 or len(cloud.data) < n_points * ps: + return cloud # malformed cloud — skip rather than raise + + new_cloud = copy.copy(cloud) + new_cloud.header = copy.copy(cloud.header) + new_cloud.header.frame_id = 'map' + data = bytearray(cloud.data) + for i in range(n_points): + base = i * ps + x, = struct.unpack_from(' None: + """Parse gossip_payloads.yaml and subscribe to each listed topic. + + Topics that haven't published yet simply contribute no payload on that tick. + """ + try: + with open(config_path, "r") as f: + cfg = yaml.safe_load(f) + except Exception as e: + self.get_logger().warn(f"Could not load payload config '{config_path}': {e}") + return + + for entry in cfg.get("payload_topics") or []: + topic_template = entry.get("topic", "") + type_str = entry.get("type", "") + if not topic_template or not type_str: + continue + + topic = topic_template.replace("{robot_name}", self._robot_name) + # Use the last path segment as a short human-readable name (e.g. 
'filtered_rays') + self._payload_names[topic] = topic_template.rstrip("/").split("/")[-1] + + try: + msg_class = rosidl_utils.get_message(type_str) + except Exception as e: + self.get_logger().warn(f"Unknown payload type '{type_str}': {e}") + continue + + self._payload_cache[topic] = None + + def _make_callback(t): + def cb(msg): + stamp = getattr(getattr(msg, 'header', None), 'stamp', None) + if stamp is None: + stamp = self.get_clock().now().to_msg() + self._payload_cache[t] = (msg, stamp) + return cb + + sub = self.create_subscription(msg_class, topic, _make_callback(topic), SENSOR_QOS) + self._payload_subs.append(sub) + self.get_logger().info(f"Payload subscription: {topic} ({type_str})") + + def _on_navsat(self, msg: NavSatFix) -> None: + if msg.status.status < 0: # ignore NO_FIX so GPS never zeros out + return + self._profile.set_gps_from_navsat(msg) + if self._boot_pos is None: + self._boot_pos = gps_to_enu(msg.latitude, msg.longitude, msg.altitude) + if self._boot_quat is None and self._profile.heading != 0.0: + self._boot_quat = heading_to_quat(self._profile.heading) + + def _on_compass(self, msg: Float64) -> None: + self._profile.set_heading(msg.data) + if self._boot_pos is not None and self._boot_quat is None: + self._boot_quat = heading_to_quat(msg.data) + + def _on_global_plan(self, msg: Path) -> None: + self._profile.set_waypoint_from_path(msg) + + def _on_peer_msg(self, msg: PeerProfileMsg) -> None: + if msg.robot_name == self._robot_name: + return # discard own messages echoed back from the gossip domain + + msg_id = (msg.robot_name, + msg.gps_fix.header.stamp.sec, + msg.gps_fix.header.stamp.nanosec) + if msg_id in self._seen: + return + self._seen[msg_id] = None + if len(self._seen) > _GOSSIP_SEEN_SIZE: + self._seen.popitem(last=False) + + new_t = (msg.gps_fix.header.stamp.sec + + msg.gps_fix.header.stamp.nanosec * 1e-9) + with self._peer_inbox_lock: + existing = self._peer_inbox.get(msg.robot_name) + if existing is not None: + old_t = 
(existing.gps_fix.header.stamp.sec + + existing.gps_fix.header.stamp.nanosec * 1e-9) + if new_t < old_t: + return + self._peer_inbox[msg.robot_name] = msg + + def _drain_peer_inbox(self) -> None: + with self._peer_inbox_lock: + inbox = dict(self._peer_inbox) + self._peer_inbox.clear() + for msg in inbox.values(): + self._update_registry(msg) + + def _publish_tick(self) -> None: + self._publish_own() + + def _publish_own(self) -> None: + self._profile.clear_payloads() + if self._boot_pos is not None: + bx, by, bz = self._boot_pos + # PX4/MAVROS odom frame is ENU-aligned regardless of drone heading — + # only translation is needed; rotation would incorrectly rotate payloads. + q = (0.0, 0.0, 0.0, 1.0) + for topic, entry in self._payload_cache.items(): + if entry is not None: + msg, stamp = entry + transformed = self._transform_to_global(msg, bx, by, bz, q) + self._profile.add_payload(transformed, stamp=stamp, name=self._payload_names.get(topic, "")) + + # Steady clock so the dedup-by-stamp invariant on receivers survives /clock pauses. 
+ self._profile.gps_fix.header.stamp = self._steady_clock.now().to_msg() + self._gossip_pub.publish(self._profile.to_ros_msg()) + + def _transform_to_global(self, msg, bx, by, bz, q): + if isinstance(msg, MarkerArray): + return transform_marker_array(msg, bx, by, bz, q) + if isinstance(msg, PointCloud2): + return transform_point_cloud2(msg, bx, by, bz, q) + return msg # unknown type — pass through untransformed + + def _update_registry(self, msg: PeerProfileMsg) -> None: + """Accept msg only if newer than what we have; republish updated snapshot.""" + new_t = (msg.gps_fix.header.stamp.sec + + msg.gps_fix.header.stamp.nanosec * 1e-9) + + with self._registry_lock: + existing = self._registry.get(msg.robot_name) + if existing is not None: + old_t = (existing.gps_fix.header.stamp.sec + + existing.gps_fix.header.stamp.nanosec * 1e-9) + if new_t < old_t: + return + self._registry[msg.robot_name] = msg + + self._registry_pub.publish(msg) + + +def main(args=None): + rclpy.init(args=args) + node = GossipNode() + try: + rclpy.spin(node) + except KeyboardInterrupt: + pass + finally: + node.destroy_node() + rclpy.shutdown() + + +if __name__ == "__main__": + main() diff --git a/common/ros_packages/coordination/coordination_bringup/coordination_bringup/peer_profile.py b/common/ros_packages/coordination/coordination_bringup/coordination_bringup/peer_profile.py new file mode 100644 index 000000000..5f9cd11c7 --- /dev/null +++ b/common/ros_packages/coordination/coordination_bringup/coordination_bringup/peer_profile.py @@ -0,0 +1,142 @@ +from __future__ import annotations + +from dataclasses import dataclass, field +from enum import IntEnum +from typing import Any, Dict, List, Optional + +from rclpy.serialization import deserialize_message, serialize_message +import rosidl_runtime_py.utilities as rosidl_utils + +from geometry_msgs.msg import PoseStamped +from nav_msgs.msg import Path +from sensor_msgs.msg import NavSatFix + +from coordination_msgs.msg import PeerProfile as 
PeerProfileMsg +from coordination_msgs.msg import PeerProfilePayload as PeerProfilePayloadMsg + + +class Source(IntEnum): + DIRECT = 0 + RELAYED = 1 + + +@dataclass +class PeerProfile: + """Base peer state broadcast over the gossip bus.""" + + robot_name: str + gps_fix: NavSatFix = field(default_factory=NavSatFix) + heading: float = 0.0 # degrees clockwise from North (0-360) + waypoint: PoseStamped = field(default_factory=PoseStamped) + source: Source = Source.DIRECT + relay_hops: int = 0 + + _payloads: List[Dict[str, Any]] = field(default_factory=list, repr=False) + + def set_gps_from_navsat(self, msg: NavSatFix) -> None: + self.gps_fix = msg + + def set_heading(self, degrees: float) -> None: + self.heading = float(degrees) + + def set_waypoint_from_path(self, path: Optional[Path]) -> None: + """Extract goal (last pose) from a Path. None or empty path sets waypoint to all-zeros (no plan).""" + if path is not None and len(path.poses) > 0: + self.waypoint = path.poses[-1] + else: + self.waypoint = PoseStamped() + + def has_waypoint(self) -> bool: + s = self.waypoint.header.stamp + return s.sec != 0 or s.nanosec != 0 + + def add_payload(self, msg: Any, stamp=None, name: str = "") -> None: + """Serialize and attach a ROS message as a payload.""" + from builtin_interfaces.msg import Time + type_str = _ros_type_string(msg) + self._payloads.append({ + "name": name, + "type": type_str, + "data": serialize_message(msg), + "stamp": stamp if stamp is not None else Time(), + }) + + def clear_payloads(self) -> None: + self._payloads.clear() + + def get_payload(self, payload_type: str) -> Optional[Any]: + """Return the first payload matching payload_type (e.g. 
'nav_msgs/msg/OccupancyGrid'), or None.""" + for p in self._payloads: + if p["type"] == payload_type: + msg_class = rosidl_utils.get_message(payload_type) + return deserialize_message(p["data"], msg_class) + return None + + def get_payload_by_name(self, name: str) -> Optional[Any]: + """Return the payload with the given name, or None.""" + for p in self._payloads: + if p.get("name") == name: + msg_class = rosidl_utils.get_message(p["type"]) + return deserialize_message(p["data"], msg_class) + return None + + def get_payload_with_stamp(self, payload_type: str): + """Like get_payload() but returns (msg, stamp). Returns (None, None) if not found.""" + for p in self._payloads: + if p["type"] == payload_type: + msg_class = rosidl_utils.get_message(payload_type) + return deserialize_message(p["data"], msg_class), p.get("stamp") + return None, None + + def get_all_payloads(self) -> List[Any]: + result = [] + for p in self._payloads: + msg_class = rosidl_utils.get_message(p["type"]) + result.append(deserialize_message(p["data"], msg_class)) + return result + + def payload_types(self) -> List[str]: + return [p["type"] for p in self._payloads] + + def to_ros_msg(self) -> PeerProfileMsg: + msg = PeerProfileMsg() + msg.robot_name = self.robot_name + msg.gps_fix = self.gps_fix + msg.heading = self.heading + msg.waypoint = self.waypoint + msg.source = int(self.source) + msg.relay_hops = self.relay_hops + msg.payloads = [ + PeerProfilePayloadMsg( + stamp=p.get("stamp") or PeerProfilePayloadMsg().stamp, + payload_name=p.get("name", ""), + payload_type=p["type"], + payload_data=list(p["data"]), + ) + for p in self._payloads + ] + return msg + + @classmethod + def from_ros_msg(cls, msg: PeerProfileMsg) -> "PeerProfile": + profile = cls(robot_name=msg.robot_name) + profile.gps_fix = msg.gps_fix + profile.heading = msg.heading + profile.waypoint = msg.waypoint + profile.source = Source(msg.source) + profile.relay_hops = msg.relay_hops + profile._payloads = [ + {"name": p.payload_name, 
"type": p.payload_type, "data": bytes(p.payload_data), "stamp": p.stamp} + for p in msg.payloads + ] + return profile + + +def _ros_type_string(msg: Any) -> str: + """Return the fully-qualified ROS type string, e.g. 'nav_msgs/msg/OccupancyGrid'.""" + module = type(msg).__module__ + name = type(msg).__name__ + parts = module.split(".") + if len(parts) >= 2: + return f"{parts[0]}/{parts[1]}/{name}" + return f"{module}/{name}" diff --git a/common/ros_packages/coordination/coordination_bringup/coordination_bringup/peer_registry_monitor.py b/common/ros_packages/coordination/coordination_bringup/coordination_bringup/peer_registry_monitor.py new file mode 100644 index 000000000..28ef82a19 --- /dev/null +++ b/common/ros_packages/coordination/coordination_bringup/coordination_bringup/peer_registry_monitor.py @@ -0,0 +1,184 @@ +""" +peer_registry_monitor.py — CLI diagnostic tool for the gossip peer registry. + +Run on any robot or from a machine joined to domain 99: + ROS_DOMAIN_ID=99 ros2 run coordination_bringup peer_registry_monitor + +Or on a specific robot's domain to see what that robot receives: + ROS_DOMAIN_ID=1 ros2 run coordination_bringup peer_registry_monitor + +Options: + --robot Only show entries for this robot name (partial match) + --rate Refresh rate in Hz (default: 2) +""" + +import argparse +import os +import sys +import threading +import time + +import rclpy +from rclpy.node import Node +from rclpy.qos import QoSHistoryPolicy, QoSProfile, QoSReliabilityPolicy + +from coordination_msgs.msg import PeerProfile as PeerProfileMsg + +GOSSIP_QOS = QoSProfile( + reliability=QoSReliabilityPolicy.BEST_EFFORT, + history=QoSHistoryPolicy.KEEP_LAST, + depth=10, +) + +RESET = "\033[0m" +BOLD = "\033[1m" +CYAN = "\033[36m" +YELLOW = "\033[33m" +GREEN = "\033[32m" +DIM = "\033[2m" + + +def _fmt_gps(gps_fix, heading: float) -> str: + from sensor_msgs.msg import NavSatStatus + status = gps_fix.status.status + status_str = { + NavSatStatus.STATUS_NO_FIX: "NO_FIX", + 
NavSatStatus.STATUS_FIX: "FIX", + NavSatStatus.STATUS_SBAS_FIX: "SBAS", + NavSatStatus.STATUS_GBAS_FIX: "GBAS", + }.get(status, f"?{status}") + return ( + f"lat={gps_fix.latitude:11.7f} lon={gps_fix.longitude:11.7f} " + f"alt={gps_fix.altitude:7.2f}m hdg={heading:6.1f}° [{status_str}]" + ) + + +def _fmt_waypoint(pose_stamped) -> str: + s = pose_stamped.header.stamp + if s.sec == 0 and s.nanosec == 0: + return f"{DIM}(no plan yet){RESET}" + p = pose_stamped.pose.position + o = pose_stamped.pose.orientation + return f"pos=({p.x:7.2f}, {p.y:7.2f}, {p.z:7.2f}) orient=({o.x:.3f}, {o.y:.3f}, {o.z:.3f}, {o.w:.3f})" + + +def _fmt_stamp(gps_fix) -> str: + s = gps_fix.header.stamp + if s.sec == 0 and s.nanosec == 0: + return "n/a" + t = s.sec + s.nanosec * 1e-9 + return time.strftime("%H:%M:%S", time.localtime(t)) + f".{s.nanosec // 1_000_000:03d}" + + +def _clear(): + sys.stdout.write("\033[2J\033[H") + sys.stdout.flush() + + +class RegistryMonitor(Node): + + def __init__(self, filter_name: str = ""): + super().__init__("peer_registry_monitor") + self._registry: dict[str, PeerProfileMsg] = {} + self._recv_times: dict[str, float] = {} + self._registry_lock = threading.Lock() + self._filter = filter_name.lower() + self._inbox: dict[str, PeerProfileMsg] = {} + self._inbox_lock = threading.Lock() + + self._sub = self.create_subscription( + PeerProfileMsg, "/gossip/peers", self._on_msg, GOSSIP_QOS, + ) + + def _on_msg(self, msg: PeerProfileMsg) -> None: + new_t = (msg.gps_fix.header.stamp.sec + + msg.gps_fix.header.stamp.nanosec * 1e-9) + with self._inbox_lock: + existing = self._inbox.get(msg.robot_name) + if existing is not None: + old_t = (existing.gps_fix.header.stamp.sec + + existing.gps_fix.header.stamp.nanosec * 1e-9) + if new_t < old_t: + self._recv_times[msg.robot_name] = time.time() + return + self._inbox[msg.robot_name] = msg + self._recv_times[msg.robot_name] = time.time() + + def _drain_inbox(self) -> None: + with self._inbox_lock: + inbox = dict(self._inbox) + 
self._inbox.clear() + for robot_name, msg in inbox.items(): + new_t = (msg.gps_fix.header.stamp.sec + + msg.gps_fix.header.stamp.nanosec * 1e-9) + with self._registry_lock: + existing = self._registry.get(robot_name) + if existing is not None: + old_t = (existing.gps_fix.header.stamp.sec + + existing.gps_fix.header.stamp.nanosec * 1e-9) + if new_t < old_t: + continue + self._registry[robot_name] = msg + + def print_registry(self) -> None: + self._drain_inbox() + _clear() + domain = os.environ.get("ROS_DOMAIN_ID", "?") + now_str = time.strftime("%H:%M:%S") + print(f"{BOLD}Peer Registry {DIM}[domain={domain} {now_str}]{RESET}") + print("─" * 80) + + with self._registry_lock: + entries = sorted(self._registry.values(), key=lambda m: m.robot_name) + recv_times = dict(self._recv_times) + if self._filter: + entries = [e for e in entries if self._filter in e.robot_name.lower()] + + if not entries: + print(f" {DIM}(no peers seen yet){RESET}") + else: + now = time.time() + for msg in entries: + src = "direct" if msg.source == 0 else f"relayed({msg.relay_hops}h)" + payload_summary = ( + f"{len(msg.payloads)} payload(s): " + + ", ".join(p.payload_type for p in msg.payloads) + if msg.payloads + else "no payloads" + ) + recv_t = recv_times.get(msg.robot_name) + age = f"{now - recv_t:.1f}s ago" if recv_t is not None else "?" + recv_wall = time.strftime("%H:%M:%S", time.localtime(recv_t)) if recv_t else "?" 
+ stamp_str = f"{recv_wall} ({age})" + print(f" {CYAN}{BOLD}{msg.robot_name}{RESET} {DIM}[{src} last_recv={stamp_str}]{RESET}") + print(f" {GREEN}gps {RESET} {_fmt_gps(msg.gps_fix, msg.heading)}") + print(f" {YELLOW}waypoint{RESET} {_fmt_waypoint(msg.waypoint)}") + print(f" {DIM}payloads{RESET} {payload_summary}") + print() + + print(f"{DIM}Listening on /gossip/peers — Ctrl+C to quit{RESET}") + + +def main(): + parser = argparse.ArgumentParser(description="Live peer registry monitor") + parser.add_argument("--robot", default="", help="Filter by robot name (partial)") + parser.add_argument("--rate", type=float, default=2.0, help="Refresh rate Hz (default 2)") + args = parser.parse_args() + + rclpy.init() + node = RegistryMonitor(filter_name=args.robot) + interval = 1.0 / max(args.rate, 0.1) + + try: + while rclpy.ok(): + rclpy.spin_once(node, timeout_sec=interval) + node.print_registry() + except KeyboardInterrupt: + pass + finally: + node.destroy_node() + rclpy.shutdown() + + +if __name__ == "__main__": + main() diff --git a/common/ros_packages/coordination/coordination_bringup/launch/gcs_gossip_bridge.launch.py b/common/ros_packages/coordination/coordination_bringup/launch/gcs_gossip_bridge.launch.py new file mode 100644 index 000000000..499a217bc --- /dev/null +++ b/common/ros_packages/coordination/coordination_bringup/launch/gcs_gossip_bridge.launch.py @@ -0,0 +1,27 @@ +"""Launches the GCS-side DDS Router bridging /gossip/peers between GCS domain (0) and gossip domain (99).""" + +import os + +from ament_index_python.packages import get_package_share_directory +from launch import LaunchDescription +from launch.actions import ExecuteProcess + + +def generate_launch_description(): + config = os.path.join( + get_package_share_directory('coordination_bringup'), + 'config', 'gcs_gossip_dds_router.yaml', + ) + return LaunchDescription([ + ExecuteProcess( + cmd=['ddsrouter', '-c', config], + env={ + **os.environ, + # ddsrouter runtime libs are installed under 
/usr/local/lib. + # Scope this path to ddsrouter to avoid changing ROS 2 RMW resolution. + 'LD_LIBRARY_PATH': '/usr/local/lib:' + os.environ.get('LD_LIBRARY_PATH', ''), + }, + output='screen', + name='gcs_gossip_dds_router', + ), + ]) diff --git a/common/ros_packages/coordination/coordination_bringup/launch/gossip.launch.xml b/common/ros_packages/coordination/coordination_bringup/launch/gossip.launch.xml new file mode 100644 index 000000000..c293be260 --- /dev/null +++ b/common/ros_packages/coordination/coordination_bringup/launch/gossip.launch.xml @@ -0,0 +1,22 @@ + + + + + + + + + + + + + + + + + + diff --git a/common/ros_packages/coordination/coordination_bringup/package.xml b/common/ros_packages/coordination/coordination_bringup/package.xml new file mode 100644 index 000000000..f9482e5a8 --- /dev/null +++ b/common/ros_packages/coordination/coordination_bringup/package.xml @@ -0,0 +1,24 @@ + + + + coordination_bringup + 0.0.0 + Gossip-protocol multi-agent coordination layer for AirStack + AirLab + BSD-3-Clause + + rclpy + nav_msgs + geometry_msgs + sensor_msgs + coordination_msgs + + ament_copyright + ament_flake8 + ament_pep257 + python3-pytest + + + ament_python + + diff --git a/common/ros_packages/coordination/coordination_bringup/resource/coordination_bringup b/common/ros_packages/coordination/coordination_bringup/resource/coordination_bringup new file mode 100644 index 000000000..e69de29bb diff --git a/common/ros_packages/coordination/coordination_bringup/scripts/gossip_node b/common/ros_packages/coordination/coordination_bringup/scripts/gossip_node new file mode 100755 index 000000000..8f7d7d464 --- /dev/null +++ b/common/ros_packages/coordination/coordination_bringup/scripts/gossip_node @@ -0,0 +1,3 @@ +#!/usr/bin/env python3 +from coordination_bringup.gossip_node import main +main() diff --git a/common/ros_packages/coordination/coordination_bringup/scripts/peer_registry_monitor 
b/common/ros_packages/coordination/coordination_bringup/scripts/peer_registry_monitor new file mode 100755 index 000000000..70ef2fcd4 --- /dev/null +++ b/common/ros_packages/coordination/coordination_bringup/scripts/peer_registry_monitor @@ -0,0 +1,3 @@ +#!/usr/bin/env python3 +from coordination_bringup.peer_registry_monitor import main +main() diff --git a/common/ros_packages/coordination/coordination_bringup/setup.py b/common/ros_packages/coordination/coordination_bringup/setup.py new file mode 100644 index 000000000..56df3cf88 --- /dev/null +++ b/common/ros_packages/coordination/coordination_bringup/setup.py @@ -0,0 +1,30 @@ +from setuptools import find_packages, setup + +package_name = 'coordination_bringup' + +setup( + name=package_name, + version='0.0.0', + packages=find_packages(exclude=['test']), + data_files=[ + ('share/ament_index/resource_index/packages', + ['resource/' + package_name]), + ('share/' + package_name, ['package.xml']), + ('share/' + package_name + '/launch', ['launch/gossip.launch.xml', 'launch/gcs_gossip_bridge.launch.py']), + ('share/' + package_name + '/config', ['config/gossip_dds_router.yaml', 'config/gossip_payloads.yaml', 'config/gcs_gossip_dds_router.yaml']), + ('lib/' + package_name, ['scripts/gossip_node', 'scripts/peer_registry_monitor']), + ], + install_requires=['setuptools'], + zip_safe=True, + maintainer='AirLab', + maintainer_email='airlab@andrew.cmu.edu', + description='Gossip-protocol multi-agent coordination layer for AirStack', + license='BSD-3-Clause', + tests_require=['pytest'], + entry_points={ + 'console_scripts': [ + 'gossip_node = coordination_bringup.gossip_node:main', + 'peer_registry_monitor = coordination_bringup.peer_registry_monitor:main', + ], + }, +) diff --git a/common/ros_packages/coordination/coordination_msgs/CMakeLists.txt b/common/ros_packages/coordination/coordination_msgs/CMakeLists.txt new file mode 100644 index 000000000..ffb9e8232 --- /dev/null +++ 
b/common/ros_packages/coordination/coordination_msgs/CMakeLists.txt @@ -0,0 +1,16 @@ +cmake_minimum_required(VERSION 3.8) +project(coordination_msgs) + +find_package(ament_cmake REQUIRED) +find_package(rosidl_default_generators REQUIRED) +find_package(geometry_msgs REQUIRED) +find_package(sensor_msgs REQUIRED) + +rosidl_generate_interfaces(${PROJECT_NAME} + "msg/PeerProfilePayload.msg" + "msg/PeerProfile.msg" + DEPENDENCIES geometry_msgs sensor_msgs +) + +ament_export_dependencies(rosidl_default_runtime) +ament_package() diff --git a/common/ros_packages/coordination/coordination_msgs/msg/PeerProfile.msg b/common/ros_packages/coordination/coordination_msgs/msg/PeerProfile.msg new file mode 100644 index 000000000..a1951c00e --- /dev/null +++ b/common/ros_packages/coordination/coordination_msgs/msg/PeerProfile.msg @@ -0,0 +1,27 @@ +# Gossip-protocol peer state broadcast. +# Each robot publishes one of these on /gossip/peers at its own rate. + +# Identity +string robot_name + +# Current GPS position from interface/mavros/global_position/raw/fix. +# gps_fix.header.stamp is overwritten by gossip_node at publish time +# (steady/monotonic clock, not the original NavSatFix sensor stamp) so receivers can +# enforce monotonic dedup / ordering across gossip ticks. +sensor_msgs/NavSatFix gps_fix + +# Heading in degrees clockwise from North (0-360), from compass_hdg +float64 heading + +# Current navigation goal – last pose in the global planner's published path. +# All-zero header stamp signals that no plan is available yet (null waypoint).
+geometry_msgs/PoseStamped waypoint + +# Gossip metadata +uint8 SOURCE_DIRECT = 0 +uint8 SOURCE_RELAYED = 1 +uint8 source # how this message reached us (unused in Phase 1, reserved) +uint8 relay_hops # number of relay hops (unused in Phase 1, reserved) + +# Arbitrary typed payloads – any number, any ROS message type +PeerProfilePayload[] payloads diff --git a/common/ros_packages/coordination/coordination_msgs/msg/PeerProfilePayload.msg b/common/ros_packages/coordination/coordination_msgs/msg/PeerProfilePayload.msg new file mode 100644 index 000000000..870b541fd --- /dev/null +++ b/common/ros_packages/coordination/coordination_msgs/msg/PeerProfilePayload.msg @@ -0,0 +1,10 @@ +# A single typed payload carried inside a PeerProfile. +# payload_type holds the fully-qualified ROS message type string, +# e.g. "nav_msgs/msg/OccupancyGrid". payload_data holds the +# serialized bytes produced by rclpy.serialization.serialize_message(). +# stamp is the time the source topic was last received — receivers can use +# this to detect stale payloads independently of the gossip message timestamp. 
+builtin_interfaces/Time stamp +string payload_name +string payload_type +uint8[] payload_data diff --git a/common/ros_packages/coordination/coordination_msgs/package.xml b/common/ros_packages/coordination/coordination_msgs/package.xml new file mode 100644 index 000000000..27108aa04 --- /dev/null +++ b/common/ros_packages/coordination/coordination_msgs/package.xml @@ -0,0 +1,23 @@ + + + + coordination_msgs + 0.0.0 + Custom message definitions for multi-agent gossip coordination layer + AirLab + BSD-3-Clause + + ament_cmake + rosidl_default_generators + + geometry_msgs + sensor_msgs + + rosidl_interface_packages + + rosidl_default_runtime + + + ament_cmake + + diff --git a/common/ros_packages/desktop_bringup/launch/gcs.launch.xml b/common/ros_packages/desktop_bringup/launch/gcs.launch.xml index f0876e990..aabf229ca 100644 --- a/common/ros_packages/desktop_bringup/launch/gcs.launch.xml +++ b/common/ros_packages/desktop_bringup/launch/gcs.launch.xml @@ -12,6 +12,36 @@ ?> + + + ?> + + + + + + + + + + + + + + + + + + + + + + - \ No newline at end of file + diff --git a/common/ros_packages/desktop_bringup/launch/robot.launch.xml b/common/ros_packages/desktop_bringup/launch/robot.launch.xml index 2d92337bb..83e4ab352 100644 --- a/common/ros_packages/desktop_bringup/launch/robot.launch.xml +++ b/common/ros_packages/desktop_bringup/launch/robot.launch.xml @@ -2,7 +2,8 @@ - + + @@ -11,8 +12,7 @@ - - + diff --git a/common/ros_packages/desktop_bringup/package.xml b/common/ros_packages/desktop_bringup/package.xml index c55551077..785d54734 100644 --- a/common/ros_packages/desktop_bringup/package.xml +++ b/common/ros_packages/desktop_bringup/package.xml @@ -13,6 +13,11 @@ tf2_ros xacro urdf + rclpy + visualization_msgs + action_relay + coordination_bringup + gcs_visualizer ament_lint_auto ament_lint_common diff --git a/common/ros_packages/desktop_bringup/params/domain_bridge.yaml b/common/ros_packages/desktop_bringup/params/domain_bridge.yaml index c07a5acb7..3eb8c3300 100644 
--- a/common/ros_packages/desktop_bringup/params/domain_bridge.yaml +++ b/common/ros_packages/desktop_bringup/params/domain_bridge.yaml @@ -97,6 +97,8 @@ topics: type: sensor_msgs/msg/BatteryState from_domain: 3 to_domain: 0 + + # Bridge "/clock" topic from domain ID 2 to domain ID 3, # Override durability to be 'volatile' and override depth to be 1 diff --git a/common/ros_packages/desktop_bringup/rviz/robot.rviz b/common/ros_packages/desktop_bringup/rviz/robot.rviz index b6c494c98..e231c869e 100644 --- a/common/ros_packages/desktop_bringup/rviz/robot.rviz +++ b/common/ros_packages/desktop_bringup/rviz/robot.rviz @@ -31,10 +31,6 @@ Panels: Name: Time SyncMode: 0 SyncSource: Foreground Background Cloud - - Class: rviz_behavior_tree_panel::BehaviorTreePanel - Name: BehaviorTreePanel - topic: /robot_1/behavior/behavior_tree_graphviz - zoom_factor: 0.1919851303100586 - Class: rviz_tasks_panel::TasksPanel Name: TasksPanel executor_0: tasks/takeoff @@ -74,7 +70,7 @@ Visualization Manager: Frame Timeout: 15 Frames: All Enabled: false - OS1_REV6_128_10hz___512_resolution: + lidar_mount: Value: false base_link: Value: true @@ -92,7 +88,7 @@ Visualization Manager: Value: true imu: Value: false - lidar: + ouster: Value: false look_ahead_point: Value: true @@ -130,8 +126,8 @@ Visualization Manager: map: base_link: base_link_body_body_link: - OS1_REV6_128_10hz___512_resolution: - lidar: + lidar_mount: + ouster: {} base_link_ZED_X: camera_left: @@ -182,7 +178,7 @@ Visualization Manager: Expand Link Details: false Expand Tree: false Link Tree Style: Links in Alphabetic Order - OS1_REV6_128_10hz___512_resolution: + lidar_mount: Alpha: 1 Show Axes: false Show Trail: false @@ -213,7 +209,7 @@ Visualization Manager: Alpha: 1 Show Axes: false Show Trail: false - lidar: + ouster: Alpha: 1 Show Axes: false Show Trail: false @@ -304,7 +300,7 @@ Visualization Manager: Durability Policy: Volatile History Policy: Keep Last Reliability Policy: Reliable - Value: sensors/lidar/point_cloud +
Value: sensors/ouster/point_cloud Use Fixed Frame: true Use rainbow: true Value: false @@ -857,8 +853,6 @@ Visualization Manager: Yaw: 2.878566026687622 Saved: ~ Window Geometry: - BehaviorTreePanel: - collapsed: false Displays: collapsed: false Front Right Depth: diff --git a/common/ros_packages/gui/rviz/rviz_tasks_panel/README.md b/common/ros_packages/gui/rviz/rviz_tasks_panel/README.md index 9613d38aa..4dcad9f26 100644 --- a/common/ros_packages/gui/rviz/rviz_tasks_panel/README.md +++ b/common/ros_packages/gui/rviz/rviz_tasks_panel/README.md @@ -12,7 +12,7 @@ with live feedback and result display. ## Overview The Tasks Panel replaces CLI-based action goal dispatch with a -graphical interface for all 9 AirStack task types. Each task type +graphical interface for all 8 AirStack task types. Each task type gets its own tab with auto-generated parameter widgets, an executor selector, and a feedback/result view. @@ -41,7 +41,7 @@ separate waypoint panel needed. ## Features -- **9 task tabs** with auto-generated goal parameter widgets +- **8 task tabs** with auto-generated goal parameter widgets - **Executor discovery** -- scans ROS 2 topics every 5 seconds to find running action servers - **Robot namespace selector** -- auto-populated from discovered @@ -80,7 +80,7 @@ in the air. | Navigate | `NavigateTask` | `global_plan` (Path), `goal_tolerance_m` | ✓ | | Exploration | `ExplorationTask` | `search_bounds` (Polygon), altitude/speed, `time_limit_sec` | ✓ | | Coverage | `CoverageTask` | `coverage_area` (Polygon), `line_spacing_m`, `heading_deg` | ✓ | -| Semantic Search | `SemanticSearchTask` | `query`, `search_area`, `confidence_threshold`, `target_count` | ✓ | +| Semantic Search | `SemanticSearchTask` | `query`, `background_queries`, `search_area`, `confidence_threshold` | ✓ | | Chat | `ChatTask` | `text`, `images` (file upload) | | | Fixed Trajectory | `FixedTrajectoryTask` | `trajectory_spec`, `loop` | ✓ | @@ -101,7 +101,7 @@ in the air. 
- `rviz_common` -- RViz2 panel base class - `pluginlib` -- plugin loading - `rclcpp` / `rclcpp_action` -- ROS 2 node and action client -- `task_msgs` -- action definitions for all 9 task types +- `task_msgs` -- action definitions for all 8 task types - `airstack_msgs` -- `FixedTrajectory` message - `geometry_msgs` / `nav_msgs` / `std_msgs` -- standard message types - `diagnostic_msgs` / `action_msgs` -- status introspection diff --git a/common/ros_packages/gui/rviz/rviz_tasks_panel/src/tasks_panel.cpp b/common/ros_packages/gui/rviz/rviz_tasks_panel/src/tasks_panel.cpp index 0c9376b08..861f42070 100644 --- a/common/ros_packages/gui/rviz/rviz_tasks_panel/src/tasks_panel.cpp +++ b/common/ros_packages/gui/rviz/rviz_tasks_panel/src/tasks_panel.cpp @@ -58,14 +58,13 @@ std::vector TasksPanel::getTaskDefs() }, true}, {"Semantic Search", "tasks/semantic_search", { {"query", "string", 0, 0, 0}, + {"background_queries", "string", 0, 0, 0}, {"search_area", "geometry_msgs/Polygon", 0, 0, 0}, {"min_altitude_agl", "float32", 3.0, 0.0, 500.0}, - {"max_altitude_agl", "float32", 10.0, 0.0, 500.0}, + {"max_altitude_agl", "float32", 15.0, 0.0, 500.0}, {"min_flight_speed", "float32", 1.0, 0.0, 50.0}, {"max_flight_speed", "float32", 3.0, 0.0, 50.0}, - {"time_limit_sec", "float32", 120.0, 0.0, 86400.0}, - {"confidence_threshold", "float32", 0.5, 0.0, 1.0}, - {"target_count", "int32", 1, 0, 10000}, + {"confidence_threshold", "float32", 0.95, 0.0, 1.0}, }, true}, {"Chat", "tasks/chat", { {"text", "text", 0, 0, 0}, @@ -1102,31 +1101,22 @@ void TasksPanel::onExecuteClicked() case 6: { // Semantic Search task_msgs::action::SemanticSearchTask::Goal goal; goal.query = getString(6, "query"); + goal.background_queries = getString(6, "background_queries"); goal.search_area = getPolygon(6, "search_area"); goal.min_altitude_agl = getFloat(6, "min_altitude_agl"); goal.max_altitude_agl = getFloat(6, "max_altitude_agl"); goal.min_flight_speed = getFloat(6, "min_flight_speed"); goal.max_flight_speed = 
getFloat(6, "max_flight_speed"); - goal.time_limit_sec = getFloat(6, "time_limit_sec"); goal.confidence_threshold = getFloat(6, "confidence_threshold"); - goal.target_count = getInt(6, "target_count"); doSendGoal(6, goal, [](const auto & fb) { - return QString("status: %1 | progress: %2 | best_conf: %3 | found: %4 | pos: (%5, %6, %7)") - .arg(QString::fromStdString(fb.status)) - .arg(fb.progress, 0, 'f', 2) - .arg(fb.best_confidence_so_far, 0, 'f', 3) - .arg(fb.objects_found_so_far) - .arg(fb.current_position.x, 0, 'f', 1) - .arg(fb.current_position.y, 0, 'f', 1) - .arg(fb.current_position.z, 0, 'f', 1); + return QString::fromStdString(fb.status); }, [](const auto & r) { - return QString("success: %1\nmessage: %2\nconfidence: %3\nobjects_found: %4") + return QString("success: %1\nmessage: %2\nconfidence: %3") .arg(r->success ? "true" : "false") .arg(QString::fromStdString(r->message)) - .arg(r->confidence, 0, 'f', 3) - .arg(r->objects_found); + .arg(r->confidence, 0, 'f', 3); }); break; } diff --git a/common/ros_packages/msgs/task_msgs/action/FixedTrajectoryTask.action b/common/ros_packages/msgs/task_msgs/action/FixedTrajectoryTask.action index 72dfce8cb..ce8b63bb4 100644 --- a/common/ros_packages/msgs/task_msgs/action/FixedTrajectoryTask.action +++ b/common/ros_packages/msgs/task_msgs/action/FixedTrajectoryTask.action @@ -1,5 +1,5 @@ # Follow a fixed trajectory specified by type and parameters. 
-# loop: if true, repeat the trajectory indefinitely until cancelled +# loop: if true, repeat the trajectory indefinitely until canceled # Goal airstack_msgs/FixedTrajectory trajectory_spec diff --git a/common/ros_packages/msgs/task_msgs/action/SemanticSearchTask.action b/common/ros_packages/msgs/task_msgs/action/SemanticSearchTask.action index abfbd4ea8..7c9bf60a3 100644 --- a/common/ros_packages/msgs/task_msgs/action/SemanticSearchTask.action +++ b/common/ros_packages/msgs/task_msgs/action/SemanticSearchTask.action @@ -1,19 +1,18 @@ # Search an area for a location or object described in natural language. # query: natural-language description of the target (e.g. "red car", "person waving") -# confidence_threshold: minimum match confidence to report a result (0.0-1.0) -# time_limit_sec: maximum task duration in seconds (0 = no limit) -# target_count: stop after finding this many matches (0 = find all within area/time) +# confidence_threshold: minimum match confidence to report a result (0.0-1.0, default 0.95) +# background_queries: optional comma-separated contrast classes for softmax +# normalization (e.g. "building,tree,ground"). Empty means no contrast set. 
# Goal string query +string background_queries geometry_msgs/Polygon search_area -float32 min_altitude_agl -float32 max_altitude_agl +float32 min_altitude_agl 3.0 +float32 max_altitude_agl 15.0 float32 min_flight_speed float32 max_flight_speed -float32 time_limit_sec -float32 confidence_threshold -int32 target_count +float32 confidence_threshold 0.95 --- # Result bool success diff --git a/common/ros_packages/msgs/task_msgs/package.xml b/common/ros_packages/msgs/task_msgs/package.xml index feeb41d95..8440930c2 100644 --- a/common/ros_packages/msgs/task_msgs/package.xml +++ b/common/ros_packages/msgs/task_msgs/package.xml @@ -5,7 +5,7 @@ 0.0.0 ROS 2 action message definitions for AirStack task executors AirLab CMU - TODO: License declaration + BSD-3-Clause ament_cmake rosidl_default_generators diff --git a/common/ros_packages/robot_descriptions/CMakeLists.txt b/common/ros_packages/robot_descriptions/CMakeLists.txt index e06c17c99..616ef402f 100644 --- a/common/ros_packages/robot_descriptions/CMakeLists.txt +++ b/common/ros_packages/robot_descriptions/CMakeLists.txt @@ -24,4 +24,5 @@ endif() install(DIRECTORY iris DESTINATION share/${PROJECT_NAME}) install(DIRECTORY launch DESTINATION share/${PROJECT_NAME}) + ament_package() diff --git a/common/ros_packages/robot_descriptions/iris/urdf/iris_with_sensors.pegasus.robot.urdf b/common/ros_packages/robot_descriptions/iris/urdf/iris_with_sensors.pegasus.robot.urdf index ebea35166..fed2decde 100644 --- a/common/ros_packages/robot_descriptions/iris/urdf/iris_with_sensors.pegasus.robot.urdf +++ b/common/ros_packages/robot_descriptions/iris/urdf/iris_with_sensors.pegasus.robot.urdf @@ -5,10 +5,10 @@ - + - + @@ -39,10 +39,10 @@ - - - - + + + + @@ -59,7 +59,7 @@ - + @@ -105,7 +105,7 @@ - + diff --git a/common/ros_packages/robot_descriptions/launch/robot_state_publisher.launch.py b/common/ros_packages/robot_descriptions/launch/robot_state_publisher.launch.py index 474614cb1..6e2f4fac5 100644 --- 
a/common/ros_packages/robot_descriptions/launch/robot_state_publisher.launch.py +++ b/common/ros_packages/robot_descriptions/launch/robot_state_publisher.launch.py @@ -49,9 +49,9 @@ def launch_setup(context, *args, **kwargs): relative_path ]) else: - # Use relative path within robot_bringup package urdf_file = PathJoinSubstitution([ - FindPackageShare('robot_bringup'), + FindPackageShare('robot_descriptions'), + 'iris', 'urdf', urdf_file_path ]) @@ -94,8 +94,8 @@ def generate_launch_description(): urdf_file_path_arg = DeclareLaunchArgument( 'urdf_file_path', - default_value='robot.urdf.xacro', - description='Path to the URDF/xacro file. Can be relative to robot_bringup/urdf/ or an absolute path' + default_value='iris_with_sensors.pegasus.robot.urdf', + description='Path to the URDF/xacro file. Bare filename → robot_descriptions/iris/urdf/.' ) publish_frequency_arg = DeclareLaunchArgument( diff --git a/docs/development/intermediate/contributing.md b/docs/development/intermediate/contributing.md index 421d50d4c..1c0716acd 100644 --- a/docs/development/intermediate/contributing.md +++ b/docs/development/intermediate/contributing.md @@ -33,10 +33,77 @@ Launches docs on https://localhost:8000. index.md # The documentation homepage. ... # Other markdown pages, images and other files. +## Branching Strategy + +This project follows a [Gitflow](https://nvie.com/posts/a-successful-git-branching-model/)-inspired branching model with two long-lived branches: + +- **`main`** — always reflects production-ready code. Only receives merges from `develop` (releases) and `hotfix/*` branches (urgent fixes). +- **`develop`** — the integration branch where all new features and non-urgent fixes are merged. + +### Feature branches + +For new features and non-urgent bug fixes, branch off `develop`: + +```bash +git checkout develop +git checkout -b feature/my-feature +``` + +Open your pull request targeting `develop`. 
PRs that target `main` from a feature branch (anything other than `develop` or `hotfix/*`) are automatically rejected (see [Branch Enforcement](#branch-enforcement) below). + +### Hotfix branches + +For urgent fixes that must go directly to production, branch off `main`: + +```bash +git checkout main +git checkout -b hotfix/my-fix +``` + +Open your pull request targeting `main`. After it merges, the fix is automatically synced back to `develop` (see [Automatic Sync](#automatic-sync-main--develop) below). + +## Branch Enforcement + +A GitHub Actions workflow (`.github/workflows/enforce-branch-targets.yml`) runs on every pull request and enforces the following rules: + +| Source branch | Allowed target | Blocked target | +|---|---|---| +| `feature/*`, `fix/*`, or any other branch besides `hotfix/*`, `develop`, and `main` | `develop` | `main` | +| `hotfix/*` | `main` | `develop` | +| `develop` | `main` | — | +| `main` | `develop` | — | + +If your PR targets the wrong base branch, the check will fail with a message explaining the violation. To fix it, close the PR and reopen it against the correct base branch. + +## Automatic Sync: main → develop + +To keep the git histories of `main` and `develop` related, a GitHub Actions workflow (`.github/workflows/sync-develop-from-main.yaml`) merges `main` back into `develop` and pushes directly after every push to `main`. The workflow bypasses `develop`'s ruleset using a `SYNC_PAT` secret owned by a repository admin. This ensures that release merge commits and hotfixes are always present in `develop`'s history, preventing divergence and conflicts in future releases. + +### VERSION handling on develop + +`develop` always carries a pre-release VERSION (e.g. `0.19.0-alpha.3`) so that it stays strictly greater than `main` and satisfies the `Verify VERSION is valid and incremented` check (`.github/workflows/check-version-increment.yml`), which requires every PR to bump `.env`'s `VERSION` above its base branch.
The sync workflow bumps `develop`'s VERSION as part of the merge using two rules: + +| Condition | Action | Example | +|---|---|---| +| `main`'s `x.y.z` ≥ `develop`'s base `x.y.z` (a release just landed on main) | Roll `develop` to the next minor's `alpha.0` | main `0.19.0`, develop `0.19.0-alpha.7` → develop `0.20.0-alpha.0` | +| `main`'s `x.y.z` < `develop`'s base (a hotfix landed on main) | Preserve `develop`'s pre-release channel and bump the counter | main `0.19.1`, develop `0.20.0-alpha.0` → develop `0.20.0-alpha.1` | + +The workflow auto-resolves conflicts on the `VERSION=` line of `.env` (keeps `develop`'s side, then applies the bump). Any other merge conflict aborts the sync and must be resolved manually: + +```bash +git checkout -B sync/main-to-develop origin/develop +git merge origin/main +# resolve conflicts +# manually bump VERSION in .env per the rules above +git commit +git push --force-with-lease origin sync/main-to-develop +# then open / update the PR targeting develop +``` + ## Merge -Submit a pull request. +Submit a pull request to the appropriate base branch per the [Branching Strategy](#branching-strategy) above. All tests must pass before merging. -Regression tests are run so that we don't break anything. \ No newline at end of file +Regression tests are run so that we don't break anything. diff --git a/docs/development/intermediate/testing/fixed_trajectory_testing.md b/docs/development/intermediate/testing/fixed_trajectory_testing.md new file mode 100644 index 000000000..8d5ca8491 --- /dev/null +++ b/docs/development/intermediate/testing/fixed_trajectory_testing.md @@ -0,0 +1,511 @@ +# Fixed-Trajectory Path-Tracker Benchmark + +This guide documents the **fixed-trajectory evaluation test suite** (`tests/test_fixed_trajectory.py`): why it exists, how it is implemented, how to run it, how to interpret results, and how to use it to **compare path trackers** without rewriting tests. 
+ +For the broader system-test suite, see [`tests/README.md`](../../../../tests/README.md). + +--- + +## Purpose + +AirStack's local controls stack separates **reference-path generation**, **path tracking**, and **low-level control**: + +```mermaid +flowchart LR + FT[FixedTrajectoryTask] --> TL[trajectory_library] + TL --> TC[trajectory_controller<br/>path tracker] + TC -->|~/tracking_point| PID[pid_controller] + PID --> FC[Flight computer / PX4] + FC --> ODOM[local_position/odom] + ODOM --> TC +``` + +The benchmark harness holds the **reference trajectory** and **flight procedure** constant so maintainers can: + +- **Swap or retune path trackers** and measure the same metrics every time. +- **Compare execution time** — how long does a standard pattern take in sim-time? +- **Compare tracking error** — mean/max cross-track error and path RMSE against a known ideal path. +- **Detect regressions** — action timeouts, stalls, or catastrophic drift via `trajectory_success` and assertion thresholds. + +Today the default tracker is the **sphere-intersection pure-pursuit** implementation in `trajectory_controller` + `trajectory_library`. The downstream `pid_controller` is held fixed so changes isolate tracker behavior. A different tracker can replace the `trajectory_controller` node (or its parameters) in launch; the pytest module does not need to change as long as `FixedTrajectoryTask` and odom topics remain the same. + +--- + +## What gets tested + +### Test module + +| Item | Value | +| ---- | ----- | +| File | [`tests/test_fixed_trajectory.py`](../../../../tests/test_fixed_trajectory.py) | +| Pytest mark | `autonomy` | +| Class | `TestFixedTrajectory` | +| Timeout | 2400 s per test class invocation | + +### Parametrization + +Each run sweeps: + +``` +(sim, num_robots, iteration, trajectory_type) +``` + +| Parameter | CLI flag | Default | +| --------- | -------- | ------- | +| Simulator | `--sim` | `msairsim,isaacsim` | +| Robot count | `--num-robots` | `1,3` | +| Repeat count | `--stress-iterations` | `1` | +| Trajectory type | `--trajectory-types` | `Circle,Figure8,Racetrack,Line` | + +!!! tip "Pin your sweep for local runs" + Defaults multiply configs and run for hours.
For development, always set explicit values: + + ```bash + airstack test -m autonomy \ + --sim isaacsim \ + --num-robots 1 \ + --stress-iterations 1 \ + --trajectory-types Circle \ + -v + ``` + +### Four-phase flight chain + +For every `(sim, num_robots, iteration, trajectory_type)` tuple the drone runs: + +| Phase | Test | Action | Pass criteria | +| ----- | ---- | ------ | ------------- | +| 1 | `test_px4_ready` | Wait for MAVROS + odom | All robots connected and publishing within 300 s wall-clock | +| 2 | `test_takeoff` | `TakeoffTask` to 10 m @ 1 m/s | Steady-state altitude within ±10% of target | +| 3 | `test_fixed_trajectory` | `FixedTrajectoryTask` | Cross-track mean < 5 m; records success + timing | +| 4 | `test_landing` | `LandTask` @ 1 m/s | Final altitude < 0.5 m | + +```mermaid +stateDiagram-v2 + [*] --> PX4Ready + PX4Ready --> Takeoff + Takeoff --> ExecuteTrajectory + ExecuteTrajectory --> Land : always + ExecuteTrajectory --> Land : even on trajectory failure + Land --> [*] + Takeoff --> Poisoned : takeoff fails + Land --> Poisoned : landing fails + Poisoned --> [*] : skip remaining types in env +``` + +**Chain guard:** a failure in phase 3 (`test_fixed_trajectory`) does **not** poison the environment — landing always runs so the drone returns to the ground before the next trajectory type. Failures in takeoff or landing **do** poison the env and skip subsequent trajectory types for that `(sim, num_robots, iteration)`. + +Phase 1 (`test_px4_ready`) runs once per env regardless of how many trajectory types are swept. + +--- + +## Reference trajectories + +The test uses the same patterns as the `FixedTrajectoryTask` action server in `trajectory_controller` (`fixed_trajectory_task.cpp`). Default parameters are defined in `TRAJECTORY_CONFIGS` inside `test_fixed_trajectory.py` and must stay in sync with the C++ generators. + +| Type | Parameters | Approx. 
path length | Expected sim-time* | +| ---- | ---------- | ------------------- | ------------------ | +| **Circle** | radius=10 m, velocity=2 m/s | ~63 m loop + return segments | **~45–50 s** | +| **Figure8** | length=15 m, width=8 m, v=2 m/s, max_accel=1 m/s² | ~100+ m | **~50–70 s** | +| **Racetrack** | length=30 m, width=10 m, v=3 m/s, turn_v=1.5 m/s | ~80+ m | **~30–50 s** | +| **Line** | length=20 m, v=2 m/s, max_accel=1 m/s² | 20 m | **~12–15 s** | + +\*Sim-time from odom timestamps; wall-clock varies with sim real-time factor (RTF). + +### Circle geometry (ideal path) + +Python `_ideal_circle()` mirrors `generate_circle()` in C++: + +- Start at origin, move to `(radius, 0, 0)`. +- Trace the circle in 10° steps. +- Return to `(radius, 0, 0)` then origin. + +The trajectory is defined in **`base_link`** at dispatch; the test transforms it to **world frame** using the robot pose snapshot (see below). + +--- + +## Metrics + +All metrics are recorded per robot as `robot_N.<metric>` in `tests/results/<timestamp>/metrics.json` and rolled up into `summary.txt`.
+ +### Flight metrics + +| Key | Unit | Better | Description | +| --- | ---- | ------ | ----------- | +| `ready_duration_sys_s` | s | lower | Wall-clock time until PX4/MAVROS ready | +| `takeoff_duration_sim_s` | s | lower | Sim-time from first motion to 95% of 10 m target | +| `altitude_error_m` | m | lower | Signed steady-state altitude error after takeoff | +| `overshoot_m` | m | lower | Unsigned overshoot above 10 m | +| `trajectory_success` | — | **higher** | `1.0` if action returned `success: true`, else `0.0` | +| `trajectory_execution_time_sim_s` | s | lower | Sim-time from action dispatch to completion | +| `cross_track_error_mean_m` | m | lower | Mean 2-D lateral distance to nearest ideal point | +| `cross_track_error_max_m` | m | lower | Worst 2-D lateral deviation | +| `path_rmse_m` | m | lower | 2-D RMSE against ideal polyline | +| `land_duration_sim_s` | s | lower | Sim-time from 80% peak descent to < 0.5 m | +| `final_altitude_m` | m | lower | Altitude when landing action completes | + +### How to read metrics when comparing trackers + +| Observation | Likely meaning | +| ----------- | -------------- | +| High `cross_track_error_max_m`, moderate mean | Turn/corner lag (common on Circle) | +| High mean and max | Tracker not keeping up or wrong frame | +| Long `trajectory_execution_time_sim_s` at same velocity | Virtual time stalling behind the robot | +| `trajectory_success = 0` | Action timed out or aborted — fix before interpreting error | +| Good mean, bad max | Occasional spikes — check sphere intersection on curves | + +### Observed baseline (Circle, Isaac Sim, 10 headless runs) + +Validated on branch `pkumaraTrajectoryTesting` — see `tests/results/2026-06-05_18-26-52/summary.txt`: + +| Metric | Typical value | +| ------ | ------------- | +| Tests | 40 passed / 0 failed (10 iter × 4 phases) | +| `trajectory_success` | yes (every run) | +| `trajectory_execution_time_sim_s` | ~46 s | +| `cross_track_error_mean_m` | ~0.98 m | +| 
`cross_track_error_max_m` | ~5.0 m | +| `path_rmse_m` | ~1.55 m | +| `final_altitude_m` | < 0.05 m | + +The assertion tolerance is **`CROSS_TRACK_TOLERANCE_M = 5.0`** in `test_fixed_trajectory.py` — intentionally loose while the default tracker matures. Tighten this constant as tracking improves. + +--- + +## Cross-track error algorithm + +The test measures **end-to-end** tracking (tracker + PID + sim physics), not the tracker in isolation. + +### Steps + +1. **Snapshot pose** — immediately before sending `FixedTrajectoryTask`, read one odom sample: `(x₀, y₀, z₀, yaw₀)`. +2. **Build ideal path** — generate waypoints in `base_link` using the same equations as C++ (`_ideal_circle`, `_ideal_figure8`, etc.). +3. **Transform to world** — rotate by `yaw₀` and translate by `(x₀, y₀, z₀)`. +4. **Capture odom** — background `ros2 topic echo --csv` on `/robot_N/interface/mavros/local_position/odom` for the action duration (timeout 180 s). +5. **Compute error** — for each odom sample, find the nearest ideal waypoint in **XY**; record distance statistics. + +Altitude is not part of cross-track error (these patterns are flat; altitude is checked at takeoff). + +### Why world-frame alignment matters + +`FixedTrajectoryTask` publishes the path in `base_link` relative to the robot at dispatch. Without transforming the ideal path to world frame, odom (world-fixed) would be compared against the wrong reference and error would be meaningless. 
+ +--- + +## Results pipeline + +Every `airstack test` run writes: + +``` +tests/results/<timestamp>/ +├── summary.txt ← open this first (human-readable) +├── results.xml ← JUnit pass/fail + durations +└── metrics.json ← structured metrics for diff tools +``` + +| Artifact | Producer | Use | +| -------- | -------- | --- | +| `summary.txt` | `tests/run_summary.py` (auto at session end via `conftest.py`) | Quick pass/fail + key numbers per trajectory type | +| `results.xml` | pytest `--junitxml` | CI, phase wall times | +| `metrics.json` | `MetricsRecorder` in `conftest.py` | Regression diffs | + +### Regenerate or inspect + +```bash +# Latest run +LATEST=$(ls -1t tests/results/ | head -1) + +# Human summary +cat "tests/results/$LATEST/summary.txt" + +# Regenerate summary manually +python3 tests/run_summary.py "tests/results/$LATEST/" + +# Markdown table of all metrics +python3 tests/parse_metrics.py --current "tests/results/$LATEST/" + +# Compare two tracker configs +python3 tests/parse_metrics.py \ + --current "tests/results/$NEW/" \ + --baseline "tests/results/$OLD/" \ + --threshold 20 \ + --output report.md +``` + +`parse_metrics.py` exits **1** when any metric regresses beyond the threshold percentage. + +--- + +## Running tests (complete CLI reference) + +### Prerequisites + +```bash +cd /path/to/AirStack +airstack setup +``` + +Required: + +- Docker daemon (user in `docker` group) +- NVIDIA GPU + `nvidia-container-toolkit` for sim tests +- Isaac Sim: `simulation/isaac-sim/docker/omni_pass.env` configured + +### Primary interface + +```bash +airstack test [pytest options] +``` + +All arguments are forwarded to pytest inside the containerized test runner (`tests/docker/`). + +### Rebuild after C++ changes + +```bash +airstack test -m build_packages -v +``` + +Always run this after modifying `trajectory_controller`, `trajectory_library`, or launch params before flight tests.
+ +### Fixed-trajectory commands + +```bash +# Quick Circle regression (recommended smoke test) +airstack test -m "build_packages or autonomy" \ + --sim isaacsim \ + --num-robots 1 \ + --stress-iterations 1 \ + --trajectory-types Circle \ + -v + +# All four trajectory types, ms-airsim +airstack test -m autonomy \ + --sim msairsim \ + --num-robots 1 \ + --stress-iterations 1 \ + --trajectory-types Circle,Figure8,Racetrack,Line \ + -v + +# Stress: 10 iterations (statistical stability) +airstack test -m autonomy \ + --sim isaacsim \ + --num-robots 1 \ + --stress-iterations 10 \ + --trajectory-types Circle \ + -v + +# Visual debug (sim GUI) +airstack test -m autonomy \ + --sim isaacsim \ + --num-robots 1 \ + --stress-iterations 1 \ + --trajectory-types Circle \ + --gui \ + -v + +# Run only the trajectory phase (debugging) +airstack test -m autonomy \ + --sim isaacsim \ + --num-robots 1 \ + --stress-iterations 1 \ + --trajectory-types Circle \ + -k test_fixed_trajectory \ + -v +``` + +### Global CLI options + +| Option | Default | Description | +| ------ | ------- | ----------- | +| `--sim` | `msairsim,isaacsim` | Comma-separated sim targets | +| `--num-robots` | `1,3` | Comma-separated robot counts | +| `--stress-iterations` | `1` | Repeat count per `(sim, num_robots)` | +| `--trajectory-types` | `Circle,Figure8,Racetrack,Line` | Trajectory sweep | +| `--gui` | off | Show simulator windows | +| `-v` | — | Verbose pytest | +| `-k EXPR` | — | Filter test names | + +### Direct pytest (local Python env) + +For faster iteration when editing test code: + +```bash +export AIRSTACK_ROOT=$(pwd) +pip install -r tests/requirements.txt + +pytest tests/ -m autonomy \ + --sim isaacsim \ + --num-robots 1 \ + --stress-iterations 1 \ + --trajectory-types Circle \ + -v +``` + +### CI: `/pytest` PR comment + +Core contributors can trigger runs by commenting on the PR: + +``` +/pytest -m "build_packages or autonomy" --sim isaacsim --num-robots 1 --stress-iterations 1 --trajectory-types 
Circle -v +``` + +The workflow auto-prepends `build_packages` when not already specified. + +--- + +## Comparing path trackers + +### What to change + +| Layer | Location | Examples | +| ----- | -------- | -------- | +| Tracker params | `robot/ros_ws/src/local/local_bringup/launch/local.launch.xml` (or `local_droan_cpu.launch.xml`) | `sphere_radius`, `look_ahead_time`, `search_ahead_factor`, `min_virtual_tracking_velocity` | +| Tracker implementation | Replace or fork `trajectory_controller` node | Alternative pure-pursuit, different intersection logic | +| Low-level control | Swap `pid_controller` for `attitude_controller` in launch | Changes end-to-end error, not tracker-only | + +Key `trajectory_controller` parameters today: + +| Param | Current value | Role | +| ----- | ------------- | ---- | +| `sphere_radius` | `2.0` | Lookahead sphere radius (m) | +| `look_ahead_time` | `1.0` | Look-ahead horizon for local planner feed | +| `virtual_tracking_ahead_time` | `0.5` | Virtual tracking search window | +| `min_virtual_tracking_velocity` | `0.5` | Below this, time-advance mode instead of sphere mode | +| `search_ahead_factor` | `1.5` | Multiplier on sphere radius when searching intersection | + +### Recommended A/B workflow + +```bash +# 1. Baseline run +airstack test -m "build_packages or autonomy" \ + --sim isaacsim --num-robots 1 --stress-iterations 5 \ + --trajectory-types Circle -v +BASELINE=$(ls -1t tests/results/ | head -1) + +# 2. Edit tracker params in local.launch.xml, rebuild +airstack test -m build_packages -v + +# 3. Candidate run +airstack test -m autonomy \ + --sim isaacsim --num-robots 1 --stress-iterations 5 \ + --trajectory-types Circle -v +CURRENT=$(ls -1t tests/results/ | head -1) + +# 4. 
Diff +python3 tests/parse_metrics.py \ + --current "tests/results/$CURRENT/" \ + --baseline "tests/results/$BASELINE/" \ + --threshold 20 +``` + +Focus on: `cross_track_error_mean_m`, `cross_track_error_max_m`, `path_rmse_m`, `trajectory_execution_time_sim_s`, `trajectory_success`. + +--- + +## Path tracker bug fixes (this PR) + +The benchmark exposed failures in the default sphere-intersection tracker. Fixes included: + +### 1. Wrong first-segment sphere test (`trajectory_library.cpp`) + +`get_waypoint_sphere_intersection()` checked whether the **end** of the first segment was inside the sphere, not the **interpolated point at `initial_time`**. On curved paths the robot's projection often lies mid-segment, causing false "no intersection" results. + +**Fix:** interpolate `wp_start` to `initial_time`, then test distance from that point to the sphere center. + +### 2. Controller stall (`trajectory_controller.cpp`) + +When intersection failed, `virtual_time` could freeze and the tracking point collapsed onto the robot — the drone **stalled on closed loops** (Circle). + +**Fixes:** + +- Fall back to `get_waypoint_distance_ahead()` when sphere intersection fails. +- On `AHEAD NOT VALID`, advance `virtual_time` by `time_multiplier × elapsed_sim_time`. +- Throttled `WARN` instead of per-tick logging. + +### 3. Missing waypoint times on merge (`trajectory_library.cpp`) + +`Trajectory::merge()` into an empty trajectory now calls `generate_waypoint_times()`. + +### 4. Parameter tuning + +`sphere_radius` increased from `1.0` to `2.0` in both `local.launch.xml` and `local_droan_cpu.launch.xml`.
+ +--- + +## Manual stack usage (without pytest) + +To fly a fixed trajectory interactively: + +```bash +cd /path/to/AirStack + +# Bring up Isaac Sim + robot (1 robot, headless) +COMPOSE_PROFILES=isaac-sim NUM_ROBOTS=1 airstack up + +# Takeoff (optional — or use RViz task panel) +docker exec -it airstack-robot-desktop-1 bash -c ' + source /opt/ros/jazzy/setup.bash && + source /root/AirStack/robot/ros_ws/install/setup.bash && + ros2 action send_goal /robot_1/tasks/takeoff task_msgs/action/TakeoffTask \ + "{target_altitude_m: 10.0, velocity_m_s: 1.0}" +' + +# Circle trajectory +docker exec -it airstack-robot-desktop-1 bash -c ' + source /opt/ros/jazzy/setup.bash && + source /root/AirStack/robot/ros_ws/install/setup.bash && + ros2 action send_goal --feedback /robot_1/tasks/fixed_trajectory \ + task_msgs/action/FixedTrajectoryTask \ + "{trajectory_spec: {type: Circle, attributes: [{key: radius, value: \"10.0\"}, {key: velocity, value: \"2.0\"}]}, loop: false}" +' + +# Land +docker exec -it airstack-robot-desktop-1 bash -c ' + source /opt/ros/jazzy/setup.bash && + source /root/AirStack/robot/ros_ws/install/setup.bash && + ros2 action send_goal /robot_1/tasks/land task_msgs/action/LandTask \ + "{velocity_m_s: 1.0}" +' + +airstack down +``` + +Action server: `/{robot_name}/tasks/fixed_trajectory` — see also [Tasks and Task Executors](../../../robot/autonomy/tasks.md). 
+ +--- + +## Troubleshooting + +| Symptom | Likely cause | Fix | +| ------- | ------------ | --- | +| Sentinel nodes missing | Workspace not built in container | `-m "build_packages or autonomy"` | +| PX4 ready timeout | Sim not running, GPU issue | Check `nvidia-smi`, Isaac `omni_pass.env` | +| `trajectory_success = 0` | Tracker stall or timeout | Check trajectory_controller logs; verify bug fixes applied | +| Cross-track error >> 5 m | Wrong tracker params or frame bug | Compare launch params; check world-frame transform | +| Tests run for hours | Default `--sim` and `--num-robots` sweep | Pin `--sim isaacsim --num-robots 1 --stress-iterations 1` | +| Unknown mark warning `autonomy` | Mark not in `pytest.ini` | Harmless; filter still works | + +--- + +## Source file reference + +| File | Role | +| ---- | ---- | +| [`tests/test_fixed_trajectory.py`](../../../../tests/test_fixed_trajectory.py) | Test module, ideal paths, metrics | +| [`tests/conftest.py`](../../../../tests/conftest.py) | Fixtures, `--trajectory-types`, summary hook, collection order | +| [`tests/run_summary.py`](../../../../tests/run_summary.py) | `summary.txt` generator | +| [`tests/parse_metrics.py`](../../../../tests/parse_metrics.py) | Markdown reports + regression diff | +| [`tests/pytest.ini`](../../../../tests/pytest.ini) | Registered marks | +| [`robot/.../fixed_trajectory_task.cpp`](../../../../robot/ros_ws/src/local/controls/trajectory_controller/src/fixed_trajectory_task.cpp) | C++ reference path generators | +| [`robot/.../trajectory_controller.cpp`](../../../../robot/ros_ws/src/local/controls/trajectory_controller/src/trajectory_controller.cpp) | Pure-pursuit path tracker | +| [`robot/.../trajectory_library.cpp`](../../../../robot/ros_ws/src/local/planners/trajectory_library/src/trajectory_library.cpp) | Trajectory math, sphere intersection | +| [`robot/.../local.launch.xml`](../../../../robot/ros_ws/src/local/local_bringup/launch/local.launch.xml) | Tracker + PID params | + +--- + +## 
Related documentation + +- [System tests overview (`tests/README.md`)](../../../../tests/README.md) +- [Trajectory Controller README](../../../../robot/ros_ws/src/local/controls/trajectory_controller/README.md) +- [Tasks and Task Executors](../../../robot/autonomy/tasks.md) +- [CI/CD orchestrator](../../../../tests/ci-cd-orchestrator.md) diff --git a/docs/development/intermediate/testing/index.md b/docs/development/intermediate/testing/index.md index 94cfd7d77..ec563045f 100644 --- a/docs/development/intermediate/testing/index.md +++ b/docs/development/intermediate/testing/index.md @@ -1 +1,41 @@ -# Testing \ No newline at end of file +# Testing + +AirStack uses several test layers: ROS 2 package tests (`colcon test`) and **system tests** under [`tests/`](../../../../tests/) at the repo root (pytest, full Docker stack). + +## System tests (`tests/`) + +The canonical reference is **[`tests/README.md`](../../../../tests/README.md)** (also included in the MkDocs site). In short: + +| Mark | Module | Role | +|------|--------|------| +| `build_docker` | `test_build_docker.py` | Docker image builds | +| `build_packages` | `test_build_packages.py` | `colcon build` inside containers | +| `liveliness` | `test_liveliness.py` | Containers, `/clock` readiness, tmux, sentinel ROS 2 nodes, compute, infra-only stability poll | +| `sensors` | `test_sensors.py` | Sim + robot stereo/depth Hz, filtered LiDAR (`echo --once` + validation script on Isaac), sim RTF, sensor stability time-series | +| `takeoff_hover_land` | `test_takeoff_hover_land.py` | Four-phase flight chain per configuration (takeoff → hover → land) | +| `autonomy` | `test_fixed_trajectory.py` | Fixed-pattern path-tracker benchmark (takeoff → trajectory → land) | + +Collection order is defined in `tests/conftest.py` (`build_docker` → `build_packages` → `liveliness` → `sensors` → `takeoff_hover_land` → `autonomy`).
Each mark's test **class** uses **class-scoped** `airstack_env`, so combining marks with **`or`** runs multiple full stack bring-ups per `(sim, num_robots, iteration)` — see *Bring-up scope* in `tests/README.md`. + +**Isaac Sim:** the `sensors` implementation batches `ros2 topic hz` on sim and robot paths and avoids `hz` on filtered `PointCloud2`; pytest enables `ENABLE_LIDAR` for the multi-drone Pegasus script. Details: **`tests/README.md`** → *Isaac Sim and the sensors mark*. + +### Fixed-trajectory path-tracker benchmark + +For the full guide — purpose, metrics, CLI, comparing trackers, bug fixes, and baselines — see **[Fixed-Trajectory Path-Tracker Benchmark](fixed_trajectory_testing.md)**. + +Quick smoke test: + +```bash +airstack test -m "build_packages or autonomy" \ + --sim isaacsim \ + --num-robots 1 \ + --stress-iterations 1 \ + --trajectory-types Circle \ + -v +``` + +## Other testing docs + +- [Testing frameworks](testing_frameworks.md) — `colcon test`, rostest patterns +- [Integration testing](integration_testing.md) +- [CI/CD](ci_cd.md) — pipeline overview diff --git a/docs/gcs/foxglove.md b/docs/gcs/foxglove.md new file mode 100644 index 000000000..ec2c6dd90 --- /dev/null +++ b/docs/gcs/foxglove.md @@ -0,0 +1,149 @@ +# GCS Foxglove Visualization + +The GCS runs a **Foxglove Studio** browser interface backed by a single ROS 2 node — `foxglove_visualizer_node` — that gathers per-robot data from the cross-domain bridge and republishes it on a small set of GCS-side topics. Foxglove subscribes to those topics and shows the fleet in 3D. + +This page describes what the node visualizes today, the topic naming convention, and where to edit when you want to change or add a marker type. For the gossip payload visualization (filtered rays, voxel maps, etc.) see [Coordination Payloads](../robot/autonomy/coordination/payloads.md). 
+ +![Full GCS Foxglove view — overhead-textured 3D panel on top, Robot Tasks panel and per-robot camera + depth feeds along the bottom](foxglove_full_screen.png) + +## Connecting to Foxglove and loading the custom layout + +The GCS container regenerates `/root/airstack_layout_num_robots_{N}.json` on every startup, where `{N}` is the current `NUM_ROBOTS`, using `gcs/foxglove_extensions/airstack_default.json` as the single-robot template (see `gcs/foxglove_extensions/render_layout.py`). The file lives only in the container — it's regenerated on startup and disappears on removal. + +To use the locally-rendered, `NUM_ROBOTS`-matched layout: + +1. In the Foxglove dashboard, click **Layouts** → **Import from file...**. +2. The file browser opens in `/root/` by default — select the `airstack_layout_num_robots_{N}.json` matching your `NUM_ROBOTS`. +3. Back on the dashboard, click **Open connection** and enter: + - `ws://localhost:8765` if Foxglove is running inside the GCS container + - `ws://localhost:8766` if Foxglove is running on the host +4. In the top-right corner, click the current layout name and select the imported layout from the dropdown. + +Foxglove keeps the imported layout in its IndexedDB and re-activates it on subsequent launches — re-import only when you change `NUM_ROBOTS` or edit the template. + +## What gets visualized + +The visualizer auto-discovers any robot whose topics match the AirStack convention (default prefix: `robot`).
For each discovered robot it subscribes to a fixed set of suffixes: + +| Suffix | Type | What it becomes on the GCS | +|---|---|---| +| `/interface/mavros/global_position/global` | `NavSatFix` | Robot location pin on the Map panel | +| `/odometry_conversion/odometry` | `Odometry` | Body-frame pose / orientation arrow | +| `/trajectory_controller/trajectory_vis` | `MarkerArray` | Live executing trajectory | +| `/global_plan` | `Path` | Global plan polyline | +| `/vdb_mapping/vdb_map_visualization` | `Marker` | Per-robot VDB occupancy mesh | + +All of these are published by individual robots in their **local `map` frame** (origin = drone boot position). The visualizer translates them into a single global `map` frame on the GCS using each robot's GPS boot offset, and merges everything into one `MarkerArray`. + +## Output topics + +| Topic | Type | What it carries | +|---|---|---| +| `/gcs/robot_markers` | `MarkerArray` | Combined per-robot markers (mesh, trajectory, plan, VDB) in global ENU | +| `/gcs/{robot_name}/location` | `NavSatFix` | Per-robot GPS rewritten to `frame_id='map'` — Foxglove's Map panel only accepts it that way | +| `/gcs/map_origin/location` | `NavSatFix` | Stationary fix at the configured `ORIGIN_LAT/LON` so the Map panel has a fixed reference | +| `/gcs/sim_ground` | `Marker` | Sim overhead-camera output rendered as a textured ground plane (sim only) | +| `/gcs/payload/{robot}/{name}` | varies | Per-robot gossip-payload republish (one topic per registered handler) | + + +## Discovery loop + +`_discover_robots` runs every 5 seconds. It calls `get_topic_names_and_types()`, regex-matches each suffix above, and creates a subscription if it sees a topic it doesn't already track. Robots that come online late are picked up on the next tick. + +To change which prefix is matched (e.g. you renamed robots from `robot_*` to `drone_*`), set the `robot_name_prefix` parameter on the visualizer node. 
+ +## How to modify or add a marker type + +The visualizer is designed to be extended in-place. The pattern, taken from `gcs/ros_ws/src/gcs_visualizer/gcs_visualizer/foxglove_visualizer_node.py`: + +### 1. Add a suffix and regex + +```python +PLAN_SUFFIX = '/global_plan' +self._plan_pattern = re.compile(rf'^/({re.escape(self._prefix)}_\w+){re.escape(PLAN_SUFFIX)}$') +``` + +### 2. Add state + +```python +self._global_plans = {} # robot_name -> latest msg +self._subscribed_plan = set() +``` + +### 3. Subscribe in `_discover_robots` + +```python +if topic not in self._subscribed_plan: + m = self._plan_pattern.match(topic) + if m and 'nav_msgs/msg/Path' in type_list: + name = m.group(1) + self.create_subscription( + Path, topic, + lambda msg, n=name: self._plan_callback(msg, n), + 10, # 10 = default RELIABLE for planning topics; + # SENSOR_QOS for high-rate sensor streams + ) + self._subscribed_plan.add(topic) +``` + + +### 4. Add a callback + +```python +def _plan_callback(self, msg: Path, robot_name: str): + self._global_plans[robot_name] = msg +``` + +### 5. Render in `_publish_markers` + +```python +plan = self._global_plans.get(robot_name) +boot = self._gps_boot.get(robot_name) +if plan is not None and boot is not None: + bx, by, bz = boot + line = Marker() + line.header.frame_id = 'map' + line.ns = f'{robot_name}_global_plan' + line.type = Marker.LINE_STRIP + for ps in plan.poses: + p = ps.pose.position + line.points.append(Point(x=p.x + bx, y=p.y + by, z=p.z + bz)) + array.markers.append(line) +``` + + +### 6. Bridge the source topic across DDS domains + +The visualizer can only subscribe to topics that crossed the DDS bridge. Add the source topic to `robot/ros_ws/src/autonomy_bringup/onboard_all/config/dds_router.yaml` under `allowlist`: + +```yaml +allowlist: + - name: "rt/$(env ROBOT_NAME)/your/new_topic" +``` + +Then restart the robot containers — the router only re-reads its allowlist on startup. 
+ +## Bridging a topic without writing a callback + +If your topic is already in a Foxglove-native type (`nav_msgs/Path`, `sensor_msgs/PointCloud2`, `visualization_msgs/MarkerArray`) and doesn't need the GPS offset, you can skip the visualizer entirely — just bridge it through the DDS router and add a panel in Foxglove pointing at the topic. The visualizer is only required when you need georeferencing or want everything to flow through the combined `/gcs/robot_markers` namespace. + +## Sim-only: textured overhead ground + +When running in sim, the visualizer also subscribes to `/sim/overhead/image` + `/sim/overhead/spec`. On receiving both, it builds one `TRIANGLE_LIST` marker on `/gcs/sim_ground` (latched) and tears down its subscriptions. See [2D World Map in Foxglove](../simulation/isaac_sim/overhead_camera.md) for the producer side. + +## Troubleshooting + +| Symptom | Likely cause | +|---|---| +| Robot doesn't appear at all | Source topic isn't in the DDS router allowlist, or the GPS topic isn't publishing yet | +| Robot appears at the wrong global location | First GPS fix had wrong altitude datum, or PX4 home wasn't set (sim) | +| Markers double-offset (visibly twice as far from where they should be) | Both `pose.position` and `points[]` were offset in the render loop | +| New marker added but never shows up | Discovery hasn't fired yet (5 s interval), or topic name doesn't match the regex | +| Foxglove "frame `map` does not exist" | The static `world → map` TF didn't reach Foxglove — restart the GCS container | + +## See also + +- [Coordination Payloads](../robot/autonomy/coordination/payloads.md) — extending visualization with gossip-broadcast payloads +- [Adding Waypoints and Geofences](waypoints_and_geofences.md) — interactive click-to-place editors +- [Overhead Camera](../simulation/isaac_sim/overhead_camera.md) — sim-side ground texture producer +- [`.agents/skills/visualize-in-foxglove`](../../.agents/skills/visualize-in-foxglove/SKILL.md) — agent 
workflow for adding a topic diff --git a/docs/gcs/foxglove_full_screen.png b/docs/gcs/foxglove_full_screen.png new file mode 100644 index 000000000..cf1e9ef13 Binary files /dev/null and b/docs/gcs/foxglove_full_screen.png differ diff --git a/docs/gcs/foxglove_publish_point.png b/docs/gcs/foxglove_publish_point.png new file mode 100644 index 000000000..6508d1bf1 Binary files /dev/null and b/docs/gcs/foxglove_publish_point.png differ diff --git a/docs/gcs/polygon_editor.png b/docs/gcs/polygon_editor.png new file mode 100644 index 000000000..7407bb213 Binary files /dev/null and b/docs/gcs/polygon_editor.png differ diff --git a/docs/gcs/waypoint_editor.png b/docs/gcs/waypoint_editor.png new file mode 100644 index 000000000..774625e15 Binary files /dev/null and b/docs/gcs/waypoint_editor.png differ diff --git a/docs/gcs/waypoints_and_geofences.md b/docs/gcs/waypoints_and_geofences.md new file mode 100644 index 000000000..530235df5 --- /dev/null +++ b/docs/gcs/waypoints_and_geofences.md @@ -0,0 +1,87 @@ +# Adding Waypoints and Geofences + +The GCS has two click-to-place panels in Foxglove: + +- **Waypoint Editor** — drop ordered 3D waypoints for the Navigate task. +- **Polygon Editor** — drop vertices of a 2D area to use as a **geofence** / search bounds for the Exploration and Coverage tasks. + +Both panels work the same way: enable click capture then click in the 3D panel to place points. + +![Robot Tasks panel — Navigate tab with the embedded Waypoint Editor](waypoint_editor.png) + +The two editors live inside the **Robot Tasks** panel — the Waypoint Editor appears under the **Navigate** tab, and the Polygon Editor appears under the **Exploration** and **Coverage** tabs (where it feeds the `search_bounds` field). + + + +*End-to-end demo: enabling click capture, dropping points, saving the set, then sending it to a robot.* + +## Place points + +1. In the editor panel, toggle **Enable click capture** on. +2. 
(Optional) In the top right of the **3D** panel, switch the camera to a top-down view to make it easier to drop points on the ground plane. +3. In the 3D panel toolbar, click the **Publish** tool (top right, ▷ icon) and switch its mode to **Publish 2D point (/clicked_point)** — this is what sends clicks to the editor. + + ![3D-mode toggle and the Publish 2D point option in the Foxglove 3D panel toolbar](foxglove_publish_point.png){ width="380" } + +4. Click anywhere in the 3D panel. A red marker appears at the click location. The waypoint editor draws spheres in click order; the polygon editor draws a closed loop. +5. The **Default altitude** field controls the `z` coordinate that gets attached to each click — set it once, then click freely on the ground. + +To add a point without clicking — e.g. for a precise coordinate — type values into the **+ Add** row and press Enter. + +## Reorder, edit, delete or duplicate + +- **Reorder** — drag a row up or down in the active list. The marker numbering updates immediately. +- **Edit a point** — click the row, edit the `x` / `y` / `z` fields, press Enter. +- **Delete a point** — click the ✕ on the row. +- **Clear all** — click **Clear**. Doesn't touch saved sets. +- **Duplicate** — click the ⧉ icon on a row to insert a copy of that point directly after it. Useful for laying down repeated patterns like a survey grid where each new point is a small offset from the previous one. + +For polygons specifically, vertex order defines the perimeter — reorder rows to flip the polygon shape. + + +## Save and load + +Saves let you name a set of points and bring them back later — the Robot Tasks panel reads the same saves, so a saved waypoint set can be selected as a Navigate target and a saved polygon can be selected as `search_bounds` for Exploration / Coverage. + +Two-step save flow: + +1. Type a name into the **save name…** field, then click **+ Add**. The save now exists in memory and shows up in the saves list. +2. 
Click **Save** on that row to persist it to disk. The button changes to **✓ Saved** when the file write succeeds. + +Other actions per saved row: + +- **Load** — replaces the active list with the saved one. Useful for re-editing a previously persisted set. +- **Delete** — removes the save from both memory and disk. + +Saves are written to host-mounted JSON files inside the GCS container: + +| Editor | File | +|---|---| +| Waypoints | `~/.airstack/gcs_waypoint_saves.json` | +| Polygons | `~/.airstack/gcs_polygon_saves.json` | + +These survive container restarts and can be hand-edited or version-controlled if you want to ship a curated mission set. + +## Use them in a task + +- **Navigate** — in the Robot Tasks panel, select the **Navigate** tab, pick a saved waypoint set from the **from:** dropdown (or **active** to use whatever's currently in the editor), pick a robot, click **Send**. The **Grab** button copies the current selection into the JSON `waypoints` field below. +- **Exploration** / **Coverage** — same flow, but the polygon save fills the `search_bounds` field. + +![Robot Tasks panel — Exploration tab with `search_bounds` populated from the Polygon Editor](polygon_editor.png) + +If a save doesn't appear in the dropdown after creating it, click the dropdown again to refresh — the Tasks panel re-reads the latched saves topic on focus. 
+ +## Troubleshooting + +| Symptom | Likely cause | +|---|---| +| Clicks don't register | Click capture isn't enabled, or the Publish tool isn't set to **Publish 2D point (/clicked_point)** | +| Clicks register but no marker shows | The 3D panel doesn't have the editor's marker topic enabled — open its Topics list and toggle on `/gcs/waypoints/markers` (or `/gcs/polygon/markers`) | +| Saves don't persist | Host volume `~/.airstack` not mounted on the GCS container | +| Save name silently overwrites | Both **+ Add** and **Save** overwrite by name — pick a unique name | +| Tasks panel doesn't see a new save | Re-open the dropdown to refresh, or restart the panel | + +## See also + +- [GCS Foxglove Visualization](foxglove.md) — the multi-robot fleet view alongside these editors +- [Coordination Payloads](../robot/autonomy/coordination/payloads.md) — for sharing custom data fleet-wide diff --git a/docs/getting_started/index.md b/docs/getting_started/index.md index e7bb64235..3b319ec00 100644 --- a/docs/getting_started/index.md +++ b/docs/getting_started/index.md @@ -1,5 +1,12 @@ # Getting Started +!!! tip "On Mac, Windows, or no GPU?" + + This page assumes a Linux desktop with an NVIDIA GPU. If that's not you, + use [AirStack on OSMO](../tutorials/airstack_on_osmo.md) instead — you + only need an SSH key, the `osmo` CLI, and VS Code or Cursor. No local + Docker, no NVIDIA drivers, no `airstack install`. + !!! warning "" AirStack is currently in ALPHA and only meant for internal usage. diff --git a/docs/robot/autonomy/coordination/index.md b/docs/robot/autonomy/coordination/index.md new file mode 100644 index 000000000..cabff92fd --- /dev/null +++ b/docs/robot/autonomy/coordination/index.md @@ -0,0 +1,66 @@ +# Coordination + +The coordination layer lets drones share state with each other and the GCS without a central broker. Each robot periodically broadcasts a `PeerProfile` — its GPS position, heading, current waypoint, and any custom data payloads — over a shared DDS domain.
Every robot and the GCS receives every other robot's profile directly. + +## Architecture + +``` +Robot 1 (domain 1) Shared gossip domain (99) GCS (domain 0) +┌──────────────────┐ ┌─────────────────────┐ ┌──────────────┐ +│ gossip_node │──────▶ │ /gossip/peers │ ──────▶│ GCS │ +│ publishes own │ │ │ │ visualizer │ +│ PeerProfile │ ◀────── │ (all robots + GCS │ └──────────────┘ +│ │ │ subscribe here) │ +│ local registry │ └─────────────────────┘ +│ (read-only) │ +└──────────────────┘ + ▲ + DDS Router + bridges domain 1 + ↔ domain 99 +``` + +Each robot builds a local registry of all known peers from incoming messages. The registry never leaves the robot — only each drone's own profile is transmitted. + +## PeerProfile + +Every message on `/gossip/peers` is a `PeerProfile` containing: + +| Field | Type | Description | +|---|---|---| +| `robot_name` | string | Unique robot identifier | +| `gps_fix` | NavSatFix | Current GPS position (also used as message ID for dedup) | +| `heading` | float64 | Compass heading, degrees CW from North | +| `waypoint` | PoseStamped | Current navigation goal (all-zeros = no plan) | +| `payloads` | PeerProfilePayload[] | Arbitrary serialized ROS messages | +| `source` | uint8 | `0` = direct, `1` = relayed (reserved) | +| `relay_hops` | uint8 | Hop count (reserved) | + +## Launch + +Coordination is included in the main autonomy bringup automatically. 
To launch standalone: + +```bash +ros2 launch coordination_bringup gossip.launch.xml +``` + +Key parameters: + +| Parameter | Default | Description | +|---|---|---| +| `robot_name` | `$ROBOT_NAME` | Robot identifier and topic namespace | +| `publish_rate` | `1.0` | Publish rate in Hz (wall-clock) | +| `gossip_domain` | `99` | Shared DDS domain | + +## Monitoring + +```bash +# Live peer registry in the terminal +ROS_DOMAIN_ID=99 ros2 run coordination_bringup peer_registry_monitor + +# Filter to one robot +ROS_DOMAIN_ID=99 ros2 run coordination_bringup peer_registry_monitor --robot robot_1 + +# Inspect raw messages +ros2 topic echo /gossip/peers +``` diff --git a/docs/robot/autonomy/coordination/payloads.md b/docs/robot/autonomy/coordination/payloads.md new file mode 100644 index 000000000..f1126c065 --- /dev/null +++ b/docs/robot/autonomy/coordination/payloads.md @@ -0,0 +1,111 @@ +# Payloads & Foxglove Visualization + +Payloads let you attach any ROS message to the `PeerProfile` so it gets broadcast to all peers and the GCS alongside GPS/heading. Common uses: sharing maps, frontier viewpoints, semantic rays, or any per-robot data you want visible fleet-wide. + +Payloads are **config-driven** — no changes to `gossip_node.py` are needed. + +## How payloads work + +1. `gossip_node` subscribes to each topic listed in `gossip_payloads.yaml` +2. On each 1 Hz publish tick, the latest message from each topic is serialized and attached to the `PeerProfile` +3. Before attaching, the payload is transformed from the robot's local odom frame → global ENU using the robot's boot GPS position +4. 
Peers and GCS receive the payload already in world frame — no transform needed on the receiving side + +## Step 1 — Add to gossip_payloads.yaml + +**File:** `common/ros_packages/coordination/coordination_bringup/config/gossip_payloads.yaml` + +```yaml +payload_topics: + - topic: "/{robot_name}/your/topic" + type: "your_msgs/msg/YourType" +``` + +- `{robot_name}` is substituted at runtime (e.g. → `/robot_1/your/topic`) +- Topics that haven't published yet are silently skipped +- Only `MarkerArray` and `PointCloud2` are automatically transformed to world frame; other types pass through as-is + +Rebuild after editing: + +```bash +bws --packages-select coordination_bringup +``` + +Verify the payload is being attached: + +```bash +ros2 topic echo /gossip/peers --field payloads +# or +ros2 run coordination_bringup peer_registry_monitor +``` + +## Step 2 — Visualize in Foxglove + +Payloads don't appear in Foxglove automatically — you need a handler in `payload_visualizer_node.py` that republishes the payload to its own topic. The manual steps are below; an [AI-native skill](#ai-native-skill) is also available to do both this and Step 1 in one go. + +**File:** `gcs/ros_ws/src/gcs_visualizer/gcs_visualizer/payload_visualizer_node.py` + +**1. Add a handler method:** + +```python +def _handle_your_payload(self, robot_name, msg, i, now): + # msg is already in global ENU / 'map' frame + # Apply display z-offset to align with the GCS datum + out = transform_point_cloud2(msg, 0.0, 0.0, self._display_z_offset()) + out.header.stamp = now + self._pub_for(f'/gcs/payload/{robot_name}/your_name', PointCloud2).publish(out) +``` + +**2. Register it in `PAYLOAD_HANDLERS`:** + +```python +PAYLOAD_HANDLERS = { + 'your_name': ('your_msgs/msg/YourType', _handle_your_payload), +} +``` + +**3. Rebuild GCS:** + +```bash +docker exec airstack-gcs-1 bash -c "bws --packages-select gcs_visualizer && sws" +``` + +or restart AirStack. 
+ +Foxglove will now show `/gcs/payload/{robot_name}/your_name` as a subscribable topic with full visualization controls. + +### AI-native skill + +The `attach-gossip-payload` skill automates Step 1 and Step 2 — yaml edit and the GCS handler in a single pass. In Claude Code: + +``` +Follow the attach-gossip-payload skill to add /{robot_name}/your/topic +of type your_msgs/msg/YourType and visualize it in Foxglove +``` + +See the full skill at `.agents/skills/attach-gossip-payload`. + +## Visualization options + +For `PointCloud2` payloads, you have two options: + +**Default — Foxglove GUI:** Publish as raw. Foxglove's panel settings control point size, shape, and color per-user. No code changes needed. + +**Preconfigured — fixed shape/size/color in code:** Convert to a `MarkerArray` in the handler. An example that renders a point cloud as 0.5 m cubes with per-point RGB colors: + +```python +def _handle_my_payload(self, robot_name, msg, i, now): + marker = point_cloud2_to_cube_marker( + msg, 0.0, 0.0, self._display_z_offset(), + ns=f'{robot_name}_my_payload', + marker_id=i * 100000, + stamp=now, + lifetime=Duration(sec=2, nanosec=0), + fallback_color=None, # uses per-point rgb field; set to (r, g, b, a) for a solid color + scale=0.5, # cube size in meters + ) + if marker is not None: + out = MarkerArray() + out.markers.append(marker) + self._pub_for(f'/gcs/payload/{robot_name}/my_payload', MarkerArray).publish(out) +``` diff --git a/docs/robot/autonomy/sensors/index.md b/docs/robot/autonomy/sensors/index.md index eb1958fba..a5142333a 100644 --- a/docs/robot/autonomy/sensors/index.md +++ b/docs/robot/autonomy/sensors/index.md @@ -1,8 +1,69 @@ -We'll fill this with different things like the ZED-X package, LiDAR, etc +# Sensor Packages +The **sensors** layer holds ROS 2 nodes that sit next to hardware or simulation bridges: light preprocessing, remapping, and calibration helpers so **perception** and downstream layers see stable topics. 
+ +## Overview + +The sensors layer is responsible for: + +- **Bridged sensor topics**: Normalizing names and QoS for data coming from Isaac Sim, MAVROS, or onboard drivers +- **Preprocessing**: Near-range LiDAR cleanup and similar filters that should run on the robot graph, not only in simulation +- **Supporting documentation**: Patterns for optional tools such as the gimbal extension in simulation ## Launch -Launch files are under `src/robot/autonomy/sensors/sensors_bringup/launch`. -The main launch command is `ros2 launch sensors_bringup sensors.launch.xml`. +Launch files are located under `robot/ros_ws/src/sensors/sensors_bringup/launch/`. + +The main launch command is: + +```bash +ros2 launch sensors_bringup sensors.launch.xml +``` + +The bringup group uses the `sensors` namespace under each robot; see that package for which nodes are started. + +## Key Topics + +### Outputs + +- `/{robot_name}/sensors/ouster/point_cloud` — Filtered `sensor_msgs/msg/PointCloud2` (xyz) after `lidar_point_cloud_filter`, when using the default Ouster-style names in config + +### Inputs + +- `/{robot_name}/sensors/ouster/point_cloud_raw` — Raw cloud from the simulator or driver (typical input to the LiDAR filter) +- Other hardware- or bridge-specific topics as wired in `sensors_bringup` + +Topic strings are parameterized with `$(env ROBOT_NAME)` in YAML; override `input_topic` / `output_topic` in the filter config if your stack uses different names. 
+ +## Modules + +- [**LiDAR point cloud filter**](#lidar-point-cloud-filter) (`lidar_point_cloud_filter`) — near-range sphere filter for `PointCloud2` +- [**Gimbal stabilizer**](gimbal.md) — gimbal extension usage in simulation + +## LiDAR point cloud filter (`lidar_point_cloud_filter`) {#lidar-point-cloud-filter} + +**Package:** `robot/ros_ws/src/sensors/lidar_point_cloud_filter` + +**Role:** Subscribe to a **raw** `sensor_msgs/msg/PointCloud2`, drop points whose distance from the cloud origin is below `near_range_m` (typical self-hit / multipath noise near the sensor), and republish a **clean xyz float32** cloud for mapping, exploration, RViz, and VDB. + +**Why it exists:** Isaac Sim’s RTX OmniLidar path exposes a **`min_range` / `nearRangeM`** hook in Pegasus (`spawn_rtx_lidar.py`), but applying near range **inside the simulator is unreliable** across Kit builds (attribute missing or ineffective). That behavior is documented as a **known limitation** in [Pegasus / Isaac Sim setup](../../../simulation/isaac_sim/pegasus_scene_setup.md#rtx-lidar-near-range). **AirStack’s supported approach** is to run this **robot-side** filter so the stack always sees a consistent filtered topic.
+ +**Defaults (configurable):** + +- Parameters and defaults: `robot/ros_ws/src/sensors/lidar_point_cloud_filter/config/lidar_point_cloud_filter.yaml` +- Typical topics: `/{robot_name}/sensors/ouster/point_cloud_raw` → `/{robot_name}/sensors/ouster/point_cloud` (override `input_topic` / `output_topic` if your bridge uses different names, for example under `sensors/lidar/...`) +- QoS: `qos_reliable` defaults to **true** to match common Isaac bridges and RViz; see the package README + +**Further detail:** `robot/ros_ws/src/sensors/lidar_point_cloud_filter/README.md` + +## Configuration + +- **Bringup:** `robot/ros_ws/src/sensors/sensors_bringup/config/` and launch XML under `sensors_bringup/launch/` +- **LiDAR filter:** `robot/ros_ws/src/sensors/lidar_point_cloud_filter/config/lidar_point_cloud_filter.yaml` (`near_range_m`, topics, QoS) + +## See Also +- [System Architecture](../system_architecture.md) — overall autonomy stack architecture +- [Perception](../perception/index.md) — downstream consumers of filtered sensor data +- [Integration Checklist](../integration_checklist.md) — adding new sensor-layer packages +- [Pegasus / Isaac Sim — RTX LiDAR and near range](../../../simulation/isaac_sim/pegasus_scene_setup.md#rtx-lidar-near-range) — simulation-side `min_range` limitation and why the filter runs on the robot diff --git a/docs/robot/autonomy/tasks.md b/docs/robot/autonomy/tasks.md index cc372bc23..20345ea92 100644 --- a/docs/robot/autonomy/tasks.md +++ b/docs/robot/autonomy/tasks.md @@ -2,7 +2,7 @@ ## Overview -Task executors are ROS 2 action servers that carry out discrete, goal-directed missions for the drone. Unlike perpetual nodes (state estimation, controllers, world models) that run continuously from launch to shutdown, a task executor only does work when an action client sends it a goal. The caller receives streaming feedback while the task runs, and a final result when it completes or is cancelled. 
+Task executors are ROS 2 action servers that carry out discrete, goal-directed missions for the drone. Unlike perpetual nodes (state estimation, controllers, world models) that run continuously from launch to shutdown, a task executor only does work when an action client sends it a goal. The caller receives streaming feedback while the task runs, and a final result when it completes or is canceled. See [System Architecture — Node Types](system_architecture.md#node-types-perpetual-vs-task-executor) for the broader context. @@ -47,20 +47,20 @@ geometry_msgs/Point current_position **Action server:** `/{robot_name}/tasks/fixed_trajectory` **Implemented by:** *(not yet implemented)* -Follow a pre-defined trajectory specified by shape type and parameters. With `loop: true`, the trajectory repeats until the task is cancelled. +Follow a pre-defined trajectory specified by shape type and parameters. With `loop: true`, the trajectory repeats until the task is canceled. #### Goal | Field | Type | Description | | ----- | ---- | ----------- | | `trajectory_spec` | airstack_msgs/FixedTrajectory | Trajectory type (e.g. `Circle`, `Figure8`, `Lawnmower`) and key-value attributes | -| `loop` | bool | If true, repeat trajectory indefinitely until cancelled | +| `loop` | bool | If true, repeat trajectory indefinitely until canceled | #### Result | Field | Type | Description | | ----- | ---- | ----------- | -| `success` | bool | True if trajectory completed (or cancelled normally when looping); false on error | +| `success` | bool | True if trajectory completed (or canceled normally when looping); false on error | | `message` | string | Completion reason | #### Feedback @@ -92,8 +92,8 @@ Navigate along a global plan path using local obstacle avoidance. 
This is the lo | Field | Type | Description | | ----- | ---- | ----------- | -| `success` | bool | True if the drone reached the goal; false if cancelled or error | -| `message` | string | `"Goal reached"`, `"Cancelled"`, or `"Node shutting down"` | +| `success` | bool | True if the drone reached the goal; false if canceled or error | +| `message` | string | `"Goal reached"`, `"Canceled"`, or `"Node shutting down"` | #### Feedback @@ -111,7 +111,7 @@ Navigate along a global plan path using local obstacle avoidance. This is the lo **Action server:** `/{robot_name}/tasks/exploration` **Implemented by:** `random_walk_planner` -Explore an area using random or systematic flight patterns. The task runs until the time limit is reached or it is cancelled; there is no natural spatial completion condition. +Explore an area using random or systematic flight patterns. The task runs until the time limit is reached or it is canceled; there is no natural spatial completion condition. #### Goal @@ -128,8 +128,8 @@ Explore an area using random or systematic flight patterns. The task runs until | Field | Type | Description | | ----- | ---- | ----------- | -| `success` | bool | True if time limit reached normally; false if cancelled or error | -| `message` | string | `"Time limit reached"`, `"Task cancelled"`, or error description | +| `success` | bool | True if time limit reached normally; false if canceled or error | +| `message` | string | `"Time limit reached"`, `"Task canceled"`, or error description | #### Feedback @@ -173,7 +173,7 @@ Perform a systematic lawnmower-pattern coverage survey of a polygonal area. 
Comp | Field | Type | Description | | ----- | ---- | ----------- | -| `success` | bool | True if area fully covered; false if cancelled | +| `success` | bool | True if area fully covered; false if canceled | | `message` | string | Completion reason | | `coverage_percentage` | float32 | Fraction of area covered at task end (0–100) | @@ -214,7 +214,7 @@ Search an area for a target described in natural language. Uses a vision-languag | Field | Type | Description | | ----- | ---- | ----------- | -| `success` | bool | True if at least one match above threshold was found; false if cancelled or not found | +| `success` | bool | True if at least one match above threshold was found; false if canceled or not found | | `message` | string | Completion reason | | `found_poses` | geometry_msgs/PoseArray | Poses of all matches found | | `confidence` | float32 | Confidence of the best match (0.0–1.0) | diff --git a/docs/robot/docker/index.md b/docs/robot/docker/index.md index e344996a1..db2fd8bcd 100644 --- a/docs/robot/docker/index.md +++ b/docs/robot/docker/index.md @@ -103,5 +103,6 @@ Key variables are set in the project's `.env` file and forwarded into the contai | `ROBOT_LAUNCH_PACKAGE` / `ROBOT_LAUNCH_FILE` | Top-level ROS 2 launch target | | `OFFBOARD_BASE_PORT` / `ONBOARD_BASE_PORT` | MAVLink UDP port base values (desktop/sim only) | | `ROBOT_NAME_MAP_CONFIG_FILE` | YAML mapping config used to resolve a name to `ROBOT_NAME` and `ROS_DOMAIN_ID` (default: `default_robot_name_map.yaml`) | +| `DEBUG_RVIZ` | If `true`, launches RViz alongside the robot via `desktop_bringup/robot.launch.xml` (default: `false`) | See [Robot Identity](robot_identity.md) for how `ROBOT_NAME` and `ROS_DOMAIN_ID` are derived automatically inside the container from these profiles. 
diff --git a/docs/simulation/isaac_sim/docker.md b/docs/simulation/isaac_sim/docker.md index 190651872..74edf49f8 100644 --- a/docs/simulation/isaac_sim/docker.md +++ b/docs/simulation/isaac_sim/docker.md @@ -87,7 +87,7 @@ command: > tmux new -d -s isaac; if [ $$AUTOLAUNCH = 'true' ]; then if [ \"${ISAAC_SIM_USE_STANDALONE}\" = 'true' ]; then - tmux send-keys -t isaac 'run_isaac_python /isaac-sim/AirStack/simulation/isaac-sim/launch_scripts/${ISAAC_SIM_SCRIPT_NAME}' ENTER + tmux send-keys -t isaac 'PYTHONPATH="$$ISAAC_SIM_PYTHONPATH" /isaac-sim/python.sh /isaac-sim/AirStack/simulation/isaac-sim/launch_scripts/${ISAAC_SIM_SCRIPT_NAME} --ext-folder ~/.local/share/ov/data/documents/Kit/shared/exts' ENTER else tmux send-keys -t isaac 'ros2 launch isaacsim run_isaacsim.launch.py install_path:=/isaac-sim gui:=\"${ISAAC_SIM_GUI}\" play_sim_on_start:=\"${PLAY_SIM_ON_START}\"' ENTER fi @@ -114,6 +114,12 @@ Key variables for Isaac Sim configuration: | `PLAY_SIM_ON_START` | Auto-play simulation when GUI opens | `true` | | `ISAAC_SIM_USE_STANDALONE` | Use standalone Python launch | `false` | | `ISAAC_SIM_SCRIPT_NAME` | Standalone script filename | - | +| `PX4_PHYSICS_HZ` | Physics step rate for PX4 SITL — also sets PX4 `IMU_INTEG_RATE` | `250` | +| `PX4_RENDERING_HZ` | Rendering frame rate for PX4 profiles (independent of physics) | `60` | +| `ARDUPILOT_PHYSICS_HZ` | Physics step rate for ArduPilot SITL | `800` | +| `ARDUPILOT_RENDERING_HZ` | Rendering frame rate for ArduPilot profiles | `120` | + +`PX4_PHYSICS_HZ` and `PX4_RENDERING_HZ` are set in the top-level `.env`. Pegasus defaults to 250 Hz but AirStack runs PX4 at **100 Hz** for near-real-time performance. See [Pegasus Scene Setup → Physics Rate](pegasus_scene_setup.md) for valid values and the full configuration flow. 
**Example overrides:** @@ -388,6 +394,11 @@ docker compose -f simulation/isaac-sim/docker/docker-compose.yaml build --no-cac - Inspect DDS: `fastdds.xml` configuration - Test connection: `ros2 topic list` in Isaac Sim container +**`rclpy` / `_rclpy_pybind11` warnings when starting Kit with `python.sh`:** + +- Jazzy’s `setup.bash` puts **Python 3.12** ROS packages on `PYTHONPATH`. Isaac’s `python.sh` uses **Kit Python (~3.10)**. Importing system `rclpy` from the wrong interpreter causes ABI errors in the log (topics from Omnigraph may still work). +- Standalone launch uses `PYTHONPATH="$ISAAC_SIM_PYTHONPATH"` in the **tmux** command (`$$ISAAC_SIM_PYTHONPATH` in `docker-compose.yaml` so Compose does not treat it as a host variable). See container `.bashrc` and `docker-compose.yaml`: it drops `lib/python3.12/site-packages` and appends the bridge’s internal `rclpy` path. + **Performance issues:** - Reduce scene complexity diff --git a/docs/simulation/isaac_sim/index.md b/docs/simulation/isaac_sim/index.md index 39f27258c..5f11f3ef5 100644 --- a/docs/simulation/isaac_sim/index.md +++ b/docs/simulation/isaac_sim/index.md @@ -19,7 +19,7 @@ Isaac Sim provides a wide range of high-fidelity, GPU-accelerated virtual sensor - Stereo and fisheye cameras -- LiDARs and Radars +- LiDARs and Radars (see [Pegasus scene setup](pegasus_scene_setup.md#rtx-lidar-near-range) for RTX OmniLidar in AirStack and near-range filtering on the robot stack) - IMUs, GPS, and odometry sensors diff --git a/docs/simulation/isaac_sim/overhead_camera.md b/docs/simulation/isaac_sim/overhead_camera.md new file mode 100644 index 000000000..a50a98301 --- /dev/null +++ b/docs/simulation/isaac_sim/overhead_camera.md @@ -0,0 +1,135 @@ +# 2D World Map in Foxglove + +There are two ways to render a 2D ground reference under your fleet in Foxglove's 3D panel: + +1. **Real-world satellite images** — for outdoor flights. 
Foxglove's built-in **Map** panel fetches satellite/road tiles from an online tile provider and pins your robots on them via GPS. +2. **Simulated overhead camera** — for sim. A static top-down orthographic camera captures the scene once and publishes it as an aerial image; the GCS renders it as a textured ground plane. + +Both paths render into the same global `map` frame, so robot markers, trajectories, and gossip payloads sit correctly on top in either case. + +## Real-World Satellite Images + +For outdoor flights, no special AirStack configuration is needed — the GCS publishes everything Foxglove's Map panel needs: + +- Each robot's GPS gets republished on `/gcs/{robot_name}/location` (with `frame_id='map'` so the Map panel accepts it). +- A stationary fix at `ORIGIN_LAT/LON` is published on `/gcs/map_origin/location` so the panel has a fixed anchor and doesn't auto-recenter on whichever robot moves first. + +In Foxglove: + +1. Add a **Map** panel. +2. Open its settings and pick a tile layer (e.g. **Custom (URL template)** with a satellite tile provider — Foxglove ships with OpenStreetMap by default; for satellite imagery you can use any standard `{z}/{x}/{y}` URL such as Esri's World Imagery). +3. Add `/gcs/map_origin/location` and each `/gcs/{robot}/location` topic. + +The Map panel will draw satellite tiles around the origin and show robot pins as they move. + +## Simulated Overhead Camera + +The overhead camera is a static, top-down orthographic camera that renders the simulated scene once and publishes it as an aerial image. The GCS picks it up and renders it as a textured ground plane in Foxglove's 3D panel — useful as a visual reference behind the robot markers, especially in scenes that don't have ground-truth satellite imagery. + +The scene is static, so the camera publishes briefly at startup, the GCS catches one valid frame, and both sides tear down their subscriptions. After that, the overhead is essentially free. 
+ +![Foxglove 3D panel showing the textured overhead under the simulated scene](overhead_in_foxglove.png) + +## Enabling it in a sim launch script + +Two helpers from `simulation/isaac-sim/utils/scene_prep.py` do all the work. Call both inside the post-load callback (after the stage is loaded but before drones spawn): + +```python +from utils.scene_prep import ( + add_orthographic_camera, add_overhead_camera_publisher, + get_stage_meters_per_unit, +) + +mpu, scene_scale_factor = get_stage_meters_per_unit(stage) + +cam_path = add_orthographic_camera( + stage, + prim_path="/World/MapCamera", + altitude_m=OVERHEAD_ALTITUDE_M, + coverage_m=OVERHEAD_COVERAGE_M, + scene_scale_factor=scene_scale_factor, +) + +add_overhead_camera_publisher( + parent_graph_path="/World/MapCameraGraph", + camera_prim_path=cam_path, + topic="/sim/overhead/image", + spec_topic="/sim/overhead/spec", + frame_id="map", + coverage_m=OVERHEAD_COVERAGE_M, + pixels_per_meter=OVERHEAD_PX_PER_METER, + domain_id=0, +) +``` + +The three constants (`OVERHEAD_ALTITUDE_M`, `OVERHEAD_COVERAGE_M`, `OVERHEAD_PX_PER_METER`) are the only knobs you typically need to adjust. Defaults and effect: + +| Constant | Default | What it controls | +|---|---|---| +| `OVERHEAD_ALTITUDE_M` | `150.0` | Camera height above world origin (m). | +| `OVERHEAD_COVERAGE_M` | `200.0` | Side length of the captured square (m). | +| `OVERHEAD_PX_PER_METER` | `4.0` | Texture density. Increase for sharper text/markings; capped at `max_resolution=2048`. | + +### Re-centering or transforming the camera + +By default the camera sits over world origin `(0, 0)`. 
For an off-origin area of interest, pass `center_x_m` / `center_y_m` to both helpers — they take care of the camera xform and the spec topics the GCS reads: + +```python +CENTER_X_M, CENTER_Y_M = 50.0, -25.0 + +cam_path = add_orthographic_camera( + stage, prim_path="/World/MapCamera", + altitude_m=OVERHEAD_ALTITUDE_M, + coverage_m=OVERHEAD_COVERAGE_M, + scene_scale_factor=scene_scale_factor, + center_x_m=CENTER_X_M, + center_y_m=CENTER_Y_M, +) + +add_overhead_camera_publisher( + parent_graph_path="/World/MapCameraGraph", + camera_prim_path=cam_path, + topic="/sim/overhead/image", + spec_topic="/sim/overhead/spec", + center_x_topic="/sim/overhead/center_x", + center_y_topic="/sim/overhead/center_y", + frame_id="map", + coverage_m=OVERHEAD_COVERAGE_M, + center_x_m=CENTER_X_M, + center_y_m=CENTER_Y_M, + pixels_per_meter=OVERHEAD_PX_PER_METER, + domain_id=0, +) +``` + +## GCS side + +The GCS rendering is handled by `_build_sim_ground_marker` in `gcs/ros_ws/src/gcs_visualizer/gcs_visualizer/foxglove_visualizer_node.py`. It: + +1. Downsamples the source image to a coarse triangle grid (default 0.8 cells/m, capped at 384×384) — Foxglove's 3D panel struggles with dense per-pixel meshes, but a coarse triangle grid renders smoothly. +2. Publishes one `Marker` of type `TRIANGLE_LIST` on `/gcs/sim_ground`. + +### Hide the overhead in Foxglove + +The ground plane is just a marker on `/gcs/sim_ground`, so you can toggle it on or off in the 3D panel settings without touching any code: + +1. Click the gear icon on the **3D** panel. +2. Open **Topics → `/gcs/sim_ground`**. +3. Toggle visibility off to hide it. + +### Sharper rendering + +The default downsample (0.8 cells/m, cap 384) is conservative. To raise the rendered resolution, override two parameters on `foxglove_visualizer_node` in `gcs/ros_ws/src/gcs_visualizer/launch/gcs_visualizer.launch.xml`: + +```xml + + +``` + +To change other rendering behavior (alpha, lighting), edit `_build_sim_ground_marker` directly. 
To force a re-render, restart the GCS visualizer. + +## See also + +- [Spawning Drones](spawning_drones.md) — authoring a full multi-drone launch script +- [Pegasus Scene Setup](pegasus_scene_setup.md) — single-drone scene authoring +- [GCS Foxglove Visualization](../../gcs/foxglove.md) — what the visualizer publishes alongside the ground marker diff --git a/docs/simulation/isaac_sim/overhead_in_foxglove.png b/docs/simulation/isaac_sim/overhead_in_foxglove.png new file mode 100644 index 000000000..6d73704fd Binary files /dev/null and b/docs/simulation/isaac_sim/overhead_in_foxglove.png differ diff --git a/docs/simulation/isaac_sim/pegasus_scene_setup.md b/docs/simulation/isaac_sim/pegasus_scene_setup.md index ba3464758..da4293dc7 100644 --- a/docs/simulation/isaac_sim/pegasus_scene_setup.md +++ b/docs/simulation/isaac_sim/pegasus_scene_setup.md @@ -58,7 +58,7 @@ Example scripts are provided in `simulation/isaac-sim/launch_scripts/`. | Function | Purpose | |----------|---------| -| `scale_stage_prim(stage, prim_path, scale_factor)` | Applies a uniform XYZ scale transform to the prim at `prim_path`, clearing any existing xform ops first. Use `0.01` for Nucleus assets authored in centimetres; use `1.0` for assets already in metres. | +| `scale_stage_prim(stage, prim_path, scale_factor)` | Applies a uniform XYZ scale transform to the prim at `prim_path`, clearing any existing xform ops first. Use `0.01` for Nucleus assets authored in centimeters; use `1.0` for assets already in meters. | | `add_colliders(stage_prim)` | Recursively walks every child of `stage_prim` and applies `UsdPhysics.CollisionAPI` to each `UsdGeom.Mesh`. **Must be called or drones fall through the floor.** Skips prims that already have the API. | | `add_dome_light(stage, intensity=3500, exposure=-3)` | Adds a hemisphere light at `/World/DomeLight` (or updates it if it already exists). Pass `intensity` / `exposure` keyword arguments to override the defaults. 
| | `save_scene_as_contained_usd(source_usd_url, output_dir)` | Copies the stage and all its dependencies (textures, MDL materials) from a Nucleus `omniverse://` URL into a local directory via `omni.kit.usd.collect.Collector`. Set `SAVE_SCENE_TO = None` in your script to skip this step. | @@ -110,6 +110,15 @@ Scripts must live in `simulation/isaac-sim/launch_scripts/`. Set `ISAAC_SIM_SCRI | `scene_prep` not found / `ModuleNotFoundError` | `utils/` not on `sys.path` in Isaac Sim's Python | Use `sys.path.insert` to add the `utils/` directory before importing `scene_prep` | | Drone spawns at wrong height in cm-scale scene | Spawn coordinates not converted to stage space | Multiply metric `init_pos` values by `scene_scale` from `get_stage_meters_per_unit` | +## RTX OmniLidar and near range (`min_range`) — known limitation {#rtx-lidar-near-range} + +AirStack’s Pegasus fork (Isaac Sim **5.1+**) wires **RTX OmniLidar** through OmniGraph helpers such as `add_rtx_lidar_subgraph` in `pegasus.simulator.ogn.api.spawn_rtx_lidar` (used from `simulation/isaac-sim/launch_scripts/example_one_px4_pegasus_launch_script.py`, `example_multi_px4_pegasus_launch_script.py`, etc.). Recent work in this repo switched those scripts from the legacy Ouster graph path to this **RTX** API and reconciled ROS topic names (e.g. raw cloud on `…/sensors/ouster/point_cloud_raw`, filtered consumer topic `…/sensors/ouster/point_cloud`). + +### `min_range` → `nearRangeM` in simulation + +The spawn code maps the Python argument **`min_range`** to the OmniLidar prim attribute **`omni:sensor:Core:nearRangeM`** when it exists, and logs a warning when it does not. The module docstring in `spawn_rtx_lidar.py` states the reality: some Kit builds only express echo spacing in the **vendor lidar JSON profile**, not as a writable **Core** prim attribute, so **setting near range in Isaac does not consistently remove short-range returns**. 
+ +**Known bug / policy:** Do **not** rely on Isaac-only `min_range` / `nearRangeM` as your primary near-field cleanup. Use the **robot-side** package **`lidar_point_cloud_filter`** (see [Sensors — LiDAR filter](../../robot/autonomy/sensors/index.md#lidar-point-cloud-filter)), which applies a configurable **`near_range_m`** sphere filter in the ROS graph and publishes a stable cloud for VDB, exploration, and RViz. ## Known bugs and workarounds for Scripted Scene Generation @@ -136,4 +145,55 @@ The drone not arming/taking off can be a symptom of the PX4Multirotor Node not b - To fix, launch the simulator with `airstack up isaac-sim`, in the toolbar, click Window -> Extensions -> Third Party, search for "pegasus", select the "PEGASUS SIMULATOR" and enable "AUTOLOAD" - Restart your docker container by running `airstack down isaac-sim && airstack up isaac-sim` and the extension should load every time now. ---- \ No newline at end of file +--- + +## Physics Rate and IMU_INTEG_RATE + +The Isaac Sim physics step rate is the dominant factor in simulation performance. Pegasus defaults to **250 Hz**, which runs well below real-time when a full sensor suite is active. AirStack lowers this to **100 Hz** by reducing PX4's `IMU_INTEG_RATE` to match — since `IMU_INTEG_RATE` controls how often Isaac Sim steps the physics world in lockstep with PX4. 100 Hz is the minimum viable rate: PX4's EKF2 estimator has a fixed 10 ms update period, so `IMU_INTEG_RATE` must be at least 100 Hz or the state estimator falls behind sensor data. + +### Configuration + +Two variables in the top-level `.env` control the rates: + +```bash +# Pegasus physics/rendering rates (read by params.py; PX4_IMU_INTEG_RATE synced automatically). +# Minimum frequency supported by PX4 is 100 Hz. 
+PX4_PHYSICS_HZ="100" +PX4_RENDERING_HZ="30" +``` + +- **`PX4_PHYSICS_HZ`** — Sets `physics_dt = 1 / PX4_PHYSICS_HZ` in Isaac Sim's physics scene, and automatically syncs PX4's `IMU_INTEG_RATE` parameter to the same value via `PX4LaunchTool` → `px4-rc.simulator`. That script is patched within the Docker image to read the environment variable and set the `IMU_INTEG_RATE` parameter. +- **`PX4_RENDERING_HZ`** — Sets the rendering frame rate independently of physics. 30 Hz rendering has no effect on physics accuracy or PX4 behavior, but does slightly affect performance due to resource usage. + +### Valid values + +PX4's documented presets for `IMU_INTEG_RATE` are **100, 200, 250, 400 Hz**. The minimum is **100 Hz** — the EKF2 estimator runs at 100 Hz (10 ms period) and `IMU_INTEG_RATE` must be at least this fast. Values below 100 Hz are accepted by the firmware but cause attitude controller oscillation and are not recommended. + +- `100` — AirStack default; stable, best real-time performance with sensors +- `200` — good balance, but the simulation runs slower than at `100` +- `250` — Pegasus/PX4 SITL default; best control quality but slower than real-time with sensors +- `400` — maximum recommended; untested + +--- + +## Performance Benchmarking + +A benchmarking suite in `simulation/isaac-sim/extensions/PegasusSimulator/benchmarking/` measures the real-time factor (RTF = simulated seconds / wall-clock seconds) across physics rates, scene complexities, and drone backends. 
+ +### Running the suite + +```bash +# Full suite (24 scripts) — from outside the container +docker exec isaac-sim bash -c \ + "/isaac-sim/python.sh /isaac-sim/AirStack/simulation/isaac-sim/extensions/PegasusSimulator/benchmarking/run_all.py" + +# Single script +docker exec isaac-sim bash -c \ + "/isaac-sim/python.sh /isaac-sim/AirStack/simulation/isaac-sim/extensions/PegasusSimulator/benchmarking/1_cube_no_pegasus.py" + +# Subset by number +docker exec isaac-sim bash -c \ + "/isaac-sim/python.sh /isaac-sim/AirStack/simulation/isaac-sim/extensions/PegasusSimulator/benchmarking/run_all.py --scripts 1-8" +``` + +See `benchmarking/README.md` for the full script inventory, output format, and analysis plots. \ No newline at end of file diff --git a/docs/simulation/isaac_sim/spawning_drones.md b/docs/simulation/isaac_sim/spawning_drones.md new file mode 100644 index 000000000..f5919ecec --- /dev/null +++ b/docs/simulation/isaac_sim/spawning_drones.md @@ -0,0 +1,164 @@ +# Spawning Drones + +The reference launch scripts under `simulation/isaac-sim/launch_scripts/` cover the progression from a single drone in an empty world up to multiple drones in a custom imported scene with per-drone GPS origins: + +| Script | Purpose | +|---|---| +| `barebones_pegasus_launch.py` | Minimal Pegasus boilerplate. Single drone, default environment, no scene import. Use as a template for new launch scripts. | +| `example_one_px4_pegasus_launch_script.py` | One PX4 drone with the standard sensor stack (ZED stereo, optional Ouster lidar) in the default environment. | +| `example_multi_px4_pegasus_launch_script.py` | `NUM_ROBOTS` drones spawned in a row in the default environment. Each drone gets its own ROS domain id (`1..N`). | +| `example_multi_drone_scene_import.py` | `NUM_ROBOTS` drones in an **imported scene** (USD from a Nucleus server) with per-drone GPS homes set via `gps_utils.set_gps_origins`. Use this as the starting point for any custom scene. 
| + +The first three are vanilla Pegasus patterns; this page focuses on the multi-drone + custom-scene case where you also need correct GPS homes. + +## The DRONE_CONFIGS pattern + +`example_multi_drone_scene_import.py` declares all per-drone settings in a single list: + +```python +DRONE_CONFIGS = [ + {"domain_id": 1, "x_m": -3.0, "y_m": 3.5, "z_m": 0.15, "orient": [0, 0, 0, 1]}, + {"domain_id": 2, "x_m": 3.0, "y_m": 3.0, "z_m": 0.15, "orient": [0, 0, 0, 1]}, +] +``` + +| Field | Purpose | +|---|---| +| `domain_id` | ROS domain id and PX4 vehicle id. The robot container with `ROS_DOMAIN_ID=1` will see this drone. | +| `x_m`, `y_m`, `z_m` | World-frame spawn position in meters. Convention: `+X = East`, `+Y = North`, `+Z = Up`. | +| `orient` | Spawn orientation quaternion `[x, y, z, w]`. | + +To add another drone, append an entry with a fresh `domain_id` and a non-overlapping spawn position. Make sure the corresponding robot container is launched with the same `ROS_DOMAIN_ID` (`NUM_ROBOTS=N airstack up robot-desktop`). + +## Per-drone GPS home — `gps_utils` + +PX4 needs a GPS home per vehicle. `simulation/isaac-sim/launch_scripts/gps_utils.py` derives one from each drone's world-frame spawn position so all drones share a consistent geographic anchor and end up at distinct GPS coordinates spaced according to their spawn offsets. + +```python +from gps_utils import set_gps_origins, DEFAULT_WORLD_ORIGIN + +set_gps_origins(DRONE_CONFIGS) # call once before spawning vehicles +``` + +`set_gps_origins` does two things per drone: + +1. Calls `compute_gps_origin(x_m, y_m, z_m, world_origin)` to convert the spawn offset into `(lat, lon, alt)`. The conversion is a flat-Earth approximation — accurate at scene scales (a few hundred meters), not at continental scale. +2. Writes `PX4_HOME_LAT_`, `PX4_HOME_LON_`, `PX4_HOME_ALT_` into the process environment. 
The Pegasus PX4 OmniGraph node reads these when building each drone's `PX4MavlinkBackendConfig`, which passes them to the PX4 SITL subprocess as `PX4_HOME_LAT/LON/ALT`. + +### World anchor + +The world origin maps to `DEFAULT_WORLD_ORIGIN = (38.736832, -9.137977, 90.0)` — Lisbon, matching the Pegasus default. Override it for a scene set elsewhere: + +```python +set_gps_origins(DRONE_CONFIGS, world_origin=(40.4433, -79.9436, 280.0)) # Pittsburgh +``` + +The anchor only affects the geographic location reported via GPS; nothing in the scene moves. Pick something close to where you want the drones to "be" — Foxglove's Map panel will center on it, and any GPS-referenced inputs to your stack will be relative to it. + +## Scene prep helpers — `scene_prep.py` + +`simulation/isaac-sim/utils/scene_prep.py` is the small toolbox of stage preparation helpers `example_multi_drone_scene_import.py` uses inside its post-load callback (after the stage is loaded, before drones spawn). The full file has more — what's documented here is what you'll reach for in 95% of scenes. + +```python +from utils.scene_prep import ( + get_stage_meters_per_unit, scale_stage_prim, add_colliders, + add_dome_light, save_scene_as_contained_usd, + add_orthographic_camera, add_overhead_camera_publisher, +) + +mpu, scene_scale_factor = get_stage_meters_per_unit(stage) +``` + +### Scaling — `scale_stage_prim` + +USD scenes are authored at all sorts of stage units. To apply a uniform scale to the imported stage root once, before drones spawn: + +```python +STAGE_SCALE = 0.01 # cm → m +scale_stage_prim(stage, "/World/stage", STAGE_SCALE) +``` + +### Colliders — `add_colliders` + +Recursively applies `UsdPhysics.CollisionAPI` to every `UsdGeom.Mesh` under the given prim. Imported environment USDs are usually visual-only; without this, drones fall through buildings. + +```python +stage_prim = stage.GetPrimAtPath("/World/stage") +add_colliders(stage_prim) +``` + +Skips prims that already have the API applied. 
Run it on the stage root after `scale_stage_prim` returns. + +### Lighting — `add_dome_light` + +In case the scene is missing any lights, this adds a dome light that can act like an overhead 'sun'. + +```python +add_dome_light( + stage, + prim_path="/World/DomeLight", + intensity=3500.0, + exposure=-5.0, # negative = darker; tune per scene +) +``` + +### Overhead camera — `add_orthographic_camera` + `add_overhead_camera_publisher` + +Used as a pair: one drops an orthographic camera over the scene, the other wires an OmniGraph to publish its frame plus three Float32 spec topics (`coverage_m`, `center_x_m`, `center_y_m`) the GCS uses to texture a ground plane in Foxglove's 3D panel. + +```python +cam_path = add_orthographic_camera( + stage, + prim_path="/World/MapCamera", + altitude_m=165.0, + coverage_m=225.0, + scene_scale_factor=scene_scale_factor, + center_x_m=0.0, # set if your area of interest isn't at world origin + center_y_m=0.0, +) +add_overhead_camera_publisher( + parent_graph_path="/World/MapCameraGraph", + camera_prim_path=cam_path, + topic="/sim/overhead/image", + spec_topic="/sim/overhead/spec", + center_x_topic="/sim/overhead/center_x", + center_y_topic="/sim/overhead/center_y", + frame_id="map", + coverage_m=225.0, + center_x_m=0.0, + center_y_m=0.0, + pixels_per_meter=10.0, + domain_id=0, +) +``` + +Full setup, GCS-side rendering, and tuning knobs are on the **[Overhead Camera](overhead_camera.md)** page. 
+ +### Saving a self-contained copy — `save_scene_as_contained_usd` + +For scenes you'd like to keep working with offline (no Nucleus connection), or for sharing a scene with collaborators, collect the root USD plus every referenced asset (textures, MDLs, sublayers) into a local directory: + +```python +save_scene_as_contained_usd( + source_usd_url=ENV_URL, + output_dir="/tmp/collected_scene", +) +``` + +The collected folder contains a standalone root USD with relative references — load it directly via `omniverse://localhost/...` or a local file path. Note that this collects the **source USD as-is**: scale, colliders, dome light, and any other stage edits applied in this post-load callback are *not* baked into the saved copy. To capture the live stage with your modifications, first export the in-memory stage to a USD on disk (e.g. via `stage.GetRootLayer().Export(...)`) and pass that exported path as `source_usd_url`. + +## Common issues + +| Symptom | Likely cause | Fix | +|---|---|---| +| Drone shows up at the world origin in Foxglove despite being elsewhere in sim | `set_gps_origins` not called, or called *after* spawn | Move the call before vehicle spawning | +| All drones share one GPS coordinate | `domain_id` collision in `DRONE_CONFIGS` | Give each drone a unique `domain_id` | +| Map panel centers on the wrong city | Wrong `world_origin` | Override the second arg to `set_gps_origins` | +| Drone position drifts in the wrong compass direction | Stage axis mismatch | Swap `x_m` ↔ `y_m` in `gps_utils.compute_gps_origin` | +| Robot container can't see the drone's topics | `ROS_DOMAIN_ID` ≠ `domain_id` in `DRONE_CONFIGS` | Match them, or set `NUM_ROBOTS` correctly | + +## See also + +- [Pegasus Scene Setup](pegasus_scene_setup.md) — single-drone authoring background +- [Overhead Camera](overhead_camera.md) — top-down ground texture +- [GCS Foxglove Visualization](../../gcs/foxglove.md) — how multi-robot poses render on the GCS diff --git 
a/docs/tutorials/airstack_on_osmo.md b/docs/tutorials/airstack_on_osmo.md new file mode 100644 index 000000000..e9cfa8974 --- /dev/null +++ b/docs/tutorials/airstack_on_osmo.md @@ -0,0 +1,591 @@ +# AirStack on OSMO — Recommended Remote Development Workflow + +This is AirStack's recommended day-to-day development path going forward. +You submit one OSMO workflow that spins up a GPU pod running the full +three-container AirStack stack (Isaac Sim, robot-desktop, GCS), attach VS +Code or Cursor to it over Remote-SSH, and stream Isaac Sim and the GCS +Foxglove dashboard back to your browser. + +Why this is the recommended path: + +- **Pooled GPUs.** A lab's GPUs are shared on-demand across the whole team + instead of pinned one-per-desktop. Onboarding doesn't require buying + hardware. +- **No local CUDA / Docker / driver maintenance.** Your laptop just needs + `git`, an SSH key, and an IDE. macOS, Windows, and Linux all work + identically. +- **Same image as CI and field robots.** The OSMO pod runs the exact + Docker images that the system tests and deployed robots run, so your + dev environment can't drift away from production. +- **One-command onboarding.** A new student goes from zero to "Isaac Sim + streaming into my browser" with `airstack osmo:setup` followed by + `airstack osmo:up` — no install marathon. +- **Hardware bigger than your laptop.** The pod has more CPU/RAM/GPU than + most dev laptops, even if you have a GPU laptop. + +> **Still want local development on a Linux+GPU desktop?** It works and +> can be faster for tight inner loops — see +> [Getting Started](../getting_started/index.md). It just isn't the +> recommended default anymore. + +## Who is this for? + +Anyone developing AirStack — Mac, Windows, or Linux, with or without a +local GPU. + +You're comfortable using `git` from a terminal, you have an SSH key +(`~/.ssh/id_ed25519` or similar), and you have either VS Code or Cursor +installed. That's the entire local-machine bar. 
+ +## Architecture in a sentence + +`airstack osmo:up` (which wraps `osmo workflow submit`) spins up a GPU pod +that runs sshd plus a Docker-in-Docker daemon. Inside that pod, `airstack +up` brings up the familiar three AirStack containers (Isaac Sim, +robot-desktop, GCS). Your IDE attaches over Remote-SSH; Isaac Sim and +Foxglove are reached via separate port-forwards. + +```mermaid +flowchart LR + subgraph laptop [Your laptop] + ide[VS Code or Cursor + Remote-SSH] + osmo[osmo CLI] + fox[app.foxglove.dev] + webrtc[Isaac Sim WebRTC client] + end + subgraph pod [OSMO workspace pod - GPU] + sshd[sshd] + inner[Inner dockerd] + isaac[isaac-sim container] + robot[robot-desktop container] + gcs[gcs container] + end + osmo -- submit and port-forward --> pod + ide -- ssh on 2200 --> sshd + fox -- ws on 8766 --> gcs + webrtc -- "WebRTC on 49100/tcp, 49099/udp" --> isaac + inner --> isaac + inner --> robot + inner --> gcs +``` + +## Prerequisites + +| You need | Why | +|---|---| +| A local clone of AirStack (`git clone https://github.com/castacks/AirStack.git`) | The `airstack osmo:*` wrappers, the workflow YAML, and the Foxglove extensions all live in the repo | +| The [`osmo` CLI](https://github.com/NVIDIA/OSMO) on your `PATH` | Submitting workflows and port-forwarding | +| `osmo login` done once | Stores your auth token in `~/.config/osmo` | +| An SSH keypair (e.g. `~/.ssh/id_ed25519`) | The pod authorises your pubkey at submit time. Generate one with `ssh-keygen -t ed25519` if you don't already have one. | +| **VS Code with the Remote-SSH extension** *or* **Cursor with its Remote-SSH equivalent** | Where you'll actually edit AirStack code | +| Optional: Foxglove desktop app, or just `app.foxglove.dev` | View ROS topics | +| Optional: an Omniverse Streaming Client / WebRTC browser client | View the streamed Isaac Sim render | + +You **do not** need: Docker, NVIDIA drivers, `airstack install`, `airstack +setup`, sudo, or Linux. 
+ +> **Lab admin prerequisites (someone else's job, once).** A lab admin +> pushes the `airstack-osmo-workspace` image to +> `airlab-docker.andrew.cmu.edu`. Details in +> [`osmo/README.md`](https://github.com/castacks/AirStack/blob/main/osmo/README.md). +> +> **Your job, once:** the next step. + +## Step 0 — Register your OSMO credentials (one time) + +OSMO credentials are **per-user** (each Andrew ID has its own Nucleus token, +its own AirLab Docker password, its own OSMO profile). You register them +once with the `osmo` CLI on your laptop and OSMO injects them into every +workflow you submit afterwards. They never leave your OSMO profile and your +laptop never sees the values again. + +You need three credentials: `airlab-docker-registry`, `airlab-docker-login`, +and `airlab-nucleus`. The names matter — the workflow YAML +references them by these exact names. + +Clone AirStack if you haven't already (skip the first two lines if you are +already inside your clone), then run the setup command: + +```bash +git clone https://github.com/castacks/AirStack.git +cd AirStack +./airstack.sh osmo:setup +``` + +This prompts for your Andrew ID, AirLab Docker password, and Nucleus API +token (to get one, sign in to the AirLab Nucleus server, +`airlab-nucleus.andrew.cmu.edu` → +right-click the cloud icon → **API Tokens** → Create), then registers the +three credentials with OSMO. The values go directly to your OSMO profile +— nothing is written to local disk. + +> **macOS prereq: bash 4+.** macOS ships bash 3.2 by default and the +> `airstack` CLI needs bash 4+. If you see +> `airstack.sh requires bash 4 or newer`, install a modern bash with: +> +> ```bash +> brew install bash +> ``` +> +> No further config needed — `airstack.sh` auto-detects the Homebrew bash +> at `/opt/homebrew/bin/bash` (Apple Silicon) or `/usr/local/bin/bash` +> (Intel) and re-execs under it. You don't need to change your login shell. + +### Verify + +List your credentials: + +```bash +osmo credential list +``` + +You should see all three (`airlab-docker-registry`, `airlab-docker-login`, +`airlab-nucleus`). To rotate any of them later, just re-run +`./airstack.sh osmo:setup`. + +
+Under the hood — the three raw `osmo credential set` calls + +`airstack osmo:setup` (defined in +[`.airstack/modules/osmo.sh`](https://github.com/castacks/AirStack/blob/main/.airstack/modules/osmo.sh) +as `cmd_osmo_setup`) is equivalent to running these three commands by hand +— useful for debugging or rotating one credential at a time. Replace the +upper-case `YOUR_…` placeholders with your own values: + +```bash +# 1. AirLab Docker registry (REGISTRY) — for OSMO's outer image-pull of +# airlab-docker.andrew.cmu.edu/airstack/airstack-osmo-workspace +osmo credential set airlab-docker-registry \ + --type REGISTRY \ + --payload registry=airlab-docker.andrew.cmu.edu \ + username=YOUR_ANDREW_ID \ + auth='YOUR_AIRLAB_DOCKER_PASSWORD' + +# 2. AirLab Docker login (GENERIC) — for the *inner* dockerd inside the +# pod to `docker login` and pull the AirStack image set +osmo credential set airlab-docker-login \ + --type GENERIC \ + --payload username=YOUR_ANDREW_ID \ + password='YOUR_AIRLAB_DOCKER_PASSWORD' + +# 3. AirLab Nucleus (GENERIC) — for Isaac Sim to authenticate against +# omniverse://airlab-nucleus.andrew.cmu.edu (API token, NOT password) +osmo credential set airlab-nucleus \ + --type GENERIC \ + --payload omni_user=YOUR_ANDREW_ID \ + omni_pass='YOUR_NUCLEUS_API_TOKEN' \ + omni_server=omniverse://airlab-nucleus.andrew.cmu.edu/NVIDIA/Assets/Isaac/5.1 +``` + +
+ +> **Why three credentials?** It's tempting to consolidate. The reason for +> the split: OSMO REGISTRY credentials drive Kubernetes `imagePullSecrets` +> (auto-attached, never exposed as env vars), while GENERIC credentials are +> what get injected as env vars inside the running container. The pod +> needs **both** kinds of access — outer pull of the workspace image, plus +> inner login from the inner dockerd to pull AirStack images. + +## Step 1 — Add an SSH config entry (one time) + +VS Code and Cursor's Remote-SSH "Connect to Host…" picker reads +`~/.ssh/config`. Add this block once and the host shows up by name forever: + +```bash +cat >> ~/.ssh/config <<'EOF' + +Host airstack-osmo + HostName localhost + Port 2200 + User root + # Every OSMO workflow boots a fresh pod with a fresh sshd host key, so + # any saved fingerprint for [localhost]:2200 will be wrong on the next + # `airstack osmo:up`. Skip the host-key check here: this alias only + # connects via the local port-forward, so the security boundary is + # OSMO's authenticated control-plane tunnel — not the SSH fingerprint. + # /dev/null keeps known_hosts clean (no stale entries pile up); LogLevel + # ERROR silences the "Permanently added [localhost]:2200" banner. + StrictHostKeyChecking no + UserKnownHostsFile /dev/null + LogLevel ERROR + # SSH agent forwarding so `git push` from inside the pod uses your + # local laptop's SSH key (the pod's sshd has AllowAgentForwarding yes + # baked in by osmo/workspace/sshd_config). Without this, the pod has + # no key to push to github.com with — its ~/.ssh/ only holds the + # authorized_keys file for inbound connections. + ForwardAgent yes + # macOS Keychain integration — first push from the pod auto-loads + # your key into the local ssh-agent and unlocks it via the system + # keychain (no passphrase prompts). Harmless on Linux: those clients + # ignore the option. AddKeysToAgent works on both OSes. 
+ AddKeysToAgent yes + # IgnoreUnknown must appear before UseKeychain: OpenSSH builds without + # Keychain support (e.g. stock Linux) otherwise abort with + # "Bad configuration option: usekeychain" instead of skipping it. + IgnoreUnknown UseKeychain + UseKeychain yes +EOF +``` + +The `localhost:2200` is what we'll port-forward to in Step 4. + +> **Already added the old block?** If your `~/.ssh/config` still has +> `StrictHostKeyChecking accept-new` for `airstack-osmo` from an earlier +> setup, replace that line with the three host-key lines from the block above +> (`StrictHostKeyChecking no`, `UserKnownHostsFile /dev/null`, +> `LogLevel ERROR`). As a one-time cleanup of +> the stale fingerprint left behind by previous pods, also run: +> +> ```bash +> ssh-keygen -R "[localhost]:2200" +> ``` +> +> `airstack osmo:ide` does this scrub for you on every run, so you only +> need it once when migrating. + +> **Smoke-test the agent forward** once the pod is up: SSH in and run +> `ssh-add -l` — you should see your local key listed. If you see "The +> agent has no identities", run `ssh-add ~/.ssh/id_ed25519` on your +> laptop and reconnect. + +## Step 2 — Submit the workflow + +From the AirStack clone: + +```bash +./airstack.sh osmo:up --pool airstack +``` + +This submits +[`osmo/workflows/airstack-dev.yaml`](https://github.com/castacks/AirStack/blob/main/osmo/workflows/airstack-dev.yaml) +with two things injected: + +- your local SSH pubkey as `SSH_PUB_KEY` — that's what authorises + **your** key on **this** workflow (each student passes their own at + submit time; the lab admin doesn't manage a global `authorized_keys` + file). +- `AIRSTACK_BRANCH` set to your local repo's current branch — the pod + never sees your laptop's working tree (the pod is ephemeral and runs in a + different machine room); instead it clones AirStack fresh from GitHub on + every workflow start, so this is how it knows which branch to use. + Override with `--branch main` if you want the pod to track main even + while you're on a feature branch. + +> **The pod clones from GitHub, not your laptop.** Local edits (and +> commits you haven't pushed) won't make it into the pod. `airstack +> osmo:up` warns you up-front if your branch is ahead of origin or has +> uncommitted changes — `git push` first if you want the pod to pick +> them up. 
+ +`airstack osmo:up` prints a workflow id like `airstack-dev-1` and stores +it in `~/.airstack/osmo-state`, so the rest of the `airstack osmo:*` +commands in this tutorial pick it up automatically — no `export WF=...` +needed. To target a different workflow, set `AIRSTACK_OSMO_WF` to that +workflow's id (e.g. `AIRSTACK_OSMO_WF=airstack-dev-1`) — prefix a single +command with it for a one-off, or `export` it for the whole shell session. + +
+Under the hood — raw `osmo workflow submit` + +`airstack osmo:up` (defined in +[`.airstack/modules/osmo.sh`](https://github.com/castacks/AirStack/blob/main/.airstack/modules/osmo.sh) +as `cmd_osmo_up`) is roughly equivalent to: + +```bash +osmo workflow submit osmo/workflows/airstack-dev.yaml \ + --pool airstack \ + --set-env "SSH_PUB_KEY=$(cat ~/.ssh/id_ed25519.pub)" +``` + +The raw command above injects only `SSH_PUB_KEY`; the wrapper additionally +injects `AIRSTACK_BRANCH`. If you use the raw form, save the printed +workflow id as `$WF` (e.g. `export WF=airstack-dev-1`) and, for the rest of +the tutorial, run the raw `osmo workflow ...` commands shown in each "Under +the hood" section in place of the `airstack osmo:*` wrappers. + +
+ +## Step 3 — Wait for the stack to come up + +Tail the lead task's logs and watch for milestones: + +```bash +./airstack.sh osmo:logs +``` + +Expected milestones, in order (each is one line in the log): + +1. `[entrypoint] sshd listening on :22` — VS Code/Cursor can attach. +2. `[entrypoint] dockerd ready` — the inner Docker daemon is up. +3. `Successfully built airstack_isaac-sim` *(or `Pulled` if pre-built)* — + the image set is in place. +4. `isaac-sim-livestream ... started` +5. `airstack-robot-desktop-1 ... started` +6. `airstack-gcs-1 ... started` + +As soon as milestone 1 appears, you can attach the IDE (Step 4) while the +rest is still spinning up — the bring-up will continue in the background. + +
+Under the hood — raw `osmo workflow logs` + +`airstack osmo:logs` (defined in +[`.airstack/modules/osmo.sh`](https://github.com/castacks/AirStack/blob/main/.airstack/modules/osmo.sh) +as `cmd_osmo_logs`) just exec's: + +```bash +osmo workflow logs $WF -t workspace -n 500 +``` + +The `osmo` CLI's `workflow logs` command prints the last N lines and then +keeps the stream open as new lines arrive (it already behaves like `tail +-f`, even though `--help` only documents `-n LAST_N_LINES`). Ctrl+C to +stop. Override the task / tail length with `OSMO_LOGS_TASK` / +`OSMO_LOGS_TAIL` env vars. + +
+ +## Step 4 — Forward sshd and attach the IDE + +In one terminal, run: + +```bash +./airstack.sh osmo:ide +``` + +This (a) starts the `localhost:2200 → pod:22` port-forward with a 24h +connect-timeout (matching the workflow's `exec_timeout`), waits for the +tunnel to come up, then (b) launches Cursor or VS Code (whichever it +finds on `PATH`) pre-attached to +`vscode-remote://ssh-remote+airstack-osmo/root/AirStack`. **Leave the +terminal running** for the length of your session — closing it tears the +tunnel down. + +The IDE installs its remote server in the pod on first connect (~50 MB, +slower on a fresh pod, cached on subsequent connects). Then: + +1. The IDE should open `/root/AirStack` automatically. (If not: + **Open Folder…** → `/root/AirStack`.) +2. Open the integrated terminal — you're root in `/root/AirStack`. +3. Edit code in the IDE; the changes land directly on the pod's disk. + +Verify everything is wired up by running: + +```bash +docker ps +``` + +You should see four containers: `airstack-isaac-sim-livestream-1`, +`airstack-robot-desktop-1`, `airstack-gcs-1`, plus the AirStack CLI helper. + +
+Under the hood — raw port-forward + manual IDE attach + +`airstack osmo:ide` (defined in +[`.airstack/modules/osmo.sh`](https://github.com/castacks/AirStack/blob/main/.airstack/modules/osmo.sh) +as `cmd_osmo_ide`) is equivalent to running the port-forward by hand: + +```bash +osmo workflow port-forward $WF workspace --port 2200:22 --connect-timeout 86400 +``` + +…then in the editor: + +- **VS Code:** Command Palette → **Remote-SSH: Connect to Host…** → pick + `airstack-osmo`. +- **Cursor:** the same flow under its remote-development menu. + +Add `--no-open` to `airstack osmo:ide` to only run the port-forward and +attach the IDE manually. + +
+ +## Step 5 — Pick a feature branch and start working + +The pod cloned the branch it was submitted with (`AIRSTACK_BRANCH` — your +local branch at submit time, or whatever you passed to `--branch`) into +`/root/AirStack` on startup. Treat it like any +git working tree: + +```bash +git checkout -b my-feature +# edit code in the IDE... +bws --packages-select YOUR_PACKAGE # build inside the robot-desktop container per AGENTS.md +``` + +Standard ROS 2 commands work from the integrated terminal: + +```bash +docker exec airstack-robot-desktop-1 bash -c "ros2 node list" +docker exec airstack-robot-desktop-1 bash -c "ros2 topic hz /robot_1/odometry" +``` + +This is the same `docker exec` pattern documented in +[AGENTS.md](https://github.com/castacks/AirStack/blob/main/AGENTS.md) — the +fact that you're on a remote pod is invisible from inside the IDE. + +## Step 6 — View Isaac Sim (WebRTC livestream) + +Isaac Sim runs headless inside the pod with the Kit +`omni.kit.livestream.webrtc` extension enabled (configured by the +`isaac-sim-livestream` Compose profile). To view it locally: + +```bash +./airstack.sh osmo:webrtc +``` + +This spawns the UDP port-forward (media, `49099`) in the background and +runs the TCP port-forward (signaling, `49100`) in the foreground — leave +that terminal running. + +Then point the **Omniverse Streaming Client** (or a WebRTC-capable browser +client) at `http://localhost`. The simulation viewport shows up the same +way it would on a local Linux desktop. + +
+Under the hood — raw TCP + UDP port-forwards + +`airstack osmo:webrtc` (defined in +[`.airstack/modules/osmo.sh`](https://github.com/castacks/AirStack/blob/main/.airstack/modules/osmo.sh) +as `cmd_osmo_webrtc`) is equivalent to running the two raw port-forwards +in separate terminals — Kit's WebRTC needs both TCP signaling and UDP +SRTP media, and the AirStack workflow pins both to single ports rather +than scanning the Kit default range: + +```bash +# Terminal A — TCP signaling (49100): +osmo workflow port-forward $WF workspace --port 49100 --connect-timeout 86400 + +# Terminal B — UDP media (49099, pinned by the Pegasus launch script): +osmo workflow port-forward $WF workspace --port 49099 --udp --connect-timeout 86400 +``` + +
+ +## Step 7 — View ROS topics in Foxglove + +The GCS container runs `foxglove_bridge` on container-port `8765`, +published as host-port `8766` on the workspace pod. To install the +AirStack Foxglove extensions locally and forward the websocket in one +step: + +```bash +./airstack.sh osmo:foxglove +``` + +This copies the AirStack Foxglove extensions (Robot Tasks, Waypoint +Editor, Polygon Editor) into your local Foxglove Desktop user-extensions +dir (default `~/.foxglove-studio/extensions`; override with +`OSMO_FOXGLOVE_EXT_DIR`, skip with `OSMO_FOXGLOVE_SKIP_EXTENSIONS=1` for +`app.foxglove.dev` which doesn't load local extensions), then runs the +`localhost:8766 → pod:8766` port-forward in the foreground — leave the +terminal running. + +Then in [https://app.foxglove.dev](https://app.foxglove.dev) (or Foxglove +Desktop): + +1. **Open connection** → `ws://localhost:8766`. +2. **Layouts** → **Import from file** → + [`gcs/foxglove_extensions/airstack_default.json`](https://github.com/castacks/AirStack/blob/main/gcs/foxglove_extensions/airstack_default.json) + from your AirStack clone. +3. Pick the imported layout from the layout dropdown in the top-right. + +The full Foxglove flow — layout import, panel customisation, DDS bridge +naming — is documented at +[Foxglove Visualization](../gcs/foxglove.md). The only OSMO-specific +difference is the `osmo:foxglove` line in front of it. + +
+Under the hood — raw `osmo workflow port-forward` + +`airstack osmo:foxglove` (defined in +[`.airstack/modules/osmo.sh`](https://github.com/castacks/AirStack/blob/main/.airstack/modules/osmo.sh) +as `cmd_osmo_foxglove`) wraps the extension install plus: + +```bash +osmo workflow port-forward $WF workspace --port 8766:8766 --connect-timeout 86400 +``` + +Set `OSMO_FOXGLOVE_SKIP_EXTENSIONS=1` to only run the port-forward. + +
+ +## Step 8 — Commit and push from inside the IDE + +The pod's filesystem is **ephemeral**. The persistence boundary is git, not +disk. Commit and push every meaningful chunk of work — either from the +Source Control panel, or in the integrated terminal: + +```bash +git add -A +git commit -m "WIP: feature X" +git push -u origin my-feature +``` + +Once your branch is on the remote, you can pull it from anywhere — your +laptop, a fresh pod tomorrow, a colleague's machine. + +> **Configuring git auth in the pod.** The pod is yours for the session. +> Inside the IDE's integrated terminal, set `git config user.name`, +> `user.email`, and configure your push auth: either HTTPS + a GitHub PAT, or +> your laptop's SSH key forwarded through the SSH agent (`ForwardAgent yes` +> in the Step 1 SSH config, accepted by the pod's sshd via +> `AllowAgentForwarding yes`). The +> `airstack-osmo-workspace` image deliberately does not bake any one +> student's git creds. + +## Step 9 — Tearing down + +When you're done: + +```bash +./airstack.sh osmo:down +``` + +This prints a 5-second warning then cancels the workflow stored in +`~/.airstack/osmo-state`. Hit Ctrl+C in the grace window if you ran it +by accident. + +> **Push first.** Anything that's still in your working tree, in `.git/` +> but not pushed, in `build/`, in `bags/`, or in `/root/` outside the repo +> **will be lost** on cancel. The pod is cattle, not a pet. If you forget and need +> something pulled out, see "I forgot to push before tearing down" below +> *before* hitting cancel. + +
+Under the hood — raw `osmo workflow cancel` + +`airstack osmo:down` (defined in +[`.airstack/modules/osmo.sh`](https://github.com/castacks/AirStack/blob/main/.airstack/modules/osmo.sh) +as `cmd_osmo_down`) is equivalent to: + +```bash +osmo workflow cancel $WF +``` + +
+ +## Troubleshooting + +| Symptom | Likely cause | Fix | +|---|---|---| +| `Remote-SSH: Connection refused` after a working session | Port-forward died (laptop slept, network blip) | Re-run `./airstack.sh osmo:ide` | +| `Permission denied (publickey)` on Remote-SSH | The pod authorised a different pubkey than the one your local SSH client is offering | Confirm `cat ~/.ssh/id_ed25519.pub` matches the key that was injected at submit time. Re-submit with `./airstack.sh osmo:down && ./airstack.sh osmo:up --pool airstack`. | +| `airstack osmo:logs` shows `ERROR: SSH_PUB_KEY not set` | The submit didn't inject a pubkey (e.g. you ran raw `osmo workflow submit` without `--set-env`) | `./airstack.sh osmo:down`, then resubmit with `./airstack.sh osmo:up --pool airstack` (it injects `SSH_PUB_KEY` automatically). | +| `docker pull` fails inside the pod with `unauthorized` | Your `airlab-docker-login` credential is missing or has the wrong Andrew ID/password | Re-run `./airstack.sh osmo:setup`. | +| Logs show `WARN: airlab-nucleus OSMO credential not set` and Isaac Sim asset loads fail, **or** Isaac Sim shows "Login Required: Unable to connect server omniverse://airlab-nucleus..." with the auth-service log showing `InternalCredentials.auth … 'username': '' … status: 'DENIED'` (no `Tokens.auth_with_api_token` call) | The pod is doing **password auth** instead of **API-token auth**. | Inside the pod, `simulation/isaac-sim/docker/omni_pass.env` must have `OMNI_USER=$$omni-api-token` (literal `$$`, the sentinel for API-token auth — docker-compose v2 collapses `$$` to `$` on its way to the container). The OSMO entrypoint sets this automatically when `OMNI_PASS` looks like a JWT; if the file has any other `OMNI_USER` value (e.g. your Andrew ID), correct it and recreate the container with `docker compose --profile desktop --profile isaac-sim-livestream up -d isaac-sim-livestream` (`restart` does NOT re-read `env_file`). 
| +| Logs show `WARN: airlab-nucleus OSMO credential not set` and Isaac Sim asset loads fail, **or** Isaac Sim shows "Login Required: Unable to connect server omniverse://airlab-nucleus..." with the auth-service log showing `Tokens.auth_with_api_token … status: 'DENIED'` | Your `airlab-nucleus` API token is missing, expired, or revoked (rotation invalidates the predecessor). Confirm by SSH'ing the Nucleus host and running `sudo docker logs --tail 200 base_stack-nucleus-auth-1`. | Regenerate the token on the AirLab Nucleus server (right-click the cloud icon → **API Tokens** → Create), then `./airstack.sh osmo:setup` and `./airstack.sh osmo:down && ./airstack.sh osmo:up --pool airstack` to resubmit (or live-edit `simulation/isaac-sim/docker/omni_pass.env` in the pod and recreate the `isaac-sim-livestream` container — see row above). | +| Isaac Sim container restarts repeatedly | GPU not visible to the inner Docker daemon (toolkit not configured on the node) | Lab admin task. From inside the pod: `docker info \| grep -i runtime` should list `nvidia`. | +| Isaac Sim is up but the WebRTC stream is blank | The Pegasus script isn't getting `--/app/livestream/enabled=true`, or the wrong Compose profile is active | In the integrated terminal: `docker logs airstack-isaac-sim-livestream-1`. Confirm `ISAAC_SIM_LIVESTREAM=true` and that the `isaac-sim-livestream` profile is the one running (`docker ps`). | +| Foxglove "no connection" | Port-forward died, GCS container hasn't started yet, or browser is caching an old connection | Re-run `./airstack.sh osmo:foxglove`; check `docker ps` shows `airstack-gcs-1` Up; try `ws://127.0.0.1:8766` instead of `ws://localhost:8766`. | +| First Remote-SSH connect takes forever | VS Code / Cursor downloading its remote server (~50 MB) into the fresh pod | Wait it out the first time. Subsequent connects to the same pod hit the cache. | +| **I forgot to push before tearing down** | The pod is still up; cancel hasn't fired yet | Don't run `./airstack.sh osmo:down`. 
SSH in via the existing port-forward (`./airstack.sh osmo:ide --no-open` if the tunnel is gone), push from the IDE terminal, *then* tear down. If the workflow has already terminated and the pod is gone, the work is gone — git is the only persistence layer. | + +## What survives `airstack osmo:down`? + +| Artifact | Lives in | Survives? | +|---|---|---| +| Code committed and pushed to a feature branch | GitHub | **Yes** | +| Code committed but not pushed | Pod-local `.git` | **No** | +| Uncommitted edits in the IDE | Pod-local working tree | **No** | +| `colcon build` outputs (`build/`, `install/`, `log/`) | `/root/AirStack/**/ros_ws/...` | **No** (gitignored Linux x86_64 binaries; rebuild trivially) | +| Inner-dockerd image cache | Pod-local Docker layer cache | **No** | +| Bag files, sim recordings, debug screenshots | `/root/AirStack/bags/`, etc. | **No** — pull selectively via `osmo workflow rsync download "$(cat ~/.airstack/osmo-state)" :` *before* tearing down | + +The rule of thumb: **commit + push every time you'd save a file in a +git-tracked sense.** The Source Control panel is the persistence boundary. + +## See also + +- [`osmo/README.md`](https://github.com/castacks/AirStack/blob/main/osmo/README.md) + — lab-admin reference (pool prerequisites, OSMO credential registration, + workspace image build, validation stages). +- [Foxglove Visualization](../gcs/foxglove.md) — full layout import + + panel-customisation flow once your `airstack osmo:foxglove` is up. +- [AGENTS.md](https://github.com/castacks/AirStack/blob/main/AGENTS.md) — + inside-the-pod workflow once you're attached: `bws`, `sws`, `docker exec`, + ROS 2 commands. +- [Getting Started](../getting_started/index.md) — the local-Linux-GPU + alternative. diff --git a/docs/tutorials/index.md b/docs/tutorials/index.md index d1606f4e3..08dbbdaee 100644 --- a/docs/tutorials/index.md +++ b/docs/tutorials/index.md @@ -5,6 +5,7 @@ Step-by-step guides for common AirStack workflows. 
If you are new, start with ** | Tutorial | Description | |---|---| | [Getting Started](../getting_started.md) | Install AirStack, pull Docker images, launch a simulated robot, and fly it for the first time. | +| [AirStack on OSMO (Mac/Windows OK)](airstack_on_osmo.md) | Develop on AirStack from a Mac, Windows, or no-GPU Linux laptop using NVIDIA OSMO + VS Code/Cursor Remote-SSH. No local Docker or local `airstack install`; use a local repo clone for the `airstack osmo:*` wrappers and workflow YAML. | | [Multi-Robot Simulation](multi_robot_simulation.md) | Spin up multiple simulated robots in Isaac Sim and verify independent ROS 2 namespaces. | | [Autonomy Modes](autonomy_modes.md) | Understand `onboard_all`, `onboard_local`, and `offboard_global` modes and the commands to run each. | | [Deploying to Hardware](deploying_to_hardware.md) | Flash a Jetson or VOXL device, configure the robot hostname, and run the autonomy stack on a real drone. | diff --git a/gcs/docker/gcs-base-docker-compose.yaml b/gcs/docker/gcs-base-docker-compose.yaml index c1ec4b3fc..c8ac5200e 100644 --- a/gcs/docker/gcs-base-docker-compose.yaml +++ b/gcs/docker/gcs-base-docker-compose.yaml @@ -6,10 +6,14 @@ services: context: ../ dockerfile: docker/Dockerfile.gcs tags: - - ${PROJECT_DOCKER_REGISTRY}/${PROJECT_NAME}:v${VERSION}_gcs + - &gcs_image ${PROJECT_DOCKER_REGISTRY}/${PROJECT_NAME}:v${VERSION}_gcs + cache_from: + - *gcs_image command: > bash -c " ssh service restart; + python3 /root/AirStack/gcs/foxglove_extensions/install.py; + python3 /root/AirStack/gcs/foxglove_extensions/render_layout.py; tmux new -d -s bringup; if [ $$AUTOLAUNCH = 'true' ]; then tmux send-keys -t bringup:0.0 'bws && sws; ros2 launch desktop_bringup gcs.launch.xml' ENTER; @@ -30,6 +34,8 @@ services: - DISPLAY=${DISPLAY} - QT_X11_NO_MITSHM=1 - NVIDIA_DRIVER_CAPABILITIES=all + # Number of robots (used by action_relay to spawn per-robot relays) + - NUM_ROBOTS=${NUM_ROBOTS:-1} # Record bags - RECORD_BAGS=${RECORD_BAGS} 
image: ${PROJECT_DOCKER_REGISTRY}/${PROJECT_NAME}:v${VERSION}_gcs @@ -50,6 +56,10 @@ services: - ../../.devcontainer/gcs/launch.json:/root/AirStack/.vscode/launch.json:rw - ../../.devcontainer/gcs/tasks.json:/root/AirStack/.vscode/tasks.json:rw - ./Foxglove:/root/.config/Foxglove:rw + # waypoint/polygon editor saves (so they survive container restarts) + - ../saves:/root/.airstack:rw + # foxglove extensions + - ../foxglove_extensions:/root/AirStack/gcs/foxglove_extensions:rw # autonomy stack stuff - ../../common/ros_packages:/root/AirStack/gcs/ros_ws/src/common:rw - ../../common/fastdds.xml:/root/AirStack/gcs/ros_ws/src/fastdds.xml diff --git a/gcs/foxglove_extensions/airstack_default.json b/gcs/foxglove_extensions/airstack_default.json new file mode 100644 index 000000000..0601abb66 --- /dev/null +++ b/gcs/foxglove_extensions/airstack_default.json @@ -0,0 +1,566 @@ +{ + "configById": { + "Image!2cny33f": { + "cameraState": { + "distance": 20, + "perspective": true, + "phi": 60, + "target": [ + 0, + 0, + 0 + ], + "targetOffset": [ + 0, + 0, + 0 + ], + "targetOrientation": [ + 0, + 0, + 0, + 1 + ], + "thetaOffset": 60, + "fovy": 45, + "near": 0.5, + "far": 5000 + }, + "followMode": "follow-pose", + "scene": {}, + "transforms": {}, + "topics": {}, + "layers": {}, + "publish": { + "type": "point", + "poseTopic": "/move_base_simple/goal", + "pointTopic": "/clicked_point", + "poseEstimateTopic": "/initialpose", + "poseEstimateXDeviation": 0.5, + "poseEstimateYDeviation": 0.5, + "poseEstimateThetaDeviation": 0.26179939 + }, + "synchronize": false, + "imageMode": { + "imageTopic": "/robot_1/sensors/front_stereo/left/image_rect", + "calibrationTopic": "/robot_1/sensors/front_stereo/left/camera_info" + } + }, + "3D!1devslg": { + "cameraState": { + "perspective": true, + "distance": 120.42143843441657, + "phi": 64.70051812782563, + "thetaOffset": 176.95010009206052, + "targetOffset": [ + 0.41991308990838777, + -1.4979048639096184, + -0.186377634467115 + ], + "target": [ + 0, 
+ 0, + 0 + ], + "targetOrientation": [ + 0, + 0, + 0, + 1 + ], + "fovy": 45, + "near": 0.5, + "far": 5000 + }, + "followMode": "follow-none", + "scene": { + "transforms": { + "visible": true, + "editable": true, + "showLabel": true, + "enablePreloading": true, + "offsetReferenceFrame": "fixed-frame" + } + }, + "transforms": { + "frame:map": { + "visible": true + }, + "frame:enu_origin": { + "visible": true + }, + "frame:camera_left": { + "visible": false + }, + "frame:base_link": { + "visible": true + }, + "frame:base_link_body_body_link": { + "visible": false + }, + "frame:base_link_ZED_X": { + "visible": false + }, + "frame:camera_right": { + "visible": false + }, + "frame:imu": { + "visible": false + }, + "frame:OS1_REV6_128_10hz___512_resolution": { + "visible": false + }, + "frame:lidar": { + "visible": false + }, + "frame:rotor0": { + "visible": false + }, + "frame:rotor1": { + "visible": false + }, + "frame:rotor2": { + "visible": false + }, + "frame:rotor3": { + "visible": false + }, + "frame:base_link_frd": { + "visible": false + }, + "frame:map_ned": { + "visible": false + }, + "frame:world": { + "visible": false + }, + "frame:robot_1/base_link": { + "visible": true + }, + "frame:robot_1/robot_pose": { + "visible": true + } + }, + "topics": { + "/gcs/robot_markers": { + "visible": true, + "showOutlines": false, + "namespaces": { + "robot_labels": { + "visible": true + }, + "robot_positions": { + "visible": false + }, + "robot_meshes": { + "visible": true + }, + "robot_1_axes": { + "visible": true + }, + "robot_1_traj": { + "visible": true + }, + "robot_1_vdb": { + "visible": true + }, + "robot_1_global_plan": { + "visible": true + } + } + }, + "/initialpose": { + "visible": false + }, + "/move_base_simple/goal": { + "visible": false + }, + "/gcs/payload_markers": { + "visible": true, + "namespaces": { + "robot_1_filtered_rays": { + "visible": true + } + }, + "showOutlines": true + }, + "/gcs/waypoints/path": { + "visible": true, + "gradient": [ + 
"#6bffbaff", + "#6bff95ff" + ], + "lineWidth": 0.2, + "type": "axis", + "axisScale": 0.8 + }, + "/gcs/polygon/markers": { + "visible": true, + "color": "#fc0000ff" + }, + "/gcs/waypoints/save_markers": { + "visible": true + }, + "/gcs/polygon/save_markers": { + "visible": true + }, + "/gcs/sim_ground": { + "visible": true, + "showOutlines": true + }, + "/gcs/waypoints/markers": { + "visible": true + }, + "/robot_1/trajectory_controller/trajectory_vis": { + "visible": false + }, + "/robot_1/vdb_mapping/vdb_map_visualization": { + "visible": false + }, + "/robot_1/perception/stereo_image_proc/point_cloud": { + "visible": false + }, + "/robot_1/sensors/front_stereo/left/image_rect": { + "visible": false + }, + "/robot_1/sensors/front_stereo/right/image_rect": { + "visible": false + }, + "/robot_1/global_plan": { + "visible": false + }, + "/gcs/robot_1/frontier_viewpoints": { + "visible": true, + "pointSize": 23.19, + "pointShape": "circle", + "stixelsEnabled": false, + "colorField": "z", + "colorMode": "flat", + "colorMap": "turbo", + "cubeSize": 0.455, + "flatColor": "#ffffffff" + }, + "/gcs/payload/robot_1/filtered_rays": { + "visible": true + }, + "/gcs/payload/robot_1/frontier_viewpoints": { + "visible": true, + "colorField": "z", + "colorMode": "flat", + "colorMap": "turbo", + "pointSize": 13.67, + "flatColor": "#ff0000ff", + "pointShape": "circle" + }, + "/gcs/payload/robot_1/rgb_voxels": { + "visible": true, + "colorField": "rgb", + "colorMode": "rgb", + "colorMap": "turbo", + "pointShape": "cube", + "cubeSize": 0.5 + }, + "/gcs/payload/robot_1/voxel_rgb": { + "visible": true, + "showOutlines": true, + "colorField": "rgb", + "colorMode": "rgb", + "colorMap": "turbo", + "pointSize": 10 + }, + "/robot_1/filtered_rays": { + "visible": false + }, + "/gcs/payload/robot_1/raw_frontiers": { + "visible": false, + "colorField": "z", + "colorMode": "flat", + "colorMap": "turbo", + "pointSize": 9, + "flatColor": "#fd0000ff" + } + }, + "layers": { + "grid": { + "visible": 
false, + "drawBehind": false, + "label": "Grid", + "instanceId": "274158a6-8bbc-401d-8a37-c80da88ac0bc", + "layerId": "foxglove.Grid", + "size": 10, + "divisions": 10, + "lineWidth": 1, + "color": "#248eff", + "position": [ + 0, + 0, + 0 + ], + "rotation": [ + 0, + 0, + 0 + ] + }, + "3dd9f120-6d9b-43cd-a7c7-d1ff6e1263d8": { + "visible": false, + "serverConfig": "satellite", + "instanceId": "3dd9f120-6d9b-43cd-a7c7-d1ff6e1263d8", + "layerId": "foxglove.TiledMap", + "opacity": 1, + "drawBehind": false, + "order": 1, + "zPosition": -1 + } + }, + "publish": { + "type": "point", + "poseTopic": "/move_base_simple/goal", + "pointTopic": "/clicked_point", + "poseEstimateTopic": "/initialpose", + "poseEstimateXDeviation": 0.5, + "poseEstimateYDeviation": 0.5, + "poseEstimateThetaDeviation": 0.26179939 + }, + "imageMode": {}, + "synchronize": false, + "followTf": "map", + "locationFixTopic": "/gcs/map_origin/location" + }, + "Tab!3utzhs8": { + "activeTabIdx": 0, + "tabs": [ + { + "title": "robot 1", + "layout": { + "first": { + "first": "robot-commands.Robot Tasks!2mqrmqp_r1", + "second": "Image!35kacl0_r1", + "direction": "row", + "splitPercentage": 50.27943253700179 + }, + "second": "Image!2lvzh2s_r1", + "direction": "row", + "splitPercentage": 67.05514145698636 + } + } + ] + }, + "Image!35kacl0_r1": { + "cameraState": { + "distance": 20, + "perspective": true, + "phi": 60, + "target": [ + 0, + 0, + 0 + ], + "targetOffset": [ + 0, + 0, + 0 + ], + "targetOrientation": [ + 0, + 0, + 0, + 1 + ], + "thetaOffset": 60, + "fovy": 45, + "near": 0.5, + "far": 5000 + }, + "followMode": "follow-pose", + "scene": {}, + "transforms": {}, + "topics": {}, + "layers": {}, + "publish": { + "type": "point", + "poseTopic": "/move_base_simple/goal", + "pointTopic": "/clicked_point", + "poseEstimateTopic": "/initialpose", + "poseEstimateXDeviation": 0.5, + "poseEstimateYDeviation": 0.5, + "poseEstimateThetaDeviation": 0.26179939 + }, + "imageMode": { + "imageTopic": 
"/robot_1/sensors/front_stereo/right/image_rect", + "rectifyImage": true, + "flipHorizontal": false, + "rotation": 0, + "calibrationTopic": "/robot_1/sensors/front_stereo/right/camera_info" + }, + "synchronize": false + }, + "robot-commands.Robot Tasks!2mqrmqp_r1": { + "robot": "robot_1", + "activeTab": "takeoff", + "takeoff": { + "target_altitude_m": 10, + "velocity_m_s": 1 + }, + "fixed_trajectory": { + "type": "Figure8", + "attributes": [ + [ + "frame_id", + "base_link" + ], + [ + "length", + "4" + ], + [ + "width", + "2" + ], + [ + "height", + "0.0" + ], + [ + "velocity", + "2.0" + ], + [ + "max_acceleration", + "1.0" + ] + ], + "loop": false + }, + "land": { + "velocity_m_s": 0.3 + }, + "navigate": { + "frame_id": "map", + "waypoints": "[[-7.59,5.71,10],[-14.63,-1.25,10]]", + "goal_tolerance_m": 1.5, + "use_editor": true, + "_waypoints_open": true + }, + "exploration": { + "search_bounds": "[[29.75,-134.55,0],[18.06,-49.89,0],[-50.63,-69.34,0],[-33.74,-124.81,0]]", + "min_altitude_agl": 8, + "max_altitude_agl": 16, + "min_flight_speed": 2.5, + "max_flight_speed": 4, + "time_limit_sec": 180, + "use_editor": true, + "_search_bounds_open": true + }, + "coverage": { + "coverage_area": "[]", + "min_altitude_agl": 10, + "max_altitude_agl": 16, + "min_flight_speed": 1, + "max_flight_speed": 3, + "line_spacing_m": 5, + "heading_deg": 0, + "use_editor": true + }, + "object_search": { + "object_class": "", + "search_area": "[[15.32,20.76,0],[15.25,-0.93,0],[31,-1.23,0],[31.29,19.46,0]]", + "min_altitude_agl": 3, + "max_altitude_agl": 10, + "min_flight_speed": 1, + "max_flight_speed": 3, + "time_limit_sec": 120, + "target_count": 1, + "use_editor": true, + "_search_area_open": true + }, + "object_counting": { + "object_class": "", + "count_area": "[[15.32,20.76,0],[15.25,-0.93,0],[31,-1.23,0],[31.29,19.46,0]]", + "min_altitude_agl": 3, + "max_altitude_agl": 10, + "min_flight_speed": 1, + "max_flight_speed": 3, + "use_editor": true, + "_count_area_open": true + }, + 
"semantic_search": { + "query": "radio tower, water tower", + "background_queries": "sky, ground, grass, road", + "search_area": "[[100.84,99.33,0],[100.25,-169.11,0],[-105.63,-203.46,0],[-99,99.07,0]]", + "min_altitude_agl": 3, + "max_altitude_agl": 15, + "min_flight_speed": 0.2, + "max_flight_speed": 0.5, + "confidence_threshold": 0.95, + "use_editor": true, + "_search_area_open": true + } + }, + "Image!2lvzh2s_r1": { + "cameraState": { + "distance": 20, + "perspective": true, + "phi": 60, + "target": [ + 0, + 0, + 0 + ], + "targetOffset": [ + 0, + 0, + 0 + ], + "targetOrientation": [ + 0, + 0, + 0, + 1 + ], + "thetaOffset": 60, + "fovy": 45, + "near": 0.5, + "far": 5000 + }, + "followMode": "follow-pose", + "scene": {}, + "transforms": {}, + "topics": {}, + "layers": {}, + "publish": { + "type": "point", + "poseTopic": "/move_base_simple/goal", + "pointTopic": "/clicked_point", + "poseEstimateTopic": "/initialpose", + "poseEstimateXDeviation": 0.5, + "poseEstimateYDeviation": 0.5, + "poseEstimateThetaDeviation": 0.26179939 + }, + "synchronize": false, + "imageMode": { + "imageTopic": "/robot_1/sensors/front_stereo/right/depth_ground_truth", + "calibrationTopic": "/robot_1/sensors/front_stereo/right/camera_info", + "colorMode": "gradient", + "minValue": 0, + "maxValue": 100 + } + } + }, + "globalVariables": {}, + "userNodes": {}, + "playbackConfig": { + "speed": 1 + }, + "layout": { + "direction": "row", + "first": "Image!2cny33f", + "second": { + "first": "3D!1devslg", + "second": "Tab!3utzhs8", + "direction": "column", + "splitPercentage": 53.09235777097605 + }, + "splitPercentage": 0 + } +} \ No newline at end of file diff --git a/gcs/foxglove_extensions/install.py b/gcs/foxglove_extensions/install.py new file mode 100644 index 000000000..fc28de102 --- /dev/null +++ b/gcs/foxglove_extensions/install.py @@ -0,0 +1,51 @@ +#!/usr/bin/env python3 +""" +Install AirStack Foxglove extensions into a Foxglove user-extensions dir. 
+ +By default this targets the GCS container's bundled Foxglove app +(/root/.foxglove-studio/extensions), which is the entrypoint that +gcs/docker/gcs-base-docker-compose.yaml runs on container start. + +The src/dst paths can be overridden via env vars, which is how the +`airstack osmo:foxglove` wrapper reuses this same script to install the +extensions into the laptop's local Foxglove Desktop app before +port-forwarding the GCS bridge — that way the laptop's Foxglove sees +"Robot Tasks" / "Waypoint Editor" / "Polygon Editor" instead of the +"Unknown panel type: ..." placeholders. + +Env vars: + FOXGLOVE_EXT_SRC directory containing the extension subdirectories + (each with a package.json + dist/extension.js) + FOXGLOVE_EXT_DST target user-extensions directory, e.g. + ~/.foxglove-studio/extensions on Linux/macOS. +""" + +import json +import os +import re +import shutil + +src = os.environ.get( + 'FOXGLOVE_EXT_SRC', '/root/AirStack/gcs/foxglove_extensions') +dst = os.path.expanduser(os.environ.get( + 'FOXGLOVE_EXT_DST', '/root/.foxglove-studio/extensions')) +os.makedirs(dst, exist_ok=True) + + +def _slug(s: str) -> str: + return re.sub(r'[^a-z0-9-]+', '-', s.lower()).strip('-') + + +installed = 0 +for ext in sorted(os.listdir(src)): + pkg_path = os.path.join(src, ext, 'package.json') + if not os.path.exists(pkg_path): + continue + pkg = json.load(open(pkg_path)) + name = '{}.{}-{}'.format(_slug(pkg['publisher']), pkg['name'], pkg['version']) + shutil.copytree(os.path.join(src, ext), os.path.join(dst, name), dirs_exist_ok=True) + print('Installed Foxglove extension:', name, '->', os.path.join(dst, name)) + installed += 1 + +if installed == 0: + print('No Foxglove extensions found under', src) diff --git a/gcs/foxglove_extensions/polygon-editor/dist/extension.js b/gcs/foxglove_extensions/polygon-editor/dist/extension.js new file mode 100644 index 000000000..607a3edfa --- /dev/null +++ b/gcs/foxglove_extensions/polygon-editor/dist/extension.js @@ -0,0 +1,373 @@ +(() => 
{ +"use strict"; + +// ─────────────────────────── constants ──────────────────────────────────────── + +const CMD_TOPIC = "/gcs/polygon/command"; +const LIST_TOPIC = "/gcs/polygon/list"; +const SAVES_TOPIC = "/gcs/polygon/saves"; +const CMD_SCHEMA = "std_msgs/msg/String"; + +// ─────────────────────────── panel ─────────────────────────────────────────── + +function activate(extensionContext) { + extensionContext.registerPanel({ + name: "Polygon Editor", + initPanel: (panelContext) => { + + // ── state ──────────────────────────────────────────────────────── + const persisted = panelContext.initialState ?? {}; + const state = { + defaultZ: persisted.defaultZ ?? 0.0, + }; + let vertices = []; // [{x, y, z}, ...] + let selectedIdx = -1; + let enabled = false; // synced from /gcs/polygon/list + let saves = []; // synced from /gcs/polygon/saves + + function persist() { panelContext.saveState(state); } + function sendCmd(cmd) { + try { + panelContext.advertise(CMD_TOPIC, CMD_SCHEMA); + panelContext.publish(CMD_TOPIC, { data: JSON.stringify(cmd) }); + } catch (err) { + statusEl.textContent = "Cmd failed: " + (err?.message ?? 
err); + } + } + + // ── DOM ────────────────────────────────────────────────────────── + const root = panelContext.panelElement; + root.style.cssText = + "display:flex;flex-direction:column;height:100%;box-sizing:border-box;" + + "padding:8px;gap:6px;font-family:sans-serif;color:inherit;overflow-y:auto;"; + + // Title row + const titleRow = el("div", "display:flex;align-items:center;gap:8px;flex-shrink:0;"); + const title = el("span", "font-weight:bold;font-size:14px;flex:1;"); + title.textContent = "Polygon Editor"; + titleRow.appendChild(title); + root.appendChild(titleRow); + + // Altitude + clear row + const ctrlRow = el("div", "display:flex;align-items:center;gap:8px;flex-shrink:0;"); + + const enableBtn = el("button", + "padding:8px 18px;border-radius:5px;border:none;color:white;cursor:pointer;font-weight:bold;font-size:15px;"); + enableBtn.addEventListener("click", () => { + sendCmd({ action: "set_enabled", enabled: !enabled }); + }); + ctrlRow.appendChild(enableBtn); + + // Z is intentionally not exposed for polygons — vertices always carry + // z=0 server-side; the consumers treat polygons as 2D footprints. 
+ + const clearBtn = el("button", + "padding:8px 14px;border-radius:5px;border:none;background:#dc2626;color:white;cursor:pointer;font-size:14px;"); + clearBtn.textContent = "Clear All"; + clearBtn.addEventListener("click", () => { sendCmd({ action: "clear" }); }); + + ctrlRow.appendChild(el("span", "flex:1;")); // spacer + ctrlRow.appendChild(clearBtn); + root.appendChild(ctrlRow); + + // Vertex count + const countEl = el("div", "font-size:12px;opacity:0.8;flex-shrink:0;"); + root.appendChild(countEl); + + // Vertex list container + const listContainer = el("div", + "flex:1;overflow-y:auto;border:1px solid #444;border-radius:4px;min-height:60px;"); + root.appendChild(listContainer); + + // Add vertex manually row (X/Y only — Z is fixed to 0 for polygons) + const addRow = el("div", "display:flex;align-items:center;gap:4px;flex-shrink:0;"); + const addXIn = numInput("X", "0"); + const addYIn = numInput("Y", "0"); + const addBtn = el("button", + "padding:6px 14px;border-radius:5px;border:none;background:#10b981;color:white;cursor:pointer;font-size:14px;"); + addBtn.textContent = "+ Add"; + addBtn.addEventListener("click", () => { + sendCmd({ + action: "add", + x: Number(addXIn.input.value) || 0, + y: Number(addYIn.input.value) || 0, + z: 0, + }); + }); + addRow.appendChild(addXIn.wrap); + addRow.appendChild(addYIn.wrap); + addRow.appendChild(addBtn); + root.appendChild(addRow); + + // ── Saves section ──────────────────────────────────────────────── + const savesLabel = el("div", + "font-size:11px;font-weight:bold;opacity:0.8;margin-top:6px;flex-shrink:0;"); + savesLabel.textContent = "Saves"; + root.appendChild(savesLabel); + + const saveAddRow = el("div", "display:flex;align-items:center;gap:4px;flex-shrink:0;"); + const saveNameIn = el("input", + "flex:1;padding:6px 8px;border-radius:5px;border:1px solid #555;background:transparent;color:inherit;font-size:14px;"); + saveNameIn.type = "text"; + saveNameIn.placeholder = "save name…"; + const saveAddBtn = 
el("button", + "padding:6px 14px;border-radius:5px;border:none;background:#10b981;color:white;cursor:pointer;font-size:14px;"); + saveAddBtn.textContent = "+ Add"; + saveAddBtn.addEventListener("click", () => { + const name = saveNameIn.value.trim(); + if (!name) return; + sendCmd({ action: "add_save", name }); + }); + saveAddRow.appendChild(saveNameIn); + saveAddRow.appendChild(saveAddBtn); + root.appendChild(saveAddRow); + + const saveList = el("div", + "border:1px solid #333;border-radius:4px;min-height:0;flex-shrink:0;"); + root.appendChild(saveList); + + // Status + const statusEl = el("div", "font-size:11px;opacity:0.6;flex-shrink:0;"); + root.appendChild(statusEl); + + function renderSaves() { + saveList.replaceChildren(); + if (saves.length === 0) { + const empty = el("div", "padding:6px 8px;opacity:0.5;font-size:11px;"); + empty.textContent = "No saves yet. Type a name and click + Add."; + saveList.appendChild(empty); + return; + } + for (const s of saves) { + const row = el("div", + "display:flex;align-items:center;gap:8px;padding:6px 8px;border-bottom:1px solid #333;font-size:14px;"); + const swatch = el("span", + "display:inline-block;width:10px;height:10px;border-radius:50%;flex-shrink:0;"); + const [r, g, b] = s.color || [0.5, 0.5, 0.5]; + swatch.style.background = + `rgb(${Math.round(r*255)},${Math.round(g*255)},${Math.round(b*255)})`; + row.appendChild(swatch); + + const nameEl = el("span", "flex:1;font-family:monospace;"); + nameEl.textContent = `${s.name} (${s.count} vert${s.count === 1 ? 
"" : "s"})`; + row.appendChild(nameEl); + + const loadBtn = el("button", + "padding:4px 10px;border-radius:4px;border:1px solid #555;background:transparent;color:inherit;cursor:pointer;font-size:13px;"); + loadBtn.textContent = "Load"; + loadBtn.addEventListener("click", () => { + sendCmd({ action: "load_save", name: s.name }); + }); + row.appendChild(loadBtn); + + const saveBtn = el("button", ""); + if (s.saved) { + saveBtn.textContent = "✓ Saved"; + saveBtn.disabled = true; + saveBtn.style.cssText = + "padding:4px 10px;border-radius:4px;border:1px solid #10b981;background:rgba(16,185,129,0.1);color:#10b981;cursor:default;font-size:13px;"; + } else { + saveBtn.textContent = "Save"; + saveBtn.style.cssText = + "padding:4px 10px;border-radius:4px;border:none;background:#2563eb;color:white;cursor:pointer;font-size:13px;"; + saveBtn.addEventListener("click", () => { + sendCmd({ action: "save_save", name: s.name }); + }); + } + row.appendChild(saveBtn); + + const delBtn = el("button", + "padding:4px 8px;border-radius:4px;border:none;background:transparent;color:#dc2626;cursor:pointer;font-size:15px;"); + delBtn.textContent = "✕"; + delBtn.title = "Delete save"; + delBtn.addEventListener("click", () => { + sendCmd({ action: "delete_save", name: s.name }); + }); + row.appendChild(delBtn); + + saveList.appendChild(row); + } + } + renderSaves(); + + // ── render helpers ─────────────────────────────────────────────── + function renderEnableBtn() { + if (enabled) { + enableBtn.textContent = "Capture: ON"; + enableBtn.style.background = "#10b981"; + } else { + enableBtn.textContent = "Capture: OFF"; + enableBtn.style.background = "#dc2626"; + } + } + renderEnableBtn(); + + function renderList() { + countEl.textContent = vertices.length + " vert" + (vertices.length === 1 ? 
"ex" : "ices"); + listContainer.replaceChildren(); + + if (vertices.length === 0) { + const empty = el("div", "padding:12px;text-align:center;opacity:0.5;font-size:12px;"); + empty.textContent = "No vertices. Enable capture and click in the 3D view, or add manually."; + listContainer.appendChild(empty); + return; + } + + for (let i = 0; i < vertices.length; i++) { + const v = vertices[i]; + const row = el("div", + "display:flex;align-items:center;gap:5px;padding:5px 8px;" + + "border-bottom:1px solid #333;font-size:14px;font-family:monospace;" + + (i === selectedIdx ? "background:rgba(220,38,38,0.18);" : "")); + + const idx = el("span", "width:24px;font-weight:bold;font-size:14px;color:#dc2626;"); + idx.textContent = String(i); + row.appendChild(idx); + + const xIn = coordInput(v.x, (val) => { + sendCmd({ action: "move", index: i, x: val, y: v.y, z: 0 }); + }); + const yIn = coordInput(v.y, (val) => { + sendCmd({ action: "move", index: i, x: v.x, y: val, z: 0 }); + }); + row.appendChild(xIn); + row.appendChild(yIn); + + if (i > 0) { + const upBtn = smallBtn("▲", () => { + sendCmd({ action: "reorder", from: i, to: i - 1 }); + }); + upBtn.title = "Move up"; + row.appendChild(upBtn); + } else { + row.appendChild(el("span", "width:32px;")); + } + + if (i < vertices.length - 1) { + const downBtn = smallBtn("▼", () => { + sendCmd({ action: "reorder", from: i, to: i + 1 }); + }); + downBtn.title = "Move down"; + row.appendChild(downBtn); + } else { + row.appendChild(el("span", "width:32px;")); + } + + const dupBtn = smallBtn("⧉", () => { + sendCmd({ action: "duplicate", index: i }); + }); + dupBtn.title = "Duplicate"; + row.appendChild(dupBtn); + + const delBtn = smallBtn("✕", () => { + sendCmd({ action: "delete", index: i }); + }); + delBtn.style.color = "#dc2626"; + delBtn.title = "Delete"; + row.appendChild(delBtn); + + row.addEventListener("click", (e) => { + if (e.target.tagName === "INPUT" || e.target.tagName === "BUTTON") return; + selectedIdx = (selectedIdx === i) 
? -1 : i; + renderList(); + }); + + listContainer.appendChild(row); + } + } + renderList(); + + // ── subscriptions ──────────────────────────────────────────────── + panelContext.subscribe([ + { topic: LIST_TOPIC }, + { topic: SAVES_TOPIC }, + ]); + panelContext.watch("currentFrame"); + + panelContext.onRender = (renderState, done) => { + const frame = renderState.currentFrame; + if (frame) { + for (const evt of frame) { + if (evt.topic === LIST_TOPIC) { + try { + const data = JSON.parse(evt.message.data); + vertices = data.vertices ?? []; + if (data.default_z != null) { + state.defaultZ = data.default_z; + altInput.value = String(data.default_z); + } + if (data.enabled != null) { + enabled = Boolean(data.enabled); + renderEnableBtn(); + } + if (selectedIdx >= vertices.length) selectedIdx = -1; + renderList(); + } catch { /* ignore bad data */ } + } else if (evt.topic === SAVES_TOPIC) { + try { + const data = JSON.parse(evt.message.data); + saves = Array.isArray(data.saves) ? data.saves : []; + renderSaves(); + } catch { /* ignore bad data */ } + } + } + } + done(); + }; + + panelContext.setDefaultPanelTitle("Polygon Editor"); + + return () => {}; + }, + }); +} + +// ─────────────────────────── DOM helpers ───────────────────────────────────── + +function el(tag, style) { + const e = document.createElement(tag); + if (style) e.style.cssText = style; + return e; +} + +function smallBtn(text, onClick) { + const b = el("button", + "width:32px;height:32px;padding:0;border:none;background:transparent;" + + "color:inherit;cursor:pointer;font-size:15px;border-radius:4px;"); + b.textContent = text; + b.addEventListener("mouseenter", () => { b.style.background = "rgba(255,255,255,0.1)"; }); + b.addEventListener("mouseleave", () => { b.style.background = "transparent"; }); + b.addEventListener("click", (e) => { e.stopPropagation(); onClick(); }); + return b; +} + +function coordInput(value, onChange) { + const inp = el("input", + "width:72px;padding:5px 
7px;border-radius:4px;border:1px solid #555;" + + "background:transparent;color:inherit;font-family:monospace;font-size:13px;text-align:right;"); + inp.type = "number"; + inp.step = "0.5"; + inp.value = String(value); + inp.addEventListener("change", () => { + onChange(Number(inp.value) || 0); + }); + return inp; +} + +function numInput(label, defaultVal) { + const wrap = el("div", "display:flex;align-items:center;gap:4px;"); + const lbl = el("span", "font-size:13px;opacity:0.75;"); + lbl.textContent = label + ":"; + const input = el("input", + "width:64px;padding:5px 7px;border-radius:4px;border:1px solid #555;" + + "background:transparent;color:inherit;font-size:13px;"); + input.type = "number"; + input.step = "0.5"; + input.value = defaultVal; + wrap.appendChild(lbl); + wrap.appendChild(input); + return { wrap, input }; +} + +module.exports = { activate }; +})(); diff --git a/gcs/foxglove_extensions/polygon-editor/package.json b/gcs/foxglove_extensions/polygon-editor/package.json new file mode 100644 index 000000000..780c75f64 --- /dev/null +++ b/gcs/foxglove_extensions/polygon-editor/package.json @@ -0,0 +1,9 @@ +{ + "name": "polygon-editor", + "displayName": "Polygon Editor", + "description": "Click-to-place polygon vertex editor for the Foxglove 3D view.", + "publisher": "AirLab CMU", + "version": "1.0.0", + "license": "Apache-2.0", + "main": "./dist/extension.js" +} diff --git a/gcs/foxglove_extensions/render_layout.py b/gcs/foxglove_extensions/render_layout.py new file mode 100644 index 000000000..103736aa9 --- /dev/null +++ b/gcs/foxglove_extensions/render_layout.py @@ -0,0 +1,193 @@ +#!/usr/bin/env python3 +"""Render airstack_layout_custom.json with NUM_ROBOTS tabs from a single-robot template. + +Foxglove layout JSON has no native templating, so we generate it at GCS startup +based on the NUM_ROBOTS env var. 
Tab[0] of the input file is treated as the +canonical robot_1 template; we replicate it for robots 1..NUM_ROBOTS, mint +unique panel IDs per tab, and patch the 3D panel's per-robot transforms / +topics / namespaces to cover the same range. +""" + +import argparse +import copy +import json +import os +import re + +PANEL_ID_RE = re.compile(r'^([A-Za-z0-9_.\- ]+)!(\w+)$') +ROBOT_KEY_RE = re.compile(r'^(.*?)robot_(\d+)(.*)$') +# Strip every trailing `_r` suffix this script previously appended so +# re-running over its own output doesn't stack (e.g. `_r1_r1_r1...`). +PANEL_SUFFIX_RE = re.compile(r'(_r\d+)+$') + + +def replace_robot_n(obj, src_n: int, dst_n: int): + """Deep-replace robot_{src_n} → robot_{dst_n} in strings and dict keys. + Also handles the human-readable 'robot {N}' tab title form.""" + src_us, dst_us = f'robot_{src_n}', f'robot_{dst_n}' + src_sp, dst_sp = f'robot {src_n}', f'robot {dst_n}' + + def _swap(s: str) -> str: + return s.replace(src_us, dst_us).replace(src_sp, dst_sp) + + def _do(o): + if isinstance(o, dict): + return {(_swap(k) if isinstance(k, str) else k): _do(v) + for k, v in o.items()} + if isinstance(o, list): + return [_do(v) for v in o] + if isinstance(o, str): + return _swap(o) + return o + + return _do(obj) + + +def find_panel_ids(obj, ids=None): + """Collect every panel-ID string (`Pkg!suffix`) appearing in a layout tree.""" + if ids is None: + ids = set() + if isinstance(obj, str): + if PANEL_ID_RE.fullmatch(obj): + ids.add(obj) + elif isinstance(obj, dict): + for v in obj.values(): + find_panel_ids(v, ids) + elif isinstance(obj, list): + for v in obj: + find_panel_ids(v, ids) + return ids + + +def remap_panel_ids(obj, mapping): + if isinstance(obj, str): + return mapping.get(obj, obj) + if isinstance(obj, dict): + return {k: remap_panel_ids(v, mapping) for k, v in obj.items()} + if isinstance(obj, list): + return [remap_panel_ids(v, mapping) for v in obj] + return obj + + +def _mint_id(pid: str, n: int) -> str: + m = 
PANEL_ID_RE.fullmatch(pid) + if not m: + return pid + base = PANEL_SUFFIX_RE.sub('', m.group(2)) + return f'{m.group(1)}!{base}_r{n}' + + +def _expand_per_robot(obj, num_robots: int) -> None: + """Walk obj in-place. For every dict whose keys match + `robot_`, drop existing per-robot keys and re-add + cloned ones for N=1..num_robots, using the lowest-N entry as the template.""" + if isinstance(obj, list): + for v in obj: + _expand_per_robot(v, num_robots) + return + if not isinstance(obj, dict): + return + + by_base = {} + keys_to_drop = [] + for k, v in obj.items(): + m = ROBOT_KEY_RE.match(k) + if m: + base = (m.group(1), m.group(3)) + n = int(m.group(2)) + cur = by_base.get(base) + if cur is None or n < cur[0]: + by_base[base] = (n, v) + keys_to_drop.append(k) + + for k in keys_to_drop: + obj.pop(k, None) + + for (prefix, suffix), (src_n, template_value) in by_base.items(): + for dst_n in range(1, num_robots + 1): + new_key = f'{prefix}robot_{dst_n}{suffix}' + obj[new_key] = replace_robot_n(template_value, src_n, dst_n) + + for v in obj.values(): + _expand_per_robot(v, num_robots) + + +def expand_layout(template_json: dict, num_robots: int) -> dict: + out = copy.deepcopy(template_json) + config_by_id = out['configById'] + + tab_key = next( + (k for k, v in config_by_id.items() + if k.startswith('Tab!') and isinstance(v, dict) and 'tabs' in v), + None) + if tab_key is None: + raise SystemExit('No Tab!* container with tabs[] found in configById') + + tab_container = config_by_id[tab_key] + tabs = tab_container.get('tabs', []) + if not tabs: + raise SystemExit('Tab container has no tabs to use as template') + + template_tab = copy.deepcopy(tabs[0]) + template_panel_ids = find_panel_ids(template_tab.get('layout', {})) + template_configs = {pid: copy.deepcopy(config_by_id[pid]) + for pid in template_panel_ids if pid in config_by_id} + + old_panel_ids = set() + for t in tabs: + old_panel_ids |= find_panel_ids(t.get('layout', {})) + for pid in old_panel_ids: + 
config_by_id.pop(pid, None) + + new_tabs = [] + for n in range(1, num_robots + 1): + cloned_tab = replace_robot_n(template_tab, 1, n) + id_map = {pid: _mint_id(pid, n) for pid in template_panel_ids} + cloned_tab['layout'] = remap_panel_ids(cloned_tab['layout'], id_map) + new_tabs.append(cloned_tab) + for old_pid, new_pid in id_map.items(): + if old_pid in template_configs: + config_by_id[new_pid] = replace_robot_n( + template_configs[old_pid], 1, n) + + tab_container['tabs'] = new_tabs + if tab_container.get('activeTabIdx', 0) >= num_robots: + tab_container['activeTabIdx'] = 0 + + for k, v in config_by_id.items(): + if k.startswith('3D!') and isinstance(v, dict): + _expand_per_robot(v, num_robots) + + return out + + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument('--input', help='Source template JSON (LAYOUT_TEMPLATE env)', + default=os.environ.get( + 'LAYOUT_TEMPLATE', + '/root/AirStack/gcs/foxglove_extensions/airstack_default.json')) + ap.add_argument('--output', help='Rendered output (LAYOUT_OUTPUT env). 
Defaults to ' + '/root/airstack_layout_num_robots_.json so the file appears in ' + 'Foxglove\'s "Import Layout" file browser.', + default=os.environ.get('LAYOUT_OUTPUT')) + ap.add_argument('--num-robots', type=int, + default=int(os.environ.get('NUM_ROBOTS', '1'))) + args = ap.parse_args() + if args.output is None: + args.output = f'/root/airstack_layout_num_robots_{args.num_robots}.json' + + with open(args.input) as f: + template = json.load(f) + rendered = expand_layout(template, args.num_robots) + + os.makedirs(os.path.dirname(args.output), exist_ok=True) + tmp = args.output + '.tmp' + with open(tmp, 'w') as f: + json.dump(rendered, f, indent=2) + os.replace(tmp, args.output) + print(f'rendered {args.num_robots}-robot layout → {args.output}') + + +if __name__ == '__main__': + main() diff --git a/gcs/foxglove_extensions/robot-commands.foxe b/gcs/foxglove_extensions/robot-commands.foxe new file mode 100644 index 000000000..f14869ba2 Binary files /dev/null and b/gcs/foxglove_extensions/robot-commands.foxe differ diff --git a/gcs/foxglove_extensions/robot-commands/dist/extension.js b/gcs/foxglove_extensions/robot-commands/dist/extension.js new file mode 100644 index 000000000..523ce151e --- /dev/null +++ b/gcs/foxglove_extensions/robot-commands/dist/extension.js @@ -0,0 +1,1575 @@ +(() => { +"use strict"; + +// ─────────────────────────── constants ──────────────────────────────────────── + +const GOAL_STATUS = { + UNKNOWN: 0, + ACCEPTED: 1, + EXECUTING: 2, + CANCELING: 3, + SUCCEEDED: 4, + CANCELED: 5, + ABORTED: 6, +}; + +const TERMINAL_STATUSES = new Set([ + GOAL_STATUS.SUCCEEDED, + GOAL_STATUS.CANCELED, + GOAL_STATUS.ABORTED, +]); + +// Topic the Waypoint Editor publishes its current list on (std_msgs/String JSON). +const EDITOR_LIST_TOPIC = "/gcs/waypoints/list"; +const EDITOR_SAVES_TOPIC = "/gcs/waypoints/saves"; + +// Topic the Polygon Editor publishes its current vertex list on. 
+const POLYGON_LIST_TOPIC = "/gcs/polygon/list"; +const POLYGON_SAVES_TOPIC = "/gcs/polygon/saves"; + +// Module-level caches of latest editor data, refreshed in the panel's onRender. +// The "Grab from Editor" button on each polygon/path field copies these into +// the textbox. +let editorWaypointsCache = []; +let polygonVerticesCache = []; +// Save caches: {name: {color, vertices}} keyed by save name. +let editorSavesCache = {}; +let polygonSavesCache = {}; +// Capture-toggle + default-altitude state per editor (synced from list topics). +let editorEnabled = false; +let polygonEnabled = false; +let editorDefaultZ = 10.0; +let polygonDefaultZ = 0.0; + +// Registry of (kind,