diff --git a/.agents/.gitignore b/.agents/.gitignore new file mode 100644 index 000000000000..4c41cd39e6d2 --- /dev/null +++ b/.agents/.gitignore @@ -0,0 +1,3 @@ +* +!.gitignore +!skills/ diff --git a/.agents/skills/.gitignore b/.agents/skills/.gitignore new file mode 100644 index 000000000000..d6b7ef32c847 --- /dev/null +++ b/.agents/skills/.gitignore @@ -0,0 +1,2 @@ +* +!.gitignore diff --git a/.claude/.gitignore b/.claude/.gitignore new file mode 100644 index 000000000000..3f0760f9ec16 --- /dev/null +++ b/.claude/.gitignore @@ -0,0 +1,3 @@ +* +!.gitignore +!skills diff --git a/.claude/skills b/.claude/skills new file mode 120000 index 000000000000..5547b0582a68 --- /dev/null +++ b/.claude/skills @@ -0,0 +1 @@ +/home/maru/.claude/personal-skills \ No newline at end of file diff --git a/.envrc b/.envrc index f7d1210c8558..877f4498c123 100644 --- a/.envrc +++ b/.envrc @@ -38,3 +38,23 @@ source_env_if_exists .envrc.local export ENVRC_GIT_REPO=avalanchego # Enables configuring the global behavior by repo export ENVRC_PROJECT_DIR="$PWD" # Enables targeting the current project directory [[ -n "$GLOBAL_ENVRC" ]] && source_env_if_exists "$GLOBAL_ENVRC" + +ensure_repo_agent_symlink_if_absent() { + local source_path="$1" + local target_path="$2" + + if [[ ! -f "$source_path" ]]; then + return + fi + + if [[ -e "$target_path" || -L "$target_path" ]]; then + return + fi + + ln -s "$source_path" "$target_path" +} + +# Provide project defaults for agent context files unless a user has already +# supplied their own via local files or global envrc customization. +ensure_repo_agent_symlink_if_absent "agents/AGENTS.md" "AGENTS.md" +ensure_repo_agent_symlink_if_absent "agents/AGENTS.md" "CLAUDE.md" diff --git a/.gitignore b/.gitignore index cc8e9a3b6124..6d9d62f49daa 100644 --- a/.gitignore +++ b/.gitignore @@ -65,9 +65,27 @@ vendor # Personal extension to .envrc .envrc.local +# Generated by .envrc as agent entrypoints when absent. 
+/AGENTS.md +/CLAUDE.md +/CLAUDE.local.md + # debug files __debug_* # polyrepo polyrepo.log /firewood/ + +# Local agent config and skills can be symlinked in without being tracked. +!.agents/ +.agents/* +!.agents/.gitignore +!.agents/skills/ +.agents/skills/* +!.agents/skills/.gitignore + +!.claude/ +.claude/* +!.claude/.gitignore +!.claude/skills diff --git a/.review-briefs/README.md b/.review-briefs/README.md new file mode 100644 index 000000000000..85b5bd351a90 --- /dev/null +++ b/.review-briefs/README.md @@ -0,0 +1,236 @@ +# PR review briefs + +This directory holds **PR review briefs**: lightweight, PR-scoped documents in +`.review-briefs/` that give reviewers a holistic overview of what a PR is trying to do and why. + +They are the versioned, in-tree long-form review companion for a PR. + +## At a glance + +A PR review brief is the versioned, in-tree long-form review companion for a PR. + +Use one when the diff and PR description alone are not enough for a reviewer to quickly form the +right mental model. + +A good review brief should: +- explain what the PR is doing and why +- make scope and non-goals explicit +- capture the key concepts, assumptions, or invariants needed for review +- say how the PR should be validated +- point reviewers at the parts of the diff that deserve the most attention + +If a PR has a review brief, link it from the PR description. + +Think of these as: +- the evolving review companion for a PR +- a versioned long-form review overview that can change with the code +- more focused than long-lived design docs in `docs/` +- useful to the author, reviewers, and agent-assisted review + +They may be authored and maintained by humans, agents, or both. + +## Relation to the PR description + +The PR description is usually the entry point for review. + +The review brief is the preferred versioned long-form review companion when a PR benefits from one. 
+ +The PR description can stay relatively static and high-level, for example: +- a short summary of the PR +- links to issues, related PRs, or external context +- any GitHub-specific review metadata + +Once pointed to it, reviewers should be able to rely on the review brief for the holistic +explanation of the PR. The code should still be readable on its own. + +For readers familiar with Gerrit's commit-message-file workflow, a PR review brief serves a +similar purpose: it provides an in-tree, versioned place for reviewer-facing context that evolves +with the change. The difference is that a review brief is intentionally longer-form and can cover +scope, non-goals, validation, and review focus, not just the change summary. + +## Reviewer expectations + +Authors should aim to keep the review brief aligned with the current state of the PR. + +Reviewers should be able to read the brief first and then use it to evaluate whether the diff +matches the stated intent. + +Reviewers should be able to use the brief to answer: +- what is this PR trying to do and why? +- what is in scope, and what is intentionally not? +- what assumptions or invariants matter when reading the diff? +- how was this validated? +- where should review attention go? + +The brief should match the code with no important omissions or contradictions. If part of the brief +has become stable project guidance, consider promoting it into durable docs. + +## Why this exists + +Some context is important during implementation and review, but doesn't belong in code comments or +commit messages: +- what problem the PR is solving +- what high-level ideas or invariants matter +- what is intentionally out of scope +- what reviewers should pay special attention to +- how the change should be validated + +A PR review brief gives that context a place in the tree where it can evolve alongside the code. + +This also avoids relying on the PR description as the main home for evolving review context. 
+PR descriptions are useful entry points, but they are not versioned and reviewed like files in the +tree, so review context drift is easier to miss. + +## Default convention + +Commit review briefs in `.review-briefs/` as part of the same PR as the code they describe. + +That keeps the review overview versioned, reviewable, and editable as the implementation changes, +without adding special pre-merge or post-merge workflows. + +This directory may accumulate stale docs over time. That's okay. Periodic cleanup is cheaper than +requiring everyone to manage a separate lifecycle for every PR. + +## How this differs from a design doc + +A review brief may contain some design-doc-like material, but it serves a different purpose. + +Compared with a design doc, a PR review brief is: +- more review-oriented than design-oriented +- usually narrower in scope and tied to a single PR +- allowed to be written before, during, or after implementation starts +- more like an evolved PR description than a standalone architecture document +- not expected to be durable project documentation by default + +If content becomes stable, reusable guidance or durable architectural context, promote it into +`docs/`, this directory's `README.md`, or a package `README.md`. + +## Agent-assisted workflow + +A PR review brief can be agent-maintained. + +A useful default workflow is: +1. Ask an agent to generate a review brief that captures the important high-level details, scope, + invariants, validation, and review focus of the PR. +2. In another session or subagent, review the brief against the code and PR description for + consistency and completeness. +3. Resolve any gaps or inconsistencies by updating the brief, the code, or both. +4. Repeat as the PR evolves. + +The goal is not to have the agent invent rationale after the fact. The goal is to keep the brief +aligned with the code as the PR evolves. 
+ +## What belongs here + +Create a PR review brief when a reviewer would benefit from a more explicit review companion, +especially when: +- the intent is not obvious from the code alone +- there are meaningful tradeoffs or rejected alternatives +- the work spans multiple commits or PRs +- you want a checklist for self-review or agent-assisted review +- you want reviewers to evaluate the change against explicit scope and validation criteria + +Good review-brief content is specific to the PR under review, such as: +- the problem this PR is solving +- the key concepts, invariants, or constraints a reviewer should keep in mind +- the scope and non-goals of this PR +- the high-level shape of the approach +- how the PR should be validated +- where reviewers should focus attention + +## What does not belong here + +Do not use this directory for: +- agent configuration or prompts (`.agents/` is for that) +- durable user/developer documentation that should remain after the PR is merged (`docs/`, + package `README.md`, etc.) +- stable usage guidance for a mechanism or convention once it stops being PR-specific +- changelog-style narration of everything that changed in the diff +- details that should instead be made clear in the code, commit structure, or durable docs +- issue tracking or task management already captured elsewhere +- temporary scratch notes that are not useful for review + +## Lifecycle + +### During PR development + +Use the review brief to: +1. capture the reviewer-facing overview of the PR +2. record important scope boundaries, concepts, and invariants +3. document how the PR should be validated +4. guide self-review and peer review +5. update the brief as understanding evolves +6. periodically review it against the code for consistency and completeness + +A review brief may be written early, but it can also be synthesized later from an implemented PR +if that is the best way to prepare for review. 
+ +### Before merge + +Trim obviously stale sections so the brief reflects the final reviewable shape of the PR, not +every abandoned thought. + +### After merge + +Use judgment: +- If the brief still has value as historical context for the PR, leave it here. +- If some parts describe stable convention or durable behavior, promote those parts into `docs/`, + this directory's `README.md`, or a package `README.md`. +- If it no longer adds value, it can be removed in a later cleanup sweep. + +The default is **no special cleanup step required**. + +## Naming + +Use one file in `.review-briefs/` per PR under review. + +Preferred names: +- `<issue-id>-short-title.md` +- `<short-title>.md` + +Examples: +- `3333-firewood-commit-path.md` +- `subnet-bootstrap-validation.md` + +Choose names that will still make sense after the branch is gone and the PR has merged. + +## Suggested structure + +A review brief should read more like an evolved PR description than a reduced design doc. +Not every section is required; include only what materially helps review. + +```md +# <title> + +## Overview +A concise, holistic explanation of what the PR is doing and why. + +## Why now +Why this PR exists now. What problem or pressure is it addressing? + +## What reviewers should know +The key concepts, invariants, assumptions, or mental model needed to read the diff well. + +## Scope +What this PR is doing, and what it is intentionally not doing. + +## Approach +The high-level shape of the implementation, only as much as needed to orient review. + +## Validation +How this was or should be validated. + +## Review focus +Where reviewers should spend attention. + +## Follow-ups +What is left for later. +``` + +Optional sections when useful: +- `## Alternatives considered` +- `## References` +- `## Risks` + +See `.review-briefs/introduce-review-briefs.md` for a worked example. 
+ diff --git a/.review-briefs/introduce-review-briefs.md b/.review-briefs/introduce-review-briefs.md new file mode 100644 index 000000000000..f9a900d3265c --- /dev/null +++ b/.review-briefs/introduce-review-briefs.md @@ -0,0 +1,111 @@ +# Introduce PR review briefs + +## Overview + +This PR introduces `.review-briefs/` as a place for a versioned, in-tree long-form review +companion that would otherwise be reconstructed from the PR description, review comments, and the +diff. + +The goal is to make that reviewer-facing overview explicit enough to guide self-review, peer +review, and agent-assisted review. + +## Why now + +Important PR-level context is currently scattered across: +- the PR description +- review comments +- commit messages +- the code itself + +That makes it harder to review a PR against a clear statement of: +- what problem the PR is solving +- which high-level ideas and constraints matter +- what is intentionally out of scope +- how the reviewer should evaluate success + +This PR introduces the review brief as a versioned long-form review companion, so reviewers do not +need to reconstruct that model from the PR description and diff alone. + +A secondary motivation is to avoid relying on the PR description as the main home for evolving +review context. PR descriptions are useful entry points, but they are not versioned and reviewed +like files in the tree, so changes in emphasis or intent are easier to miss. + +## What reviewers should know + +This PR is intentionally defining review briefs as something closer to an evolved PR description +than a reduced design doc. 
+ +The intended mental model is: +- the PR description is the entry point +- the review brief is the versioned long-form review companion when a PR benefits from one +- the brief should help reviewers form the right high-level model without reconstructing it from + the diff alone + +That means the brief is: +- primarily for reviewers +- allowed to be written before, during, or after implementation +- PR-scoped rather than durable by default +- meant to guide review rather than narrate the diff +- meant to supplement, not replace, a concise PR description + +## Scope + +This PR: +- adds a top-level `.review-briefs/` directory +- adds `.review-briefs/README.md` defining the convention +- adds an early "at a glance" framing so readers quickly see what a review brief is and when to + use one +- positions the review brief as the preferred versioned long-form review companion for a PR +- defines an agent-assisted workflow for generating and maintaining the brief +- adds guidance to link the brief from the PR description for discovery +- adds reviewer-expectations guidance near the top of the README +- adds a brief Gerrit commit-message-file analogy under the PR-description relationship +- changes the suggested format to a reviewer-oriented, evolved-PR-description structure + +This PR does not: +- require a review brief for every trivial or mechanical PR +- add tooling, automation, or CI enforcement +- make review briefs durable project documentation by default +- replace concise PR descriptions as the normal entry point for review + +## Approach + +Keep the convention lightweight: +- commit review briefs in `.review-briefs/` as part of the same PR as the code they describe +- use the PR description to point reviewers to the brief +- let the README carry the stable usage guidance +- allow briefs to be created up front or synthesized later for review +- tolerate some stale briefs instead of requiring merge-time cleanup + +## Validation + +Success for this PR is: +- the 
repository contains `.review-briefs/README.md` +- the README clearly explains what review briefs are for +- the README gives readers an early elevator pitch for what a review brief is and when to use one +- the README distinguishes review briefs from design docs and durable documentation +- the README is explicit that review briefs are for PR review +- the README explains how reviewers discover a brief during review +- the README makes reviewer expectations easy to find near the top +- the README explicitly captures agent-assisted generation and maintenance +- the suggested format reads like an evolved PR description that guides review + +## Review focus + +Focus review on: +- whether `.review-briefs/` is the right scope and level of explicitness +- whether the README now gives readers the right elevator pitch early enough +- whether the README clearly separates review briefs from design docs and durable docs +- whether the PR-description framing is concrete enough to be useful in practice +- whether the Gerrit analogy is clarifying without becoming the main framing +- whether the discoverability guidance is enough to make briefs useful +- whether the agent-assisted workflow is concrete enough to use +- whether the suggested format matches the reviewer-oriented role of the document +- whether the convention feels lightweight and selective enough to adopt in practice + +## Follow-ups + +Possible future follow-ups, intentionally out of scope for this PR: +- add a `TEMPLATE.md` +- mention the convention in `CONTRIBUTING.md` +- add more example briefs after the pattern has been used on more PRs diff --git a/.review-briefs/plan-github-release-process.md b/.review-briefs/plan-github-release-process.md new file mode 100644 index 000000000000..d171ceadf6bc --- /dev/null +++ b/.review-briefs/plan-github-release-process.md @@ -0,0 +1,293 @@ +# GitHub Release Process Automation Plan + +**Goal:** Given a tag, produce a complete GitHub release with all signed artifacts — fully 
automated. + +**Epic:** [#5157 — Automate release process](https://github.com/ava-labs/avalanchego/issues/5157) + +**Architecture:** A unified GitHub Actions workflow orchestrates platform-specific build jobs, signs all artifacts (GPG via repo secrets for Linux, codesign+notarytool for macOS), uploads packages to S3, and creates the GitHub release with all assets attached. KMS-backed signing is a future hardening step after secrets-based signing is proven. + +--- + +## Current State + +The release process today is semi-automated. Tag push triggers independent build workflows, but signing, artifact collection, and release page creation require manual intervention. + +### What exists + +* **Linux binary tarballs** — built by [`build-linux-binaries.yml`](.github/workflows/build-linux-binaries.yml) + * amd64 and arm64, uploaded to S3 + * GPG signing is **manual** (done offline via `create-github-release.sh`) +* **macOS binaries** — built by [`build-macos-release.yml`](.github/workflows/build-macos-release.yml) + * Uploaded to S3 + * codesign + notarization is **manual** +* **DEB packages** — built by [`build-ubuntu-amd64-release.yml`](.github/workflows/build-ubuntu-amd64-release.yml) and arm64 variant + * Unsigned, uploaded to S3 via `deb-s3` + * GPG signing in progress: PRs [#5179](https://github.com/ava-labs/avalanchego/pull/5179), [#5180](https://github.com/ava-labs/avalanchego/pull/5180) +* **RPM packages** — built by [`build-rpm-release.yml`](.github/workflows/build-rpm-release.yml) + * GPG-signed via repo secret (`RPM_GPG_PRIVATE_KEY`) + * Validated in Rocky Linux 9 container + * Not uploaded to S3, only GitHub Artifacts +* **Docker images** — built by [`publish_docker_image.yml`](.github/workflows/publish_docker_image.yml) + * Fully automated, multi-arch, pushed to DockerHub +* **GitHub release page** — created manually via [`create-github-release.sh`](review-notes/create-github-release.sh) + * Downloads from S3 → signs with GPG → creates release via `gh` + * 
Assets: 3 tarballs/zips + 3 detached `.sig` files (6 total) + * No DEB/RPM packages attached to release page + +### Active branches + +| Branch | Purpose | PR | +|---|---|---| +| `PlatCore/5109-refactor-rpm-for-reuse-v2` | Parameterize nfpm configs for DEB reuse | [#5179](https://github.com/ava-labs/avalanchego/pull/5179) | +| `PlatCore/5109-add-deb-gpg-signing-v2` | Add GPG-signed DEB packages | [#5180](https://github.com/ava-labs/avalanchego/pull/5180) | +| `PlatCore/5109-add-kms-gpg-signing` | KMS-backed signing for DEB + RPM | [#5136](https://github.com/ava-labs/avalanchego/pull/5136) (draft) | +| `PlatCore/5109-kms-signing-poc` | KMS signing proof of concept | [#5167](https://github.com/ava-labs/avalanchego/pull/5167) (DO NOT MERGE) | + +--- + +## Phase 1: Package Signing Infrastructure + +> Issues: [#5160](https://github.com/ava-labs/avalanchego/issues/5160), [#5193](https://github.com/ava-labs/avalanchego/issues/5193) + +### 1.1 DEB package signing + +* Land nfpm config refactor (PR #5179) + * Parameterize nfpm configs with env vars + * Add `DOCKERFILE` parameter for DEB reuse + * Verify RPM output is unchanged +* Land DEB GPG signing (PR #5180) + * Dockerfile.deb — Ubuntu 22.04 builder with dpkg-sig + * build-deb.sh — build + sign flow mirroring RPM pipeline + * validate-deb.sh — install + signature verification in clean container + * CI workflow for both jammy and noble, amd64 and arm64 + +### 1.2 RPM package signing + +* Verify existing GPG signing via repo secret works end-to-end +* Ensure RPM validation step covers both architectures + +### 1.3 Linux binary tarball signing + +* Add GPG detach-sign step to `build-linux-binaries.yml` + * Sign `.tar.gz` artifacts in CI after build + * Upload `.sig` files alongside tarballs to S3 + * Verify signatures before upload + +--- + +## Phase 2: macOS Notarization + +> Issue: [#5161](https://github.com/ava-labs/avalanchego/issues/5161) + +### 2.1 Automate codesign in CI + +* Store Apple Developer ID certificate in 
GitHub secrets + * Certificate + private key as base64-encoded P12 + * Keychain setup in workflow (create temp keychain, import cert) +* Add codesign step to `build-macos-release.yml` + * Sign the avalanchego binary + * Sign the subnet-evm binary + * Verify signatures with `codesign --verify` + +### 2.2 Automate notarization in CI (???) + +* Store App Store Connect API key in GitHub secrets + * Issuer ID, Key ID, AuthKey P8 file +* Add xcrun notarytool step after codesign + * Create zip for notarization submission + * Submit via `xcrun notarytool submit --wait` + * Staple notarization ticket (if applicable for non-.app) + * Verify with `spctl --assess` or `xcrun notarytool info` + +--- + +## Phase 3: Package Distribution (S3) + +> Issues: [#5158](https://github.com/ava-labs/avalanchego/issues/5158), [#5159](https://github.com/ava-labs/avalanchego/issues/5159) + +### 3.1 DEB S3 upload + +* Update DEB workflow to upload signed `.deb` files to the correct S3 bucket + * Path: `s3://${BUCKET}/linux/debs/ubuntu/${RELEASE}/${ARCH}/` + * Both jammy and noble + * Both amd64 and arm64 +* Update `deb-s3` upload to `downloads.avax.network` bucket + * Publish signed packages to public APT repository + +### 3.2 RPM S3 upload + +* Add S3 upload step to `build-rpm-release.yml` + * Path: `s3://${BUCKET}/linux/rpms/${DISTRO}/${ARCH}/` + * Both x86_64 and aarch64 +* [Optional] Publish to a public YUM/DNF repository + +### 3.3 Signed tarball + signature upload + +* Ensure `build-linux-binaries.yml` uploads both `.tar.gz` and `.tar.gz.sig` to S3 +* Ensure `build-macos-release.yml` uploads both `.zip` and `.zip.sig` to S3 + +--- + +## Phase 4: GitHub Release Automation + +> Issue: [#5162](https://github.com/ava-labs/avalanchego/issues/5162) + +### 4.1 Orchestrator workflow + +* Create `create-github-release.yml` — top-level workflow triggered by tag push + * Gate on all build workflows completing successfully + * `build-linux-binaries.yml` + * `build-macos-release.yml` + * 
`build-ubuntu-amd64-release.yml` + * `build-ubuntu-arm64-release.yml` + * `build-rpm-release.yml` + * `publish_docker_image.yml` + * Manual workflow_dispatch fallback with tag input + +### 4.2 Artifact collection + +* Download all signed artifacts from S3 (or from workflow artifacts) + * Linux tarballs + `.sig` (amd64, arm64) + * macOS zip + `.sig` + * DEB packages (jammy, noble × amd64, arm64) + * RPM packages (x86_64, aarch64) +* Verify checksums / signatures before attaching to release + +### 4.3 Release notes generation + +* Extract release notes from a conventional source + * Option A: `CHANGELOG.md` section for the tag + * Option B: Auto-generate from merged PRs since last tag + * Option C: Require `release-notes.md` committed alongside version bump PR +* Format release body with artifact table and install instructions + +### 4.4 GitHub release creation + +* Create release via `gh release create` + * Title: `{CodeName}.{Patch} - {Description}` (e.g., "Granite.2 - Benchlist Redesign") + * Attach all artifacts (binaries + signatures + packages) + * Mark as `latest` (or `prerelease` for `-fuji` tags) +* Expected asset list (target): + * `avalanchego-linux-amd64-${TAG}.tar.gz` + `.sig` + * `avalanchego-linux-arm64-${TAG}.tar.gz` + `.sig` + * `avalanchego-macos-${TAG}.zip` + `.sig` + * `avalanchego-${TAG}-1.x86_64.rpm` + * `avalanchego-${TAG}-1.aarch64.rpm` + * `subnet-evm-${TAG}-1.x86_64.rpm` + * `subnet-evm-${TAG}-1.aarch64.rpm` + * `avalanchego_${TAG}_amd64.deb` (jammy, noble) + * `avalanchego_${TAG}_arm64.deb` (jammy, noble) + * `subnet-evm_${TAG}_amd64.deb` (jammy, noble) + * `subnet-evm_${TAG}_arm64.deb` (jammy, noble) + +### 4.5 Post-release verification + +* Verify asset count and names match expected list +* Spot-check download URLs (HTTP 302) +* Verify `isLatest` flag +* [Optional] Post notification (Slack, etc.) 
+ +--- + +## Phase 5: Observability + +> Issue: [#5163](https://github.com/ava-labs/avalanchego/issues/5163) + +### 5.1 Replace Datadog links with Grafana + +* Identify all Datadog references in release testing docs +* Replace with corresponding Grafana dashboard links + * `grafana.internal/d/api-latency` (oncall latency dashboard) + * Other dashboards TBD + +### 5.2 Release workflow monitoring + +* Add workflow status badges to release documentation +* [Optional] Add Slack notifications for workflow failures +* [Optional] Add Grafana annotations for release events + +--- + +## Phase 6: Validation & Rollout + +### 6.1 End-to-end dry run + +* Test the full pipeline against a pre-release tag (e.g., `v0.0.0-test`) + * Verify all artifacts are built, signed, and uploaded + * Verify GitHub release is created with correct assets + * Verify DEB/RPM packages install correctly from S3 + * Verify macOS binary passes Gatekeeper checks + +### 6.2 Migration from manual process + +* Document the new automated process + * Update the [Rollout Runbook](review-notes/AP-Runbooks-020426-224331-6-.md) + * Remove manual steps that are now automated + * Keep manual fallback instructions for emergencies +* Deprecate `create-github-release.sh` after first successful automated release +* Retire internal repo build triggers where superseded by public repo workflows + +### 6.3 Security review + +* Verify no signing keys are exposed in workflow logs +* Verify OIDC role trust policies are scoped to correct repos/branches +* Verify GPG public keys are distributed for verification + +--- + +## Phase 7: KMS-Backed Signing (Hardening) + +> Issue: [#5193](https://github.com/ava-labs/avalanchego/issues/5193) +> +> **Prerequisite:** Phases 1–4 landed and proven with secrets-based signing. 
+ +### 7.1 Finalize KMS signing infrastructure + +* Finalize KMS signing POC (PR [#5136](https://github.com/ava-labs/avalanchego/pull/5136)) + * AWS OIDC → KMS key access from CI runners + * Single signing identity for DEB, RPM, and tarball `.sig` + * Eliminate GPG private key from GitHub secrets + +### 7.2 Migrate pipelines to KMS signer + +* Update DEB pipeline to use KMS signer +* Update RPM pipeline to use KMS signer +* Update tarball pipeline to use KMS signer +* Verify all artifacts are signed with KMS key +* Update public GPG key distribution + +### 7.3 KMS security review + +* Verify KMS key policy follows least-privilege +* Verify OIDC trust is scoped to release workflows only +* Verify key rotation plan is documented +* Remove deprecated GPG repo secrets + +--- + +## Dependency Graph + +``` +Phase 1 (Signing) ──┬──→ Phase 3 (S3 Distribution) ──→ Phase 4 (GH Release) +Phase 2 (macOS) ───┘ │ + ↓ +Phase 5 (Observability) ←── independent ──→ Phase 6 (Validation) + │ + ↓ + Phase 7 (KMS Hardening) +``` + +## Issue Mapping + +| Phase | Issues | +|---|---| +| 1.1 DEB signing | #5193, #5160 | +| 1.2 RPM signing | #5193 | +| 1.3 Tarball signing | #5160 | +| 2.x macOS notarization | #5161 | +| 3.1 DEB S3 upload | #5158 | +| 3.2 RPM S3 upload | #5159 | +| 4.x GitHub release | #5162 | +| 5.x Observability | #5163 | +| 6.x Validation | #5157 (epic) | +| 7.x KMS signing | #5193 | diff --git a/CLAUDE.md b/CLAUDE.md new file mode 120000 index 000000000000..95e38a6a9ddf --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1 @@ +agents/AGENTS.md \ No newline at end of file diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 252c02246849..be42d3adafad 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -236,6 +236,36 @@ Common ways to run `task` in this repo include: - If your PR isn't ready to be reviewed just yet, you can open it as a draft to collect early feedback on your changes. 
- Once the PR is ready for review, mark it as ready-for-review and request review from one of the maintainers. +### PR review briefs + +PR review briefs are optional, but encouraged for PRs that would benefit from a reviewer-facing +companion beyond the diff and PR description. + +A good review brief helps reviewers quickly understand the intent, scope, validation, and review +focus of a PR without reconstructing that context from the diff alone. + +If you want to use one: + +1. Read [`.review-briefs/README.md`](./.review-briefs/README.md) for the full convention, + suggested structure, and examples. +1. Create one file per PR in [`.review-briefs/`](./.review-briefs/). +1. Give it a stable, descriptive name such as `<issue-id>-short-title.md` or + `<short-title>.md`. +1. Keep the content focused on helping reviewers understand and evaluate the PR. +1. Mention in the PR description that the PR includes a review brief in `.review-briefs/`. +1. Update the brief as the PR evolves so it stays aligned with the code under review. + +If you want an agent to generate a first draft, a useful prompt is: + +``` +Read .review-briefs/README.md, examine the diff for this change, and generate a PR +review brief in .review-briefs/ that explains the overview, scope, validation, and +review focus for this PR. Keep it aligned with the current code rather than inventing +rationale that is not supported by the diff. Then iteratively review the review brief +with the code for consistency and completeness - in subagents as appropriate to +minimize context pollution - until satisfied. +``` + +### Autogenerated code - Any changes to protobuf message types require that protobuf files are regenerated. 
diff --git a/Taskfile.yml b/Taskfile.yml index 58d0b6b3450d..1e364e30a252 100644 --- a/Taskfile.yml +++ b/Taskfile.yml @@ -501,6 +501,12 @@ tasks: - task: generate-load-contract-bindings - cmd: '{{.NIX_RUN}} bash -x ./scripts/tests.load.kube.kind.sh {{.CLI_ARGS}}' + test-msync-e2e: + desc: Runs standalone merkle sync bootstrap validation + cmds: + - task: build + - cmd: '{{.NIX_RUN}} go run ./tests/msync/main --avalanchego-path=./build/avalanchego {{.CLI_ARGS}}' + test-robustness: desc: Deploys kind with chaos mesh. Intended to eventually run a robustness (fault-injection) test suite. cmds: diff --git a/agents/AGENTS.md b/agents/AGENTS.md new file mode 100644 index 000000000000..d487ae8c6586 --- /dev/null +++ b/agents/AGENTS.md @@ -0,0 +1,75 @@ +# avalanchego repo + +## Worktree Locations + +- **Primary development**: `~/src/avalanchego` +- **Gerrit PR review**: `~/src/avalanchego_gerrit-review` + +The gerrit-review worktree is dedicated to importing GitHub PRs into Gerrit for collaborative review. Do NOT use `~/src/avalanchego-beads` - that's a separate beads-related repo. + +### Remote Configuration + +Both worktrees share the same remotes: +- `origin` → Gerrit (`ssh://pika-svc-gerrit/avalanchego`) +- `upstream` → GitHub (`git@github.com-work:ava-labs/avalanchego`) + +### GitHub Access + +GitHub git operations against `upstream` use SSH and require YubiKey interaction. +For PR metadata and branch checkout in review flows, prefer `gh` over raw git commands +when possible, because `gh` can use `GH_TOKEN` without needing the YubiKey. 
+ +Practical guidance: +- Use `gh pr view`, `gh pr diff`, and `gh pr checkout` first for GitHub PR review work +- Avoid `git fetch upstream ...` unless SSH/YubiKey interaction is explicitly intended +- Keep using `origin` for Gerrit operations per the normal workflow + +## Gerrit PR Review Workflow + +To import a GitHub PR for review in the gerrit-review worktree: + +```bash +cd ~/src/avalanchego_gerrit-review +gerrit-review sync-pr <PR-number> +``` + +This fetches the PR, squashes commits into one, and pushes to Gerrit for review. + +## Development Workflow + +The `Taskfile.yml` at the repo root provides entrypoints to common functionality. Run `task` to see available tasks. + +### After Making Changes + +Always run linting before committing: + +```bash +task lint-all +``` + +This executes golangci-lint, shellcheck, and actionlint in parallel to catch issues early. + +### Error Assertion Guidance + +In avalanchego, forbidigo rules banning `require.Error` and `require.ErrorContains` +in favor of `ErrorIs` are intentional. Do not work around those lint failures by +weakening assertions or switching to manual string checks when the code under test +controls the error being returned. + +When a test wants `ErrorIs` but the implementation only returns a formatted string +for a domain condition we control, treat that lint failure as a prompt to improve +the error API. Introduce or wrap a stable sentinel or typed error so the test can +assert the behavior directly. + +Only fall back to string-based error inspection when the failure comes from a +genuinely opaque external interface that cannot reasonably be normalized into a +stable local error value. 
+
+### Key Tasks
+
+- `task build` - Build avalanchego
+- `task test-unit` - Run unit tests
+- `task test-unit-fast` - Run unit tests without race detection (faster)
+- `task test-e2e` - Run e2e tests
+- `task lint` - Run golangci-lint only
+- `task lint-all` - Run all linters (golangci-lint, shellcheck, actionlint)
diff --git a/plans/msync-e2e.md b/plans/msync-e2e.md
new file mode 100644
index 000000000000..d2903ad2f714
--- /dev/null
+++ b/plans/msync-e2e.md
@@ -0,0 +1,258 @@
+# Merkle Sync end-to-end validation plan
+
+## Goal
+
+Create a standalone test job that:
+1. starts an isolated tmpnet network,
+2. generates enough chain/state data to meaningfully exercise merkle sync bootstrap,
+3. starts a fresh node against that populated network, and
+4. validates that the new node can bootstrap successfully.
+
+This should **not** be added to the shared `tests/e2e` suite. It should instead follow the pattern of other standalone test executables such as `tests/load/main`, with its own task and CI job.
+
+## Constraints and existing building blocks
+
+- `tests/fixture/tmpnet` already provides the temporary network lifecycle we need.
+- `tests/fixture/e2e` already contains reusable startup/bootstrap logic, especially:
+  - private network startup patterns
+  - `CheckBootstrapIsPossible(...)` for adding a fresh node and validating bootstrap
+- `tests/load` already shows how to generate C-Chain traffic at sustained volume.
+- Initial work should focus on **generating data from scratch**, even if slow.
+- A later phase can optimize by restoring from a published archive containing:
+  - node database/state
+  - network configuration
+  - genesis and any other required fixture inputs
+
+## Phase 1: generate-from-scratch standalone test
+
+### Desired outcome
+A standalone package/executable that:
+- provisions a tmpnet network,
+- waits for it to become healthy,
+- generates a target amount of C-Chain data,
+- verifies a new node can bootstrap from that state.
+ +### Implementation outline +1. **Create a new standalone test package** + - Add a new package for merkle sync validation, separate from `tests/e2e`. + - Add a `main` entrypoint, similar in style to `tests/load/main`. + +2. **Start an isolated tmpnet network** + - Reuse the tmpnet/e2e startup path rather than inventing a new network bootstrap flow. + - Keep the network isolated from other tests and responsible for its own lifecycle. + +3. **Generate load** + - Reuse the existing load-test approach where practical. + - Prefer simple, high-volume C-Chain traffic rather than introducing new custom transaction patterns unless necessary. + - Keep the generation logic parameterized so the target can be adjusted easily. + +4. **Define the initial threshold** + - Do not hardcode a guessed transaction count as the definition of "10 MB worth of blocks". + - Instead, make the initial implementation stop based on an observable target, such as: + - on-disk data size growth for the node/network, or + - a configurable transaction/block target used only as an approximation while measuring the resulting data footprint. + - Emit enough logging/metrics to understand how much data was produced and how long it took. + +5. **Validate bootstrap** + - Reuse the existing bootstrap helper to add a new node and verify it becomes healthy. + - Confirm the existing validator nodes remain healthy afterward. + +6. **Add task wiring** + - Add a dedicated task to build/run this standalone test. + - Ensure it can be invoked locally with the same tmpnet-style flags used by existing standalone tests. + +7. **Add CI wiring** + - Add a dedicated CI job that invokes the new task. + - Keep this job isolated from normal e2e execution due to variable runtime. + +## Phase 2: fixture/archive-based bootstrap validation + +### Desired outcome +Reduce runtime by separating "state generation" from "bootstrap validation". + +### Implementation outline +1. 
Define the archive contents needed to recreate the populated network state. +2. Add tooling or documentation for producing that archive from Phase 1. +3. Add a mode that restores the archived state instead of generating it from scratch. +4. Reuse the same bootstrap-validation step against restored state. +5. Keep the generate-from-scratch mode available for refreshing the fixture. + +## Research findings: what actually exercises MerkleSync + +### Critical distinction: Firewood is required + +A standalone bootstrap test only exercises the MerkleDB/Firewood sync path when C-Chain runs with: +- `state-scheme = firewood` + +Otherwise bootstrap exercises the older EVM state sync implementation rather than the Merkle sync path under: +- `database/merkle/sync/*` +- `database/merkle/firewood/syncer/*` +- `graft/evm/sync/evmstate/firewood_syncer.go` + +Key runtime-selection reference: +- `graft/evm/sync/engine/client.go` + +### Bootstrap success alone is a false-positive signal + +Even with Firewood enabled, a node may still skip state sync and succeed via ordinary bootstrap. + +In `graft/evm/sync/engine/client.go`, state sync is skipped when the last accepted height is too close to the summary height relative to `state-sync-min-blocks`. + +Implication: +A meaningful MerkleSync e2e must require evidence that: +1. Firewood was enabled, +2. state sync was selected rather than skipped, and +3. the Firewood/Merkle syncer actually ran. + +### Which sync behaviors matter for a fresh-node bootstrap? + +For a fresh bootstrap node, the important path is primarily: +- state summary selection, +- range-proof-based Merkle sync, +- code extraction/fetch from proofs, +- recent block backfill, +- accept synced block and continue bootstrap. + +For this scenario, change-proof coverage is **not** the primary target. +The Firewood adapter reports insufficient history for change proofs and the generic server falls back to range proofs. 
+ +### Block backfill matters too + +The sync registry includes a block syncer, and recent block backfill uses a window of 256 blocks. + +Implication: +A meaningful test should drive the target height above 256 so the backfill path is exercised, not just trie sync. + +### Code sync must be exercised + +The Firewood EVM state syncer extracts code hashes from committed range proofs and enqueues them into the code syncer. + +Implication: +A workload containing only EOAs / plain transfers is insufficient. +The final synced state must include contract accounts with bytecode. + +## Minimal workload composition for a meaningful MerkleSync test + +### Required configuration preconditions +1. **Enable Firewood** + - Use `state-scheme = firewood` +2. **Force state sync to actually run** + - Lower `state-sync-min-blocks` enough for a tmpnet-scale test +3. **Make summaries available quickly** + - Use a practical local `state-sync-commit-interval` + +### Required state/history composition +1. **Deploy at least 2 distinct contracts** + Example candidates: + - `tests/load/contracts/TrieStressTest.sol` + - `tests/load/contracts/LoadSimulator.sol` + + Purpose: + - exercise contract-code sync + - ensure more than one contract/account leaf exists + - avoid a single-contract-only trie shape + +2. **Create many new storage slots** + Example candidates: + - repeated `TrieStressTest.writeValues(...)` + - repeated `LoadSimulator.write(...)` + + Purpose: + - make trie sync nontrivial + - force multiple range-proof rounds rather than a tiny one-shot sync + +3. **Modify existing storage slots** + Example candidate: + - `LoadSimulator.modify(...)` + + Purpose: + - avoid a purely append-only final trie shape + - ensure final state is not degenerate + +4. **Include some plain EOA transfers** + Purpose: + - ensure account trie is not purely contract-centric + +5. 
**Pad block height above both thresholds** + - above the lowered `state-sync-min-blocks` + - above 256 blocks + + Purpose: + - guarantee real state sync is chosen + - guarantee recent-block backfill is exercised + +## Better success criteria than "10 MB and bootstrap succeeded" + +`10 MB` is only a weak proxy. It mixes block storage, trie state, code storage, indices, snapshots, and other artifacts. Different workloads can produce similar disk growth while exercising very different sync behavior. + +A stronger MerkleSync test should require evidence of most or all of the following: + +1. **Firewood path selected** + - Firewood config enabled + - logs/metrics indicate the Firewood sync path ran + +2. **State sync not skipped** + - no "too close, skipping state sync" path + - positive evidence state sync started + +3. **Merkle/range-proof sync actually iterated** + - more than one proof request / commit round + +4. **Code sync exercised** + - at least one code hash fetched + - preferably more than one distinct bytecode present + +5. **Block backfill exercised** + - target height > 256 + +6. **Post-bootstrap state correctness checks** + Validate via RPC, not only node health: + - deployed contracts exist + - selected storage values match expectations + - balances / token balances match expectations + +## Current implementation status + +A standalone harness now exists at: +- `tests/msync/main/main.go` + +Current state: +- isolated tmpnet startup works +- bootstrap helper reuse works +- write-heavy state generation works +- on-disk growth measurement (`db/` + `chainData/`) works + +Current limitation: +- the harness currently uses disk growth as the stopping condition and does **not yet** guarantee that the Firewood Merkle sync path, code sync, and block backfill paths are all exercised. + +## Open questions to resolve during implementation + +1. 
What exact local chain config values should be used for: + - `state-scheme = firewood` + - `state-sync-min-blocks` + - `state-sync-commit-interval` + +2. What is the best observable evidence in logs/metrics that confirms: + - state sync started, + - Firewood syncer ran, + - range-proof sync performed multiple rounds, + - code sync executed, + - 256-block backfill executed? + +3. How much of the recommended workload composition should be encoded into the first generate-from-scratch version versus deferred to an archived-state fixture? + +## Simplest high-value next step + +Refactor the standalone harness so that it validates MerkleSync-specific preconditions before worrying about data volume: +1. run C-Chain with Firewood, +2. lower state-sync thresholds so state sync is chosen, +3. ensure summaries are produced quickly, +4. generate a mixed workload containing: + - at least two contracts, + - insert-heavy writes, + - modify-heavy writes, + - some EOA transfers, +5. drive height beyond 256 blocks, +6. add checks that prove state sync was used and post-bootstrap state is correct. + +That will convert the current bootstrap smoke test into a meaningful MerkleSync validation. 
diff --git a/tests/msync/main/BUILD.bazel b/tests/msync/main/BUILD.bazel new file mode 100644 index 000000000000..34e6662cbf7e --- /dev/null +++ b/tests/msync/main/BUILD.bazel @@ -0,0 +1,26 @@ +load("@io_bazel_rules_go//go:def.bzl", "go_binary", "go_library") + +go_library( + name = "main_lib", + srcs = ["main.go"], + importpath = "github.com/ava-labs/avalanchego/tests/msync/main", + visibility = ["//visibility:private"], + deps = [ + "//tests", + "//tests/fixture/e2e", + "//tests/fixture/tmpnet", + "//tests/load/contracts", + "//utils/crypto/secp256k1", + "@com_github_ava_labs_libevm//accounts/abi/bind", + "@com_github_ava_labs_libevm//core/types", + "@com_github_ava_labs_libevm//ethclient", + "@com_github_stretchr_testify//require", + "@org_uber_go_zap//:zap", + ], +) + +go_binary( + name = "main", + embed = [":main_lib"], + visibility = ["//visibility:public"], +) diff --git a/tests/msync/main/main.go b/tests/msync/main/main.go new file mode 100644 index 000000000000..2e756ce6f411 --- /dev/null +++ b/tests/msync/main/main.go @@ -0,0 +1,675 @@ +// Copyright (C) 2019, Ava Labs, Inc. All rights reserved. +// See the file LICENSE for licensing terms. 
// Command msync-e2e is a standalone harness that validates Firewood/Merkle
// sync bootstrap: it starts an isolated tmpnet network, generates a mixed
// C-Chain workload (contract deploys, storage writes/modifies, EOA transfers,
// and an atomic export), then adds a fresh node and checks both that it
// bootstraps and that the Merkle sync path was actually exercised.
package main

import (
	"context"
	"encoding/base64"
	"encoding/json"
	"flag"
	"io/fs"
	"maps"
	"math/big"
	"os"
	"path/filepath"
	"strings"
	"time"

	"github.com/ava-labs/libevm/accounts/abi/bind"
	"github.com/ava-labs/libevm/common"
	"github.com/ava-labs/libevm/core/types"
	"github.com/ava-labs/libevm/crypto"
	"github.com/ava-labs/libevm/ethclient"
	"github.com/prometheus/client_golang/prometheus"
	"github.com/stretchr/testify/require"
	"go.uber.org/zap"

	"github.com/ava-labs/avalanchego/chains"
	"github.com/ava-labs/avalanchego/config"
	"github.com/ava-labs/avalanchego/ids"
	"github.com/ava-labs/avalanchego/tests"
	"github.com/ava-labs/avalanchego/tests/fixture/e2e"
	"github.com/ava-labs/avalanchego/tests/fixture/tmpnet"
	"github.com/ava-labs/avalanchego/tests/load/contracts"
	"github.com/ava-labs/avalanchego/utils/crypto/secp256k1"
	"github.com/ava-labs/avalanchego/utils/units"
	"github.com/ava-labs/avalanchego/vms/secp256k1fx"
)

const (
	// blockchainID is the chain alias under test (the C-Chain).
	blockchainID = "C"
	// defaultTargetBytes is the default on-disk growth (db/ + chainData/)
	// required before bootstrap validation begins.
	defaultTargetBytes int64 = 10 * 1024 * 1024
	// defaultMinBootstrapHeight keeps the chain above 256 blocks so the
	// recent-block backfill window is exercised (see plans/msync-e2e.md).
	defaultMinBootstrapHeight uint64 = 300
	// defaultBatchSize is the number of mixed-workload iterations issued
	// between progress measurements.
	defaultBatchSize = 5
	// defaultWritesPerTx is the trie writes performed per TrieStressTest tx.
	defaultWritesPerTx int64 = 250
	// defaultLoadWriteSlots / defaultLoadModifySlots size the LoadSimulator
	// insert and modify transactions respectively.
	defaultLoadWriteSlots  int64 = 64
	defaultLoadModifySlots int64 = 32
	// defaultStateSyncMinBlocks lowers the state-sync threshold so the
	// bootstrap node chooses state sync instead of skipping it.
	defaultStateSyncMinBlocks uint64 = 32
	// defaultStateSyncCommitInterval controls how often sync summaries are
	// produced; kept small so summaries appear quickly at tmpnet scale.
	defaultStateSyncCommitInterval uint64 = 16
	// defaultStateHistory is the state-history depth configured on all nodes.
	defaultStateHistory uint64 = 128
	// defaultPollingDelay is the pause between workload batches before
	// re-measuring disk growth and chain height.
	defaultPollingDelay = 2 * time.Second
)

var (
	// Fee parameters and transfer amount shared by every issued transaction.
	defaultGasFeeCap   = big.NewInt(300_000_000_000)
	defaultGasTipCap   = big.NewInt(1_000_000_000)
	defaultTransferWei = big.NewInt(1)

	flagVars *e2e.FlagVars

	// Flag-backed tunables; registered and parsed in init().
	targetBytes             int64
	minBootstrapHeight      uint64
	batchSize               int
	writesPerTx             int64
	loadWriteSlots          int64
	loadModifySlots         int64
	stateSyncMinBlocks      uint64
	stateSyncCommitInterval uint64
)

// deployedContracts bundles the two workload contracts and their addresses so
// later validation can check their code and storage after bootstrap.
type deployedContracts struct {
	trieAddress common.Address
	trie        *contracts.TrieStressTest
	loadAddress common.Address
	load        *contracts.LoadSimulator
}

// workloadSnapshot records the expected post-workload state (storage values,
// slot indices, and recipient balance) used to verify state correctness on the
// freshly bootstrapped node via RPC.
type workloadSnapshot struct {
	trieArrayLength      *big.Int
	latestWriteValue     *big.Int
	latestModifyValue    *big.Int
	// latestEmptySlot / latestUnmodifiedSlot are derived as 2+totalLoadWrites
	// and 2+totalLoadWrites-1; presumably this matches LoadSimulator's slot
	// layout — TODO confirm against the contract source.
	latestEmptySlot      *big.Int
	latestUnmodifiedSlot *big.Int
	transferRecipient    common.Address
	transferBalance      *big.Int
}

// init registers the e2e fixture flags plus the harness-specific tunables and
// parses the command line.
func init() {
	flagVars = e2e.RegisterFlags(
		e2e.WithDefaultOwner("avalanchego-msync-e2e"),
	)

	flag.Int64Var(
		&targetBytes,
		"target-bytes",
		defaultTargetBytes,
		"target growth in bytes for the measured node data before validating bootstrap; set to 0 to disable the size threshold",
	)
	flag.Uint64Var(
		&minBootstrapHeight,
		"min-bootstrap-height",
		defaultMinBootstrapHeight,
		"minimum C-Chain height required before validating bootstrap to ensure recent block backfill is exercised",
	)
	flag.IntVar(
		&batchSize,
		"batch-size",
		defaultBatchSize,
		"number of mixed-workload iterations to issue per measurement batch",
	)
	flag.Int64Var(
		&writesPerTx,
		"writes-per-tx",
		defaultWritesPerTx,
		"number of trie writes to perform per TrieStressTest transaction",
	)
	flag.Int64Var(
		&loadWriteSlots,
		"load-write-slots",
		defaultLoadWriteSlots,
		"number of LoadSimulator slots to populate per write transaction",
	)
	flag.Int64Var(
		&loadModifySlots,
		"load-modify-slots",
		defaultLoadModifySlots,
		"number of LoadSimulator slots to modify per modify transaction",
	)
	flag.Uint64Var(
		&stateSyncMinBlocks,
		"state-sync-min-blocks",
		defaultStateSyncMinBlocks,
		"minimum number of blocks ahead required for the bootstrap node to choose state sync",
	)
	flag.Uint64Var(
		&stateSyncCommitInterval,
		"state-sync-commit-interval",
		defaultStateSyncCommitInterval,
		"state sync summary interval to use for validator nodes and the bootstrap node",
	)

	flag.Parse()
}

// main drives the full scenario: validate flags, start the network, record
// baseline disk usage and balances, deploy contracts, generate the workload,
// then bootstrap a fresh node and validate state and sync evidence.
func main() {
	log := tests.NewDefaultLogger("msync-e2e")
	tc := tests.NewTestContext(log)
	defer tc.RecoverAndExit()

	require := require.New(tc)
	// Fail fast on nonsensical flag values before any network is started.
	require.GreaterOrEqual(targetBytes, int64(0), "target-bytes must be non-negative")
	require.Positive(minBootstrapHeight, "min-bootstrap-height must be positive")
	require.Positive(batchSize, "batch-size must be positive")
	require.Positive(writesPerTx, "writes-per-tx must be positive")
	require.Positive(loadWriteSlots, "load-write-slots must be positive")
	require.Positive(loadModifySlots, "load-modify-slots must be positive")
	require.Positive(stateSyncMinBlocks, "state-sync-min-blocks must be positive")
	require.Positive(stateSyncCommitInterval, "state-sync-commit-interval must be positive")

	network := tmpnet.NewDefaultNetwork("avalanchego-msync-e2e")
	network.PrimaryChainConfigs = newMerkleSyncPrimaryChainConfigs()
	env := e2e.NewTestEnvironment(tc, flagVars, network)
	// The environment may return a (possibly pre-existing) network instance;
	// use that one from here on.
	network = env.GetNetwork()

	// Disk growth is measured on the first validator's db/ and chainData/
	// directories only, as a proxy for overall state size.
	validator := network.Nodes[0]
	pathsToMeasure := []string{
		filepath.Join(validator.DataDir, "db"),
		filepath.Join(validator.DataDir, "chainData"),
	}
	initialSize, err := totalSize(pathsToMeasure...)
	require.NoError(err)

	client := newWSClient(tc, network.Nodes)
	chainID, err := client.ChainID(tc.DefaultContext())
	require.NoError(err)

	// Key[0] funds all workload transactions; key[1] only receives transfers
	// so its balance delta is exactly the sum of issued transfers.
	fundingKey := network.PreFundedKeys[0]
	transferRecipientKey := network.PreFundedKeys[1]
	transferRecipient := crypto.PubkeyToAddress(transferRecipientKey.ToECDSA().PublicKey)
	initialRecipientBalance, err := client.BalanceAt(tc.DefaultContext(), transferRecipient, nil)
	require.NoError(err)

	contracts := deployContracts(tc, client, chainID, fundingKey)
	issueAtomicExportTx(tc, network, fundingKey)
	snapshot := generateWorkload(tc, client, chainID, fundingKey, transferRecipient, contracts, pathsToMeasure, initialSize, initialRecipientBalance)

	bootstrapNode := checkMerkleSyncBootstrap(tc, network)
	if bootstrapNode != nil {
		bootstrapClient := newWSClient(tc, []*tmpnet.Node{bootstrapNode})
		validatePostBootstrapState(tc, bootstrapClient, snapshot, contracts)
		validateMerkleSyncEvidence(tc, network, bootstrapNode)

		// SimpleTestContext cleanup runs in registration order rather than LIFO,
		// so the network-level cleanup may stop this ephemeral node before the
		// bootstrap helper's cleanup runs. Clearing the URI avoids a best-effort
		// metrics snapshot against an already-stopped node during cleanup.
		bootstrapNode.URI = ""
	}
}

// newMerkleSyncPrimaryChainConfigs returns the default primary-chain configs
// with the C-Chain overridden to run Firewood with frequent sync summaries.
// Note state-sync-enabled is false here: validators only serve sync data; the
// ephemeral bootstrap node enables it via newBootstrapChainConfigContent.
func newMerkleSyncPrimaryChainConfigs() map[string]tmpnet.ConfigMap {
	primaryChainConfigs := tmpnet.DefaultChainConfigs()
	if _, ok := primaryChainConfigs[blockchainID]; !ok {
		primaryChainConfigs[blockchainID] = make(tmpnet.ConfigMap)
	}

	maps.Copy(primaryChainConfigs[blockchainID], tmpnet.ConfigMap{
		"state-scheme":               "firewood",
		"snapshot-cache":             0,
		"populate-missing-tries":     nil,
		"pruning-enabled":            true,
		"state-sync-enabled":         false,
		"state-sync-commit-interval": stateSyncCommitInterval,
		"commit-interval":            stateSyncCommitInterval,
		"state-history":              defaultStateHistory,
	})
	return primaryChainConfigs
}

// newWSClient dials a websocket ethclient against the first node that exposes
// a websocket URI. If none is reported (e.g. for a single freshly added
// node), the URI is derived from the node's accessible HTTP URI by swapping
// the scheme and appending the C-Chain websocket path.
func newWSClient(tc tests.TestContext, nodes []*tmpnet.Node) *ethclient.Client {
	require := require.New(tc)
	wsURIs, err := tmpnet.GetNodeWebsocketURIs(nodes, blockchainID)
	require.NoError(err)
	if len(wsURIs) == 0 {
		// The fallback only makes sense for a single node; more than one
		// without websocket URIs is unexpected.
		require.Len(nodes, 1)
		uri := strings.Replace(nodes[0].GetAccessibleURI(), "http://", "ws://", 1)
		uri = strings.Replace(uri, "https://", "wss://", 1)
		wsURIs = []string{uri + "/ext/bc/" + blockchainID + "/ws"}
	}

	client, err := ethclient.Dial(wsURIs[0])
	require.NoError(err)
	return client
}

// deployContracts deploys the two workload contracts (TrieStressTest and
// LoadSimulator), waits for each deployment to be mined, and returns their
// handles and addresses. Two distinct contracts ensure more than one code
// blob exists for code sync to fetch.
func deployContracts(
	tc tests.TestContext,
	client *ethclient.Client,
	chainID *big.Int,
	fundingKey *secp256k1.PrivateKey,
) deployedContracts {
	require := require.New(tc)
	txOpts, err := newTxOpts(tc, chainID, fundingKey)
	require.NoError(err)

	trieAddress, trieTx, trieContract, err := contracts.DeployTrieStressTest(txOpts, client)
	require.NoError(err)
	_, err = bind.WaitDeployed(tc.DefaultContext(), client, trieTx)
	require.NoError(err)

	// Fresh txOpts for the second deployment to pick up the next nonce.
	txOpts, err = newTxOpts(tc, chainID, fundingKey)
	require.NoError(err)
	loadAddress, loadTx, loadContract, err := contracts.DeployLoadSimulator(txOpts, client)
	require.NoError(err)
	_, err = bind.WaitDeployed(tc.DefaultContext(), client, loadTx)
	require.NoError(err)

	tc.Log().Info("deployed contracts for merkle sync workload",
		zap.Stringer("trieStressAddress", trieAddress),
		zap.Stringer("trieStressTxID", trieTx.Hash()),
		zap.Stringer("loadSimulatorAddress", loadAddress),
		zap.Stringer("loadSimulatorTxID", loadTx.Hash()),
	)

	return deployedContracts{
		trieAddress: trieAddress,
		trie:        trieContract,
		loadAddress: loadAddress,
		load:        loadContract,
	}
}

// generateWorkload issues batches of mixed transactions (trie writes,
// LoadSimulator writes and modifies, and EOA transfers) until both the height
// and disk-growth targets are reached, then returns a snapshot of the
// expected final state for post-bootstrap verification.
func generateWorkload(
	tc tests.TestContext,
	client *ethclient.Client,
	chainID *big.Int,
	fundingKey *secp256k1.PrivateKey,
	transferRecipient common.Address,
	contracts deployedContracts,
	pathsToMeasure []string,
	initialSize int64,
	initialRecipientBalance *big.Int,
) workloadSnapshot {
	require := require.New(tc)

	var (
		totalMixedIterations int
		totalTrieWrites      int64
		totalLoadWrites      int64
		latestWriteValue     = big.NewInt(0)
		latestModifyValue    = big.NewInt(0)
		transferTotal        = new(big.Int)
		lastBlockNumber      uint64
	)

	tc.By("generating mixed Firewood-backed workload until bootstrap thresholds are reached", func() {
		for {
			// Check targets before issuing the next batch so we stop as soon
			// as both thresholds are satisfied.
			currentSize, err := totalSize(pathsToMeasure...)
			require.NoError(err)
			delta := currentSize - initialSize

			headBlock, err := client.BlockNumber(tc.DefaultContext())
			require.NoError(err)

			heightReady := headBlock >= minBootstrapHeight
			// targetBytes == 0 disables the size threshold entirely.
			sizeReady := targetBytes == 0 || delta >= targetBytes
			if heightReady && sizeReady {
				tc.Log().Info("reached merkle sync workload targets",
					zap.Uint64("targetHeight", minBootstrapHeight),
					zap.Uint64("headBlock", headBlock),
					zap.Int64("targetBytes", targetBytes),
					zap.Int64("initialBytes", initialSize),
					zap.Int64("currentBytes", currentSize),
					zap.Int64("deltaBytes", delta),
					zap.Int("totalMixedIterations", totalMixedIterations),
					zap.Int64("totalTrieWrites", totalTrieWrites),
					zap.Int64("totalLoadWrites", totalLoadWrites),
				)
				break
			}

			for range batchSize {
				// Per-iteration values are distinct so the final storage
				// contents are predictable for the snapshot.
				iteration := totalMixedIterations + 1
				latestWriteValue = big.NewInt(int64(iteration))
				latestModifyValue = big.NewInt(int64(1_000_000 + iteration))

				// Insert-heavy trie growth.
				lastBlockNumber = issueContractTx(tc, client, func(txOpts *bind.TransactOpts) (*types.Transaction, error) {
					return contracts.trie.WriteValues(txOpts, big.NewInt(writesPerTx))
				}, chainID, fundingKey)
				totalTrieWrites += writesPerTx

				// New LoadSimulator storage slots.
				lastBlockNumber = issueContractTx(tc, client, func(txOpts *bind.TransactOpts) (*types.Transaction, error) {
					return contracts.load.Write(txOpts, big.NewInt(loadWriteSlots), latestWriteValue)
				}, chainID, fundingKey)
				totalLoadWrites += loadWriteSlots

				// Modify existing slots once enough have been written, so the
				// final trie is not purely append-only.
				if totalLoadWrites >= loadModifySlots {
					lastBlockNumber = issueContractTx(tc, client, func(txOpts *bind.TransactOpts) (*types.Transaction, error) {
						return contracts.load.Modify(txOpts, big.NewInt(loadModifySlots), latestModifyValue)
					}, chainID, fundingKey)
				}

				// Plain EOA transfer keeps the account trie non-contract-only.
				lastBlockNumber = issueTransfer(tc, client, chainID, fundingKey, transferRecipient, defaultTransferWei)
				transferTotal.Add(transferTotal, defaultTransferWei)
				totalMixedIterations++
			}

			// Give the node time to flush before re-measuring.
			time.Sleep(defaultPollingDelay)

			currentSize, err = totalSize(pathsToMeasure...)
			require.NoError(err)
			headBlock, err = client.BlockNumber(tc.DefaultContext())
			require.NoError(err)
			tc.Log().Info("measured merkle sync workload progress",
				zap.Uint64("targetHeight", minBootstrapHeight),
				zap.Uint64("headBlock", headBlock),
				zap.Int64("targetBytes", targetBytes),
				zap.Int64("initialBytes", initialSize),
				zap.Int64("currentBytes", currentSize),
				zap.Int64("deltaBytes", currentSize-initialSize),
				zap.Int("totalMixedIterations", totalMixedIterations),
				zap.Int64("totalTrieWrites", totalTrieWrites),
				zap.Int64("totalLoadWrites", totalLoadWrites),
				zap.Int64("loadModifySlots", loadModifySlots),
				zap.Uint64("lastBlockNumber", lastBlockNumber),
			)
		}
	})

	return workloadSnapshot{
		trieArrayLength:      big.NewInt(totalTrieWrites),
		latestWriteValue:     new(big.Int).Set(latestWriteValue),
		latestModifyValue:    new(big.Int).Set(latestModifyValue),
		latestEmptySlot:      big.NewInt(2 + totalLoadWrites),
		latestUnmodifiedSlot: big.NewInt(2 + totalLoadWrites - 1),
		transferRecipient:    transferRecipient,
		transferBalance:      new(big.Int).Add(initialRecipientBalance, transferTotal),
	}
}

// issueContractTx signs and issues one contract transaction produced by the
// issue callback, waits for it to be mined, asserts success, and returns the
// block number it was included in.
func issueContractTx(
	tc tests.TestContext,
	client *ethclient.Client,
	issue func(*bind.TransactOpts) (*types.Transaction, error),
	chainID *big.Int,
	fundingKey *secp256k1.PrivateKey,
) uint64 {
	require := require.New(tc)
	txOpts, err := newTxOpts(tc, chainID, fundingKey)
	require.NoError(err)

	tx, err := issue(txOpts)
	require.NoError(err)

	receipt, err := bind.WaitMined(tc.DefaultContext(), client, tx)
	require.NoError(err)
	require.Equal(types.ReceiptStatusSuccessful, receipt.Status)
	return receipt.BlockNumber.Uint64()
}

// issueTransfer signs and issues a plain EIP-1559 value transfer from
// fundingKey to the given address, waits for it to be mined, asserts success,
// and returns the inclusion block number.
func issueTransfer(
	tc tests.TestContext,
	client *ethclient.Client,
	chainID *big.Int,
	fundingKey *secp256k1.PrivateKey,
	to common.Address,
	amount *big.Int,
) uint64 {
	require := require.New(tc)
	from := crypto.PubkeyToAddress(fundingKey.ToECDSA().PublicKey)
	nonce, err := client.PendingNonceAt(tc.DefaultContext(), from)
	require.NoError(err)

	tx := types.NewTx(&types.DynamicFeeTx{
		ChainID:   chainID,
		Nonce:     nonce,
		To:        &to,
		Gas:       21_000, // standard gas cost of a plain value transfer
		GasFeeCap: new(big.Int).Set(defaultGasFeeCap),
		GasTipCap: new(big.Int).Set(defaultGasTipCap),
		Value:     new(big.Int).Set(amount),
	})
	signedTx, err := types.SignTx(tx, types.LatestSignerForChainID(chainID), fundingKey.ToECDSA())
	require.NoError(err)
	require.NoError(client.SendTransaction(tc.DefaultContext(), signedTx))

	receipt, err := bind.WaitMined(tc.DefaultContext(), client, signedTx)
	require.NoError(err)
	require.Equal(types.ReceiptStatusSuccessful, receipt.Status)
	return receipt.BlockNumber.Uint64()
}

// checkMerkleSyncBootstrap starts an ephemeral node configured for Firewood
// state sync, waits for it to report healthy, verifies the existing
// validators stay healthy, and returns the new node for further validation.
// The node is stopped via a deferred cleanup on the test context.
func checkMerkleSyncBootstrap(tc tests.TestContext, network *tmpnet.Network) *tmpnet.Node {
	require := require.New(tc)
	tc.By("checking if Firewood merkle sync bootstrap is possible with the current network state")

	// Track the same subnets the network defines so the new node mirrors the
	// validators' configuration.
	subnetIDs := make([]string, len(network.Subnets))
	for i, subnet := range network.Subnets {
		subnetIDs[i] = subnet.SubnetID.String()
	}
	flags := tmpnet.FlagsMap{
		config.TrackSubnetsKey: strings.Join(subnetIDs, ","),
	}

	chainConfigContent, err := newBootstrapChainConfigContent(network)
	require.NoError(err)
	flags[config.ChainConfigContentKey] = chainConfigContent

	node := tmpnet.NewEphemeralNode(flags)
	require.NoError(network.StartNode(tc.DefaultContext(), node))

	tc.DeferCleanup(func() {
		// Fresh context: the test context's default one may already be done
		// by the time cleanup runs.
		ctx, cancel := context.WithTimeout(context.Background(), e2e.DefaultTimeout)
		defer cancel()
		require.NoError(node.Stop(ctx))
	})

	require.NoError(node.WaitForHealthy(tc.DefaultContext()))

	// Bootstrapping a new node must not destabilize the existing validators.
	for _, validator := range network.Nodes {
		if validator.IsEphemeral {
			continue
		}
		healthy, err := validator.IsHealthy(tc.DefaultContext())
		require.NoError(err)
		require.True(healthy, "primary validator %s is not healthy", validator.NodeID)
	}

	return node
}

// newBootstrapChainConfigContent builds the base64-encoded chain-config
// payload for the ephemeral bootstrap node: the network's primary chain
// configs with the C-Chain overridden to enable Firewood state sync (unlike
// the validators, which serve but do not perform state sync).
func newBootstrapChainConfigContent(network *tmpnet.Network) (string, error) {
	chainConfigs := map[string]chains.ChainConfig{}
	for alias, flags := range network.PrimaryChainConfigs {
		// Clone so the network's shared config maps are never mutated.
		nodeFlags := maps.Clone(flags)
		if alias == blockchainID {
			maps.Copy(nodeFlags, tmpnet.ConfigMap{
				"state-scheme":               "firewood",
				"snapshot-cache":             0,
				"populate-missing-tries":     nil,
				"pruning-enabled":            true,
				"state-sync-enabled":         true,
				"state-sync-min-blocks":      stateSyncMinBlocks,
				"state-sync-commit-interval": stateSyncCommitInterval,
				"commit-interval":            stateSyncCommitInterval,
				"state-history":              defaultStateHistory,
			})
		}
		marshaledFlags, err := json.Marshal(nodeFlags)
		if err != nil {
			return "", err
		}
		chainConfigs[alias] = chains.ChainConfig{Config: marshaledFlags}
	}

	marshaledConfigs, err := json.Marshal(chainConfigs)
	if err != nil {
		return "", err
	}
	return base64.StdEncoding.EncodeToString(marshaledConfigs), nil
}

// validatePostBootstrapState checks, via RPC against the bootstrapped node,
// that contract code, selected storage slots, and the transfer recipient's
// balance all match the workload snapshot — i.e. the synced state is correct,
// not merely "healthy".
func validatePostBootstrapState(
	tc tests.TestContext,
	client *ethclient.Client,
	snapshot workloadSnapshot,
	contracts deployedContracts,
) {
	require := require.New(tc)
	ctx := tc.DefaultContext()

	// Both contract code blobs must have been fetched by code sync.
	trieCode, err := client.CodeAt(ctx, contracts.trieAddress, nil)
	require.NoError(err)
	require.NotEmpty(trieCode, "TrieStressTest code should exist after bootstrap")

	loadCode, err := client.CodeAt(ctx, contracts.loadAddress, nil)
	require.NoError(err)
	require.NotEmpty(loadCode, "LoadSimulator code should exist after bootstrap")

	// Slot 0 of TrieStressTest is read as the array length; slots 1/2 of
	// LoadSimulator as latestEmptySlot and the modified slot — assumes the
	// contracts' storage layout, TODO confirm against the Solidity sources.
	trieLength, err := client.StorageAt(ctx, contracts.trieAddress, storageSlotKey(0), nil)
	require.NoError(err)
	require.Zero(snapshot.trieArrayLength.Cmp(new(big.Int).SetBytes(trieLength)), "unexpected TrieStressTest array length")

	latestEmptySlot, err := client.StorageAt(ctx, contracts.loadAddress, storageSlotKey(1), nil)
	require.NoError(err)
	require.Zero(snapshot.latestEmptySlot.Cmp(new(big.Int).SetBytes(latestEmptySlot)), "unexpected latestEmptySlot value")

	modifiedSlot, err := client.StorageAt(ctx, contracts.loadAddress, storageSlotKey(2), nil)
	require.NoError(err)
	require.Zero(snapshot.latestModifyValue.Cmp(new(big.Int).SetBytes(modifiedSlot)), "unexpected modified storage value")

	latestWrittenSlot, err := client.StorageAt(ctx, contracts.loadAddress, storageSlotBig(snapshot.latestUnmodifiedSlot), nil)
	require.NoError(err)
	require.Zero(snapshot.latestWriteValue.Cmp(new(big.Int).SetBytes(latestWrittenSlot)), "unexpected latest written storage value")

	balance, err := client.BalanceAt(ctx, snapshot.transferRecipient, nil)
	require.NoError(err)
	require.Zero(snapshot.transferBalance.Cmp(balance), "unexpected recipient balance after bootstrap")
}

// issueAtomicExportTx issues a single C→X export so the atomic trie is
// non-empty when the bootstrap node syncs.
func issueAtomicExportTx(
	tc tests.TestContext,
	network *tmpnet.Network,
	senderKey *secp256k1.PrivateKey,
) {
	require := require.New(tc)
	nodeURIs := network.GetNodeURIs()
	require.NotEmpty(nodeURIs)

	recipientKey := e2e.NewPrivateKey(tc)
	keychain := secp256k1fx.NewKeychain(senderKey, recipientKey)
	wallet := e2e.NewWallet(tc, keychain, nodeURIs[0])
	xContext := wallet.X().Builder().Context()

	exportOutputs := []*secp256k1fx.TransferOutput{{
		Amt: units.Avax,
		OutputOwners: secp256k1fx.OutputOwners{
			Threshold: 1,
			Addrs: []ids.ShortID{
				keychain.Keys[0].Address(),
			},
		},
	}}

	_, err := wallet.C().IssueExportTx(
		xContext.BlockchainID,
		exportOutputs,
		tc.WithDefaultContext(),
	)
	require.NoError(err)

	tc.Log().Info("issued C-Chain export transaction to populate atomic trie",
		zap.Stringer("destinationChainID", xContext.BlockchainID),
		zap.Uint64("amount", units.Avax),
	)
}

// validateMerkleSyncEvidence asserts that the Firewood/Merkle sync path was
// actually exercised rather than skipped: the bootstrap node made firewood
// proof requests, validators served code-sync and block-backfill requests,
// and the bootstrap node's C-Chain log shows state sync started (and was not
// skipped for being too close to the tip).
func validateMerkleSyncEvidence(tc tests.TestContext, network *tmpnet.Network, bootstrapNode *tmpnet.Node) {
	require := require.New(tc)

	bootstrapMetrics, err := tests.GetNodeMetrics(tc.DefaultContext(), bootstrapNode.URI)
	require.NoError(err)
	firewoodRequests, ok := tests.GetMetricValue(bootstrapMetrics, "avalanche_evm_sync_firewood_sync_requests_made", prometheus.Labels{"chain": blockchainID})
	require.True(ok, "expected bootstrap node firewood sync metric")
	require.Greater(firewoodRequests, float64(0), "expected bootstrap node to make firewood proof requests")

	// Server-side evidence is summed across all non-ephemeral validators.
	validatorURIs := make([]string, 0, len(network.Nodes))
	for _, node := range network.Nodes {
		if node.IsEphemeral {
			continue
		}
		validatorURIs = append(validatorURIs, node.URI)
	}
	validatorMetrics, err := tests.GetNodesMetrics(tc.DefaultContext(), validatorURIs)
	require.NoError(err)
	require.Greater(sumMetric(validatorMetrics, "avalanche_evm_eth_code_request_count", prometheus.Labels{"chain": blockchainID}), float64(0), "expected validators to serve code sync requests")
	require.Greater(sumMetric(validatorMetrics, "avalanche_evm_eth_block_request_count", prometheus.Labels{"chain": blockchainID}), float64(0), "expected validators to serve block backfill requests")

	// Log-string matching is brittle by nature; these strings must track the
	// VM's actual log messages.
	bootstrapLogPath := filepath.Join(bootstrapNode.DataDir, "logs", "C.log")
	bootstrapLog, err := os.ReadFile(bootstrapLogPath)
	require.NoError(err)
	bootstrapLogText := string(bootstrapLog)
	require.Contains(bootstrapLogText, "Firewood state scheme is enabled")
	require.Contains(bootstrapLogText, "state sync started")
	require.Contains(bootstrapLogText, "Firewood EVM State Syncer")
	require.Contains(bootstrapLogText, "Code Syncer")
	require.NotContains(bootstrapLogText, "last accepted too close to most recent syncable block, skipping state sync")
}

// sumMetric totals the named metric (with the given labels) across all nodes'
// metric sets, treating missing metrics as zero.
func sumMetric(allMetrics tests.NodesMetrics, metricName string, labels prometheus.Labels) float64 {
	var total float64
	for _, nodeMetrics := range allMetrics {
		value, ok := tests.GetMetricValue(nodeMetrics, metricName, labels)
		if ok {
			total += value
		}
	}
	return total
}

// storageSlotKey converts a small fixed slot index to the 32-byte storage key
// used by eth_getStorageAt.
func storageSlotKey(slot uint64) common.Hash {
	return common.BigToHash(new(big.Int).SetUint64(slot))
}

// storageSlotBig is storageSlotKey for an arbitrary big.Int slot index.
func storageSlotBig(slotValue *big.Int) common.Hash {
	return common.BigToHash(slotValue)
}

// newTxOpts builds keyed transactor options bound to the test context and the
// harness's shared fee caps.
func newTxOpts(tc tests.TestContext, chainID *big.Int, fundingKey *secp256k1.PrivateKey) (*bind.TransactOpts, error) {
	txOpts, err := bind.NewKeyedTransactorWithChainID(fundingKey.ToECDSA(), chainID)
	if err != nil {
		return nil, err
	}
	txOpts.Context = tc.DefaultContext()
	txOpts.GasFeeCap = new(big.Int).Set(defaultGasFeeCap)
	txOpts.GasTipCap = new(big.Int).Set(defaultGasTipCap)
	return txOpts, nil
}

// totalSize returns the combined recursive file size of the given paths.
func totalSize(paths ...string) (int64, error) {
	var total int64
	for _, path := range paths {
		size, err := dirSize(path)
		if err != nil {
			return 0, err
		}
		total += size
	}
	return total, nil
}

// dirSize returns the total size of all regular files under root. Paths that
// disappear mid-walk (or a missing root) are tolerated and counted as zero,
// since the node may rewrite its database while we measure.
func dirSize(root string) (int64, error) {
	var total int64
	err := filepath.WalkDir(root, func(_ string, d fs.DirEntry, err error) error {
		if err != nil {
			// Files can be deleted between listing and stat'ing; skip them.
			if os.IsNotExist(err) {
				return nil
			}
			return err
		}
		if d.IsDir() {
			return nil
		}
		info, err := d.Info()
		if err != nil {
			return err
		}
		total += info.Size()
		return nil
	})
	// A missing root directory simply means no data has been written yet.
	if os.IsNotExist(err) {
		return 0, nil
	}
	return total, err
}