-
Notifications
You must be signed in to change notification settings - Fork 23
fix(commands): apply batch — multi-node, node-file patch merge, error handling #128
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
84dcf99
8c0148c
2a3dcaa
03f1bfb
c4b5a5d
1164b5e
dcb214c
b7df415
405b5ca
5a16ad3
599079d
66a9489
265bff5
294b4ce
6d1339b
6403d1e
c5555c2
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -140,6 +140,42 @@ Re-template and update generated file in place (this will overwrite it): | |
| talm template -f nodes/node1.yaml -I | ||
| ``` | ||
|
|
||
| > **Per-node patches inside node files.** A node file can carry Talos config | ||
| > below its modeline (for example, a custom `hostname`, secondary | ||
| > interfaces with `deviceSelector`, VIP placement, or extra etcd args). | ||
| > When `talm apply -f node.yaml` runs the template-rendering branch, that | ||
| > body is applied as a strategic merge patch on top of the rendered | ||
| > template before the result is sent to the node — so per-node fields | ||
| > survive even when the template auto-generates conflicting values | ||
| > (e.g. `hostname: talos-XXXXX`). | ||
| > | ||
| > **Talos v1.12+ caveat.** The multi-document output format introduced | ||
| > in v1.12 splits network configuration into typed documents | ||
| > (`LinkConfig`, `BondConfig`, `VLANConfig`, `Layer2VIPConfig`, | ||
| > `HostnameConfig`, `ResolverConfig`). Legacy node-body fields under | ||
| > `machine.network.interfaces` have no safe 1:1 mapping to those types, | ||
| > so the multi-doc path does not translate them — if you target a | ||
| > v1.12+ Talos node, pin per-node network settings by patching the | ||
| > typed resources (e.g. a `LinkConfig` document below the modeline) | ||
| > rather than legacy `machine.network.interfaces`. Fields outside the | ||
| > network area (`machine.network.hostname` via `HostnameConfig`, | ||
| > `machine.install.disk`, extra etcd args, etc.) still merge as | ||
| > expected. | ||
| > | ||
| > **One body, one node.** A non-empty body is a per-node pin, so the | ||
| > modeline for that file must target exactly one node. `talm apply` | ||
| > refuses a multi-node modeline when the body is non-empty; modeline- | ||
| > only files (no body) are still allowed and drive the same rendered | ||
| > template on every listed target. | ||
| > | ||
| > `talm template -f node.yaml` (with or without `-I`) does **not** apply | ||
| > the same overlay: its output is the rendered template plus the modeline | ||
| > and the auto-generated warning, byte-identical to what the template | ||
| > alone would produce. Routing it through the patcher would drop every | ||
| > YAML comment (including the modeline) and re-sort keys, breaking | ||
| > downstream commands that read the file back. Use `apply --dry-run` if | ||
| > you want to preview the exact bytes that will be sent to the node. | ||
|
|
||
| ## Using talosctl commands | ||
|
|
||
|
Comment on lines
140
to
180
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. multi-paragraph blockquote Document looks great; one nit: the four Not actionable; just an observation.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Noted — agreed the four bold-lead paragraphs are easier to skim on GitHub than in a terminal. Will refactor as a nested bullet list under one |
||
| Talm offers a similar set of commands to those provided by talosctl. | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -113,32 +113,41 @@ func apply(args []string) error { | |
| withSecretsPath := ResolveSecretsPath(applyCmdFlags.withSecrets) | ||
|
|
||
| if len(modelineTemplates) > 0 { | ||
| // Template rendering path: connect to the node first, render templates | ||
| // online (so lookup() functions resolve real discovery data), then apply. | ||
| // Template rendering path: render templates online per node and | ||
| // apply the rendered config plus the node file overlay. See | ||
| // applyTemplatesPerNode for why the loop is mandatory. | ||
| opts := buildApplyRenderOptions(modelineTemplates, withSecretsPath) | ||
| nodes := append([]string(nil), GlobalArgs.Nodes...) | ||
| fmt.Printf("- talm: file=%s, nodes=%s, endpoints=%s\n", configFile, nodes, GlobalArgs.Endpoints) | ||
|
|
||
| err = withApplyClient(func(ctx context.Context, c *client.Client) error { | ||
| fmt.Printf("- talm: file=%s, nodes=%s, endpoints=%s\n", configFile, GlobalArgs.Nodes, GlobalArgs.Endpoints) | ||
|
|
||
| result, err := engine.Render(ctx, c, opts) | ||
| if err != nil { | ||
| return fmt.Errorf("template rendering error: %w", err) | ||
| } | ||
|
|
||
| applyClosure := func(ctx context.Context, c *client.Client, data []byte) error { | ||
| resp, err := c.ApplyConfiguration(ctx, &machineapi.ApplyConfigurationRequest{ | ||
| Data: result, | ||
| Data: data, | ||
| Mode: applyCmdFlags.Mode.Mode, | ||
| DryRun: applyCmdFlags.dryRun, | ||
| TryModeTimeout: durationpb.New(applyCmdFlags.configTryTimeout), | ||
| }) | ||
| if err != nil { | ||
| return fmt.Errorf("error applying new configuration: %w", err) | ||
| } | ||
|
|
||
| helpers.PrintApplyResults(resp) | ||
|
|
||
| return nil | ||
| }) | ||
| } | ||
|
|
||
| if applyCmdFlags.insecure { | ||
| openClient := openClientPerNodeMaintenance(applyCmdFlags.certFingerprints, WithClientMaintenance) | ||
| if err := applyTemplatesPerNode(opts, configFile, nodes, openClient, engine.Render, applyClosure); err != nil { | ||
| return err | ||
| } | ||
| } else { | ||
| if err := withApplyClientBare(func(parentCtx context.Context, c *client.Client) error { | ||
| resolved := resolveAuthTemplateNodes(nodes, c) | ||
| openClient := openClientPerNodeAuth(parentCtx, c) | ||
|
Comment on lines
+143
to
+145
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In the authenticated template rendering path, the list of nodes is captured from if err := withApplyClientBare(func(parentCtx context.Context, c *client.Client) error {
if len(nodes) == 0 {
if configContext := c.GetConfigContext(); configContext != nil {
nodes = configContext.Nodes
}
}
openClient := openClientPerNodeAuth(parentCtx, c)
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Added the same fallback in 5a16ad3. Extracted |
||
| return applyTemplatesPerNode(opts, configFile, resolved, openClient, engine.Render, applyClosure) | ||
| }); err != nil { | ||
| return err | ||
| } | ||
| } | ||
| } else { | ||
| // Direct patch path: apply config file as patch against empty bundle | ||
| opts := buildApplyPatchOptions(withSecretsPath) | ||
|
|
@@ -173,9 +182,6 @@ func apply(args []string) error { | |
| return err | ||
| } | ||
| } | ||
| if err != nil { | ||
| return err | ||
| } | ||
|
|
||
| // Reset args | ||
| if !applyCmdFlags.nodesFromArgs { | ||
|
|
@@ -188,23 +194,178 @@ func apply(args []string) error { | |
| return nil | ||
| } | ||
|
|
||
| // withApplyClient creates a Talos client appropriate for the current apply mode | ||
| // and invokes the given action with it. | ||
| // withApplyClient creates a Talos client appropriate for the current apply | ||
| // mode and invokes the given action with it. The action receives a context | ||
| // in which gRPC node metadata is set to the resolved node list — either | ||
| // GlobalArgs.Nodes (when set) or the talosconfig context's Nodes (when not). | ||
| // Used by the direct-patch branch where multi-node fan-out happens at the | ||
| // gRPC layer inside ApplyConfiguration. | ||
| func withApplyClient(f func(ctx context.Context, c *client.Client) error) error { | ||
| return withApplyClientBare(wrapWithNodeContext(f)) | ||
| } | ||
|
|
||
| // withApplyClientBare connects to Talos for the current apply mode but does | ||
| // NOT inject node metadata into the context — leaving that decision to the | ||
| // caller. Used by the template-rendering path (see applyTemplatesPerNode for | ||
| // the rationale). | ||
| func withApplyClientBare(f func(ctx context.Context, c *client.Client) error) error { | ||
| if applyCmdFlags.insecure { | ||
| // Maintenance mode connects directly to the node IP without talosconfig; | ||
| // node context injection is not needed — the maintenance client handles | ||
| // node targeting internally via GlobalArgs.Nodes. | ||
| // Maintenance mode reads its endpoints directly from | ||
| // GlobalArgs.Nodes — gRPC node metadata is not consulted. | ||
| return WithClientMaintenance(applyCmdFlags.certFingerprints, f) | ||
| } | ||
|
|
||
| wrappedF := wrapWithNodeContext(f) | ||
|
|
||
| if GlobalArgs.SkipVerify { | ||
| return WithClientSkipVerify(wrappedF) | ||
| return WithClientSkipVerify(f) | ||
| } | ||
|
|
||
| return WithClientNoNodes(wrappedF) | ||
| return WithClientNoNodes(f) | ||
| } | ||
|
|
||
| // renderFunc, applyFunc and openClientFunc are injection points for | ||
| // applyTemplatesPerNode so unit tests can drive the loop with fakes instead | ||
| // of a real Talos client. | ||
| type renderFunc func(ctx context.Context, c *client.Client, opts engine.Options) ([]byte, error) | ||
| type applyFunc func(ctx context.Context, c *client.Client, data []byte) error | ||
|
|
||
| // openClientFunc opens a Talos client suitable for a single node and runs | ||
| // action with it. Authenticated mode reuses one parent client and rotates | ||
| // the node via single-target gRPC metadata (client.WithNode); insecure | ||
| // (maintenance) mode opens a fresh single-endpoint client per node because | ||
| // Talos's maintenance client ignores node metadata in the context and | ||
| // round-robins between its configured endpoints. | ||
| type openClientFunc func(node string, action func(ctx context.Context, c *client.Client) error) error | ||
|
|
||
| // applyTemplatesPerNode runs render → MergeFileAsPatch → apply once per | ||
| // node. Two reasons it has to iterate: | ||
| // | ||
| // - engine.Render's FailIfMultiNodes guard rejects a context that carries | ||
| // more than one node, so the auth-mode caller has to attach a single | ||
| // node per iteration — and discovery via lookup() should resolve each | ||
| // node's own topology in any case. | ||
| // - In insecure (maintenance) mode the client connects directly to a | ||
| // Talos node and ignores nodes-in-context entirely, so each node needs | ||
| // its own client; otherwise gRPC round-robins ApplyConfiguration | ||
| // across the endpoint list and most nodes never see the config. | ||
| // | ||
| // Both modes share this loop via openClient. | ||
| func applyTemplatesPerNode( | ||
| opts engine.Options, | ||
| configFile string, | ||
| nodes []string, | ||
| openClient openClientFunc, | ||
| render renderFunc, | ||
| apply applyFunc, | ||
| ) error { | ||
| if len(nodes) == 0 { | ||
| return fmt.Errorf("nodes are not set for the command: please use '--nodes' flag, the node file modeline, or talosconfig context to set the nodes to run the command against") | ||
| } | ||
| // A node-file body (hostname, address, VIP, etc.) is a per-node | ||
| // pin. Replaying it across multiple targets would stamp the same | ||
| // value onto every machine, so reject this combination early and | ||
| // ask the user to split the file. Modeline-only files (no body) | ||
| // are fine — they just carry the target list. | ||
| if len(nodes) > 1 { | ||
| hasOverlay, err := engine.NodeFileHasOverlay(configFile) | ||
| if err != nil { | ||
| return err | ||
| } | ||
| if hasOverlay { | ||
| return fmt.Errorf( | ||
| "node file %s targets %d nodes (%v) but carries a non-empty per-node body; the same body would be stamped onto every node. Split it into one file per node, or remove the per-node fields if you want the rendered template alone applied to each", | ||
| configFile, len(nodes), nodes, | ||
| ) | ||
| } | ||
| } | ||
| for _, node := range nodes { | ||
| if err := openClient(node, func(ctx context.Context, c *client.Client) error { | ||
| return renderMergeAndApply(ctx, c, opts, configFile, render, apply) | ||
| }); err != nil { | ||
|
coderabbitai[bot] marked this conversation as resolved.
|
||
| return fmt.Errorf("node %s: %w", node, err) | ||
| } | ||
| } | ||
|
Comment on lines
+280
to
+286
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. fail-fast on first failing node is a behavioral change for _, node := range nodes {
if err := openClient(node, func(ctx context.Context, c *client.Client) error {
return renderMergeAndApply(ctx, c, opts, configFile, render, apply)
}); err != nil {
return fmt.Errorf("node %s: %w", node, err)
}
}Previously, batched
Not blocking — flag for the maintainer's call.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Going with fail-fast as the deliberate behavior, and I'll spell that out in the PR description rather than rely on the reader to infer it from the loop. The reasoning: for a multi-control-plane bootstrap, continuing past a node-2 failure to apply node-3 risks driving the cluster's quorum into a worse state than stopping at the first failure does — the operator has to retry node 2 either way, but a partial-but-uniform application is easier to reason about than a partial-and-skipping one. The aggregate-and-continue alternative is reasonable for a |
||
| return nil | ||
| } | ||
|
|
||
| // maintenanceClientFunc is the contract WithClientMaintenance satisfies in | ||
| // production and a fake satisfies in tests. Injection lets the unit tests | ||
| // run the real openClientPerNodeMaintenance body without dialing a Talos | ||
| // node. | ||
| type maintenanceClientFunc func(fingerprints []string, action func(ctx context.Context, c *client.Client) error) error | ||
|
|
||
| // openClientPerNodeMaintenance returns an openClientFunc that opens a | ||
| // fresh single-endpoint maintenance client per node. Multi-node insecure | ||
| // apply (first bootstrap of a multi-node cluster) needs this because | ||
| // WithClientMaintenance creates a client with all endpoints and gRPC then | ||
| // round-robins ApplyConfiguration across them — most nodes never see the | ||
| // config. Narrowing GlobalArgs.Nodes to the current iteration's node and | ||
| // restoring it via defer keeps the wrapper's signature unchanged. | ||
| // | ||
| // Correctness depends on mkClient reading GlobalArgs.Nodes synchronously | ||
| // — i.e. before action returns. WithClientMaintenance does this today | ||
| // (it builds the client with the endpoints and calls action inside the | ||
| // same goroutine). A future Talos refactor that defers endpoint | ||
| // resolution onto a goroutine, or that captures the slice for later | ||
| // use, would silently break this contract; an upstream overload that | ||
| // takes endpoints as a parameter would be the durable fix. | ||
| // | ||
| // mkClient is normally WithClientMaintenance; tests pass a fake that | ||
| // captures the GlobalArgs.Nodes value at the moment WithClientMaintenance | ||
| // would have read it. | ||
| func openClientPerNodeMaintenance(fingerprints []string, mkClient maintenanceClientFunc) openClientFunc { | ||
| return func(node string, action func(ctx context.Context, c *client.Client) error) error { | ||
| savedNodes := append([]string(nil), GlobalArgs.Nodes...) | ||
| GlobalArgs.Nodes = []string{node} | ||
| defer func() { GlobalArgs.Nodes = savedNodes }() | ||
| return mkClient(fingerprints, action) | ||
| } | ||
| } | ||
|
Comment on lines
+315
to
+322
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. global-state mutation as the channel into func openClientPerNodeMaintenance(fingerprints []string, mkClient maintenanceClientFunc) openClientFunc {
return func(node string, action func(ctx context.Context, c *client.Client) error) error {
savedNodes := append([]string(nil), GlobalArgs.Nodes...)
GlobalArgs.Nodes = []string{node}
defer func() { GlobalArgs.Nodes = savedNodes }()
return mkClient(fingerprints, action)
}
}The narrow-and-restore pattern works (defer fires on panic and on
t.Run("restores GlobalArgs.Nodes when action errors", func(t *testing.T) {
GlobalArgs.Nodes = []string{"orig"}
defer func() { GlobalArgs.Nodes = nil }()
openClient := openClientPerNodeMaintenance(nil, fakeMaintenance)
_ = openClient("10.0.0.1", func(_ context.Context, _ *client.Client) error {
return fmt.Errorf("boom")
})
if !slices.Equal(GlobalArgs.Nodes, []string{"orig"}) {
t.Errorf("GlobalArgs.Nodes not restored after error path: got %v", GlobalArgs.Nodes)
}
})Neither point blocks merge.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Both notes addressed. Note 1 — added a paragraph to Note 2 — added |
||
|
|
||
| // openClientPerNodeAuth returns an openClientFunc that reuses one | ||
| // authenticated client (the one withApplyClientBare opened above this | ||
| // callback) and rotates the addressed node via client.WithNode on the | ||
| // per-iteration context. WithNode (rather than WithNodes) sets the | ||
| // "node" metadata key for single-target proxying, which engine.Render's | ||
| // FailIfMultiNodes guard treats as one node. | ||
| func openClientPerNodeAuth(parentCtx context.Context, c *client.Client) openClientFunc { | ||
| return func(node string, action func(ctx context.Context, c *client.Client) error) error { | ||
| return action(client.WithNode(parentCtx, node), c) | ||
| } | ||
| } | ||
|
|
||
| // resolveAuthTemplateNodes returns the node list the authenticated | ||
| // template-rendering path should iterate over. cliNodes (from --nodes | ||
| // or the modeline) takes precedence; when empty, the talosconfig | ||
| // context's Nodes are used so a user who already ran `talosctl config | ||
| // node <ip>` does not have to repeat the node list on every `talm | ||
| // apply`. Mirrors wrapWithNodeContext's fallback for the direct-patch | ||
| // branch. Insecure mode does not call this helper — maintenance | ||
| // clients talk to node IPs directly and have no talosconfig context. | ||
| func resolveAuthTemplateNodes(cliNodes []string, c *client.Client) []string { | ||
| if len(cliNodes) > 0 { | ||
| return cliNodes | ||
| } | ||
| if c == nil { | ||
| return nil | ||
| } | ||
| cfg := c.GetConfigContext() | ||
| if cfg == nil { | ||
| return nil | ||
| } | ||
| return append([]string(nil), cfg.Nodes...) | ||
| } | ||
|
|
||
| // renderMergeAndApply is the per-node body shared by every apply mode. | ||
| func renderMergeAndApply(ctx context.Context, c *client.Client, opts engine.Options, configFile string, render renderFunc, apply applyFunc) error { | ||
| rendered, err := render(ctx, c, opts) | ||
| if err != nil { | ||
| return fmt.Errorf("template rendering: %w", err) | ||
| } | ||
| merged, err := engine.MergeFileAsPatch(rendered, configFile) | ||
| if err != nil { | ||
| return fmt.Errorf("merging node file as patch: %w", err) | ||
| } | ||
| return apply(ctx, c, merged) | ||
| } | ||
|
|
||
| // buildApplyRenderOptions constructs engine.Options for the template rendering path. | ||
|
|
@@ -214,14 +375,14 @@ func withApplyClient(f func(ctx context.Context, c *client.Client) error) error | |
| func buildApplyRenderOptions(modelineTemplates []string, withSecretsPath string) engine.Options { | ||
| resolvedTemplates := resolveTemplatePaths(modelineTemplates, Config.RootDir) | ||
| return engine.Options{ | ||
| Insecure: applyCmdFlags.insecure, | ||
| TalosVersion: applyCmdFlags.talosVersion, | ||
| WithSecrets: withSecretsPath, | ||
| KubernetesVersion: applyCmdFlags.kubernetesVersion, | ||
| Debug: applyCmdFlags.debug, | ||
| Full: true, | ||
| Root: Config.RootDir, | ||
| TemplateFiles: resolvedTemplates, | ||
| CommandName: "talm apply", | ||
| } | ||
| } | ||
|
|
||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.