diff --git a/.golangci.yml b/.golangci.yml
index fcc09cf9..88ed01b1 100644
--- a/.golangci.yml
+++ b/.golangci.yml
@@ -91,7 +91,7 @@ linters:
         - name: cyclomatic
           arguments:
             # lower this after refactoring
-            - 29
+            - 30
         - name: enforce-switch-style
           disabled: true
         - name: flag-parameter
@@ -100,7 +100,7 @@ linters:
           arguments:
             # lower this after refactoring
             - 74
-            - 153
+            - 160
         - name: identical-switch-branches
           disabled: true
         - name: import-alias-naming
diff --git a/cmd/root.go b/cmd/root.go
index b154ef61..6664902e 100644
--- a/cmd/root.go
+++ b/cmd/root.go
@@ -12,6 +12,7 @@ import (
 	"slices"
 	"strings"
 	"syscall"
+	"time"
 
 	"github.com/prometheus/client_golang/prometheus"
 	"github.com/prometheus/client_golang/prometheus/promhttp"
@@ -34,6 +35,39 @@ const bouncerType = "crowdsec-firewall-bouncer"
 
 var errSignalShutdown = errors.New("signal shutdown")
 
+// runHealthChecker calls b.CheckHealth on every tick of config.HealthConfig.CheckInterval; it returns nil once ctx is done.
+// The first unhealthy result is logged and returned as backend.ErrUnrecoverable, which is meant to trigger a process restart.
+func runHealthChecker(ctx context.Context, b *backend.BackendCTX, config *cfg.BouncerConfig) error {
+	interval, err := time.ParseDuration(config.HealthConfig.CheckInterval)
+	if err != nil || interval <= 0 { // time.NewTicker panics on a non-positive interval, so "0s"/"-5s" also fall back
+		log.Warnf("invalid health check interval '%s', using default 30s", config.HealthConfig.CheckInterval)
+		interval = 30 * time.Second
+	}
+
+	ticker := time.NewTicker(interval)
+	defer ticker.Stop()
+
+	log.Infof("Health checker started with interval %s", interval)
+
+	for {
+		select {
+		case <-ctx.Done():
+			log.Debug("Health checker stopping due to context cancellation")
+			return nil
+		case <-ticker.C:
+			health := b.CheckHealth()
+
+			if health.Healthy {
+				log.Debugf("Health check passed: %+v", health.Details)
+				continue
+			}
+
+			log.Errorf("Critical: firewall infrastructure missing, triggering restart: %+v", health.Details)
+			return backend.ErrUnrecoverable
+		}
+	}
+}
+
 func backendCleanup(backend *backend.BackendCTX) {
 	log.Info("Shutting down backend")
 
@@ -250,6 +284,13 @@ func Execute() error {
 		}()
 	}
 
+	// Start health checker goroutine if enabled
+	if config.HealthConfig.Enabled {
+		g.Go(func() error {
+			return runHealthChecker(ctx, backend, config)
+		})
+	}
+
 	g.Go(func() error {
 		log.Infof("Processing new and deleted decisions . . .")
 
@@ -268,12 +309,8 @@ func Execute() error {
 		}
 	})
 
-	if config.Daemon != nil {
-		if *config.Daemon {
-			log.Debug("Ignoring deprecated 'daemonize' option")
-		} else {
-			log.Warn("The 'daemonize' config option is deprecated and treated as always true")
-		}
+	if config.Daemon != nil && !*config.Daemon {
+		log.Warn("The 'daemonize' config option is deprecated and treated as always true")
 	}
 
 	_ = csdaemon.Notify(csdaemon.Ready, log.StandardLogger())
diff --git a/pkg/backend/backend.go b/pkg/backend/backend.go
index af66f990..bc93ee37 100644
--- a/pkg/backend/backend.go
+++ b/pkg/backend/backend.go
@@ -17,6 +17,10 @@ import (
 	"github.com/crowdsecurity/cs-firewall-bouncer/pkg/types"
 )
 
+// ErrUnrecoverable is returned when the firewall infrastructure is missing
+// and the process should restart to recover.
+var ErrUnrecoverable = errors.New("firewall infrastructure missing, restart required")
+
 type BackendCTX struct {
 	firewall types.Backend
 }
@@ -46,6 +50,11 @@ func (b *BackendCTX) CollectMetrics() {
 	b.firewall.CollectMetrics()
 }
 
+// CheckHealth returns the current health status of the backend.
+func (b *BackendCTX) CheckHealth() types.HealthStatus {
+	return b.firewall.CheckHealth()
+}
+
 func isPFSupported(runtimeOS string) bool {
 	var supported bool
 
diff --git a/pkg/cfg/config.go b/pkg/cfg/config.go
index b9adabfe..441cae45 100644
--- a/pkg/cfg/config.go
+++ b/pkg/cfg/config.go
@@ -18,6 +18,11 @@ type PrometheusConfig struct {
 	ListenPort string `yaml:"listen_port"`
 }
 
+type HealthConfig struct {
+	Enabled       bool   `yaml:"enabled"`
+	CheckInterval string `yaml:"check_interval"`
+}
+
 type nftablesFamilyConfig struct {
 	Enabled *bool `yaml:"enabled"`
 	SetOnly bool  `yaml:"set-only"`
@@ -69,6 +74,7 @@ type BouncerConfig struct {
 		BatchSize int `yaml:"batch_size"`
 	} `yaml:"pf"`
 	PrometheusConfig PrometheusConfig `yaml:"prometheus"`
+	HealthConfig     HealthConfig     `yaml:"health"`
 }
 
 // MergedConfig() returns the byte content of the patched configuration file (with .yaml.local).
@@ -135,6 +141,11 @@ func NewConfig(reader io.Reader) (*BouncerConfig, error) {
 		config.SetSize = 131072
 	}
 
+	// Health check defaults: an empty check_interval becomes "30s"
+	if config.HealthConfig.CheckInterval == "" {
+		config.HealthConfig.CheckInterval = "30s"
+	}
+
 	if config.DisableIPV4 && config.DisableIPV6 && config.Mode != NftablesMode {
 		// we return an error for pf or iptables because nftables has it own way to handle this
 		return nil, errors.New("both IPv4 and IPv6 disabled, doing nothing")
diff --git a/pkg/dryrun/dryrun.go b/pkg/dryrun/dryrun.go
index de56260c..e30e5ac0 100644
--- a/pkg/dryrun/dryrun.go
+++ b/pkg/dryrun/dryrun.go
@@ -1,6 +1,8 @@
 package dryrun
 
 import (
+	"time"
+
 	log "github.com/sirupsen/logrus"
 
 	"github.com/crowdsecurity/crowdsec/pkg/models"
@@ -43,3 +45,12 @@ func (*dryRun) ShutDown() error {
 	log.Infof("backend.ShutDown() called")
 	return nil
 }
+
+func (*dryRun) CheckHealth() types.HealthStatus {
+	log.Infof("backend.CheckHealth() called")
+	return types.HealthStatus{
+		Healthy:     true, // the dry-run backend inspects nothing and always reports healthy
+		Details:     map[string]bool{},
+		LastChecked: time.Now(),
+	}
+}
diff --git a/pkg/iptables/iptables.go b/pkg/iptables/iptables.go
index bb9862dd..37c6e7c9 100644
--- a/pkg/iptables/iptables.go
+++ b/pkg/iptables/iptables.go
@@ -8,6 +8,7 @@ import (
 	"os/exec"
 	"slices"
 	"strings"
+	"time"
 
 	log "github.com/sirupsen/logrus"
 
@@ -268,3 +269,35 @@ func (ipt *iptables) Delete(decision *models.Decision) error {
 
 	return nil
 }
+
+// CheckHealth merges the v4 and v6 context checks into one status; any failing component marks it unhealthy.
+func (ipt *iptables) CheckHealth() types.HealthStatus {
+	status := types.HealthStatus{
+		Healthy:     true, // flipped to false below as soon as one component check fails
+		Details:     make(map[string]bool),
+		LastChecked: time.Now(),
+	}
+
+	if ipt.v4 != nil {
+		v4Health := ipt.v4.checkHealth()
+		for k, v := range v4Health {
+			status.Details["v4_"+k] = v // prefix keeps v4 and v6 component names distinct
+			if !v {
+				status.Healthy = false
+			}
+		}
+	}
+
+	if ipt.v6 != nil {
+		v6Health := ipt.v6.checkHealth()
+		for k, v := range v6Health {
+			status.Details["v6_"+k] = v
+			if !v {
+				status.Healthy = false
+			}
+		}
+	}
+
+	return status
+}
+
diff --git a/pkg/iptables/iptables_context.go b/pkg/iptables/iptables_context.go
index 2222c1a2..7b8ca3e9 100644
--- a/pkg/iptables/iptables_context.go
+++ b/pkg/iptables/iptables_context.go
@@ -398,3 +398,50 @@ func (ctx *ipTablesContext) delete(decision *models.Decision) error {
 	ctx.toDel = append(ctx.toDel, decision)
 	return nil
 }
+
+// jumpRuleExists reports whether `<iptablesBin> -C <chain> -j <chainName>` exits successfully for the given chain.
+func (ctx *ipTablesContext) jumpRuleExists(chain string) bool {
+	cmd := exec.Command(ctx.iptablesBin, "-C", chain, "-j", chainName)
+	return cmd.Run() == nil // NOTE(review): any command failure, not only a missing rule, is reported as false
+}
+
+// checkHealth verifies that the iptables infrastructure is intact.
+func (ctx *ipTablesContext) checkHealth() map[string]bool {
+	health := make(map[string]bool) // component name -> result of its existence check
+
+	if ctx.ipsetContentOnly {
+		// In ipset-only mode, just check if the default ipset exists
+		if ctx.defaultSet != nil {
+			health["ipset_default"] = ctx.defaultSet.Exists()
+		}
+		// Also check any origin-specific sets
+		for origin, set := range ctx.ipsets {
+			if set != nil {
+				health["ipset_"+origin] = set.Exists()
+			}
+		}
+	} else {
+		// Full iptables mode: check chain and jump rules
+		health["chain_exists"] = ctx.chainExist(chainName)
+
+		// Check if jump rules exist in configured chains
+		for _, chain := range ctx.Chains {
+			health["jump_"+chain] = ctx.jumpRuleExists(chain)
+		}
+
+		// Check if logging chain exists (if logging is enabled)
+		if ctx.loggingEnabled {
+			health["logging_chain_exists"] = ctx.chainExist(loggingChainName)
+		}
+
+		// Check ipsets exist
+		for origin, set := range ctx.ipsets {
+			if set != nil {
+				health["ipset_"+origin] = set.Exists()
+			}
+		}
+	}
+
+	return health
+}
+
diff --git a/pkg/nftables/nftables.go b/pkg/nftables/nftables.go
index dbd97cfe..fca4d297 100644
--- a/pkg/nftables/nftables.go
+++ b/pkg/nftables/nftables.go
@@ -15,6 +15,7 @@ import (
 	"github.com/crowdsecurity/crowdsec/pkg/models"
 
 	"github.com/crowdsecurity/cs-firewall-bouncer/pkg/cfg"
+	cstypes "github.com/crowdsecurity/cs-firewall-bouncer/pkg/types"
 )
 
 const (
@@ -316,3 +317,13 @@ func (n *nft) ShutDown() error {
 
 	return n.v6.shutDown()
 }
+
+func (*nft) CheckHealth() cstypes.HealthStatus {
+	// nftables health check - stub for now: it inspects nothing and always reports healthy
+	// TODO: implement proper health check for nftables tables/chains
+	return cstypes.HealthStatus{
+		Healthy:     true,
+		Details:     map[string]bool{},
+		LastChecked: time.Now(),
+	}
+}
diff --git a/pkg/pf/pf.go b/pkg/pf/pf.go
index 40be0171..184c336e 100644
--- a/pkg/pf/pf.go
+++ b/pkg/pf/pf.go
@@ -5,6 +5,7 @@ import (
 	"os"
 	"os/exec"
 	"strings"
+	"time"
 
 	log "github.com/sirupsen/logrus"
 
@@ -201,3 +202,13 @@ func (pf *pf) ShutDown() error {
 
 	return nil
 }
+
+func (*pf) CheckHealth() types.HealthStatus {
+	// PF health check - stub for now: it inspects nothing and always reports healthy
+	// TODO: implement proper health check for pf tables
+	return types.HealthStatus{
+		Healthy:     true,
+		Details:     map[string]bool{},
+		LastChecked: time.Now(),
+	}
+}
diff --git a/pkg/types/types.go b/pkg/types/types.go
index 0b76d0f0..16020930 100644
--- a/pkg/types/types.go
+++ b/pkg/types/types.go
@@ -1,9 +1,19 @@
 package types
 
 import (
+	"time"
+
 	"github.com/crowdsecurity/crowdsec/pkg/models"
 )
 
+// HealthStatus represents the health state of a firewall backend.
+type HealthStatus struct {
+	Healthy     bool
+	Details     map[string]bool // component name -> healthy status
+	LastChecked time.Time
+	Error       error // not set by any CheckHealth implementation in this patch
+}
+
 type Backend interface {
 	Init() error
 	ShutDown() error
@@ -11,4 +21,6 @@ type Backend interface {
 	Delete(decision *models.Decision) error
 	Commit() error
 	CollectMetrics()
+	// CheckHealth verifies that the firewall infrastructure is intact.
+	CheckHealth() HealthStatus
 }