Error Handling and Cleanup Mechanisms
User Story
As a system administrator, I want robust error handling and cleanup so that failed executions do not keep consuming resources or leave the system in an inconsistent state.
Technical Requirements
Acceptance Criteria
Definition of Done
Implementation Guide
Error Classification System
// ErrorType is the coarse category assigned to an execution failure.
type ErrorType string

// Categories an execution failure can be classified into.
const (
	// ErrorTypeInfrastructure covers Docker daemon issues.
	ErrorTypeInfrastructure = ErrorType("infrastructure")
	// ErrorTypeResource covers resource exhaustion such as out of memory or CPU.
	ErrorTypeResource = ErrorType("resource")
	// ErrorTypeTimeout covers execution timeouts.
	ErrorTypeTimeout = ErrorType("timeout")
	// ErrorTypeUserCode covers errors raised by the executed script itself.
	ErrorTypeUserCode = ErrorType("user_code")
	// ErrorTypeValidation covers input validation failures.
	ErrorTypeValidation = ErrorType("validation")
	// ErrorTypeSecurity covers security violations.
	ErrorTypeSecurity = ErrorType("security")
)
// ExecutionError is the structured, JSON-serialisable description of a failed
// task execution. It implements the error interface so it can be returned and
// wrapped like any other Go error.
type ExecutionError struct {
	// Type is the coarse failure category.
	Type ErrorType `json:"type"`
	// Code is a stable machine-readable identifier, e.g. "EXECUTION_TIMEOUT".
	Code string `json:"code"`
	// Message is a short human-readable summary.
	Message string `json:"message"`
	// Details optionally carries extra diagnostic text; omitted from JSON when empty.
	Details string `json:"details,omitempty"`
	// Retryable reports whether running the task again may succeed.
	Retryable bool `json:"retryable"`
	// TaskID identifies the task that failed.
	TaskID string `json:"task_id"`
	// ContainerID identifies the container involved, if any; omitted from JSON when empty.
	ContainerID string `json:"container_id,omitempty"`
	// Timestamp records when the error was produced.
	Timestamp time.Time `json:"timestamp"`
}

// Error implements the error interface. The result has the form
// "CODE: message", followed by " (details)" when Details is set.
// A nil receiver yields "<nil>" instead of panicking, which keeps a typed-nil
// *ExecutionError stored in an error value safe to print.
func (e *ExecutionError) Error() string {
	if e == nil {
		return "<nil>"
	}
	msg := e.Code + ": " + e.Message
	if e.Details != "" {
		msg += " (" + e.Details + ")"
	}
	return msg
}
// ClassifyError maps a raw error onto the ExecutionError taxonomy by
// inspecting its message.
//
// err is the error to classify; a nil err yields a nil result. context is a
// short label for the operation that failed (for example "task_execution");
// when non-empty it is prefixed to the original error text stored in Details.
//
// The returned error always has Type, Code, Message, Retryable, Details and
// Timestamp populated. TaskID and ContainerID are left for the caller to fill
// in, because they are not known here.
func ClassifyError(err error, context string) *ExecutionError {
	if err == nil {
		return nil
	}

	raw := err.Error()

	// Keep the underlying cause; otherwise it is lost once the caller
	// replaces err with the classified error.
	details := raw
	if context != "" {
		details = context + ": " + raw
	}

	classified := &ExecutionError{
		Details:   details,
		Timestamp: time.Now(),
	}

	// Match case-insensitively: the Docker daemon reports a missing container
	// as "No such container: <id>", with a capital N.
	lowered := strings.ToLower(raw)

	switch {
	case strings.Contains(lowered, "no such container"):
		classified.Type = ErrorTypeInfrastructure
		classified.Code = "CONTAINER_NOT_FOUND"
		classified.Message = "Container not found"
		classified.Retryable = false
	case strings.Contains(lowered, "timeout"),
		// context.DeadlineExceeded renders as "context deadline exceeded",
		// which does not contain the word "timeout".
		strings.Contains(lowered, "deadline exceeded"):
		classified.Type = ErrorTypeTimeout
		classified.Code = "EXECUTION_TIMEOUT"
		classified.Message = "Task execution timed out"
		classified.Retryable = true
	default:
		classified.Type = ErrorTypeInfrastructure
		classified.Code = "UNKNOWN_ERROR"
		classified.Message = "Unknown execution error"
		classified.Retryable = true
	}
	return classified
}
Comprehensive Cleanup System
// CleanupManager releases what a task execution leaves behind: its container
// and its temporary files. It reports the outcome through metrics.
type CleanupManager struct {
// dockerClient is the Docker API client used to stop and remove containers.
dockerClient *client.Client
// logger is a logrus logger; none of the methods visible in this section write to it.
logger *logrus.Logger
// metrics receives the cleanup duration and the attempt/success/failure counts.
metrics *MetricsCollector
}
// CleanupExecution releases everything associated with a finished or failed
// task: it stops and removes the container (skipped when containerID is
// empty) and then deletes the task's temporary files.
//
// Every step is attempted even if an earlier one fails. All failures are
// collected and reported together in the returned error; nil means every step
// succeeded. The cleanup duration and the attempt/success/failure counters are
// recorded on cm.metrics.
//
// Cleanup usually runs right after an execution timed out or was cancelled,
// which is when the caller's ctx is already done. If ctx is already done on
// entry, the Docker calls run under a fresh context bounded by a fixed
// timeout instead, so the container is not leaked. A ctx that is still live
// on entry is used as-is, so the caller can still abort an in-flight cleanup.
func (cm *CleanupManager) CleanupExecution(ctx context.Context, taskID, containerID string) error {
	// Upper bound for cleanup work when the caller's context cannot be used.
	const detachedCleanupTimeout = 30 * time.Second

	start := time.Now()
	defer func() {
		cm.metrics.RecordCleanupDuration(time.Since(start))
	}()

	if ctx.Err() != nil {
		detached, cancel := context.WithTimeout(context.Background(), detachedCleanupTimeout)
		defer cancel()
		ctx = detached
	}

	var cleanupErrors []error

	if containerID != "" {
		// Stop container (if running).
		if err := cm.stopContainer(ctx, containerID); err != nil {
			cleanupErrors = append(cleanupErrors, fmt.Errorf("stop container: %w", err))
		}
		// Remove the container even if stopping failed; removal is forced.
		if err := cm.removeContainer(ctx, containerID); err != nil {
			cleanupErrors = append(cleanupErrors, fmt.Errorf("remove container: %w", err))
		}
	}

	// Clean up temporary files.
	if err := cm.cleanupTempFiles(taskID); err != nil {
		cleanupErrors = append(cleanupErrors, fmt.Errorf("cleanup temp files: %w", err))
	}

	cm.metrics.IncrementCleanupAttempts()
	if len(cleanupErrors) > 0 {
		cm.metrics.IncrementCleanupFailures()
		return fmt.Errorf("cleanup failed with %d errors: %v", len(cleanupErrors), cleanupErrors)
	}
	cm.metrics.IncrementCleanupSuccess()
	return nil
}
// stopContainer asks Docker to stop the given container, passing a 10-second
// timeout to ContainerStop, and returns whatever error the client reports.
// NOTE(review): the timeout is presumably the grace period before Docker
// force-kills the container; confirm against the SDK version in use.
func (cm *CleanupManager) stopContainer(ctx context.Context, containerID string) error {
	gracePeriod := 10 * time.Second
	stopErr := cm.dockerClient.ContainerStop(ctx, containerID, &gracePeriod)
	return stopErr
}
// removeContainer deletes the given container through the Docker client with
// both the Force and RemoveVolumes options enabled, and returns the client's
// error unchanged.
func (cm *CleanupManager) removeContainer(ctx context.Context, containerID string) error {
	removeOpts := types.ContainerRemoveOptions{
		RemoveVolumes: true,
		Force:         true,
	}
	return cm.dockerClient.ContainerRemove(ctx, containerID, removeOpts)
}
Resource Monitoring and Alerting
// ResourceMonitor periodically samples system and container usage and sends
// an alert whenever a configured threshold is exceeded.
type ResourceMonitor struct {
// dockerClient is the Docker API client used to list containers.
dockerClient *client.Client
// alertManager receives threshold alerts and monitoring errors via SendAlert.
alertManager *AlertManager
// thresholds holds the limits that measured usage is compared against.
thresholds ResourceThresholds
// checkInterval is the period of the ticker that drives MonitorResources.
checkInterval time.Duration
}
// ResourceThresholds holds the limits used by the resource monitor. An alert
// is sent when a measured value is strictly greater than its threshold.
type ResourceThresholds struct {
	// Usage limits expressed as percentages. The original annotations give
	// 80% for CPU, 85% for memory and 90% for disk, presumably as suggested
	// values.
	// NOTE(review): DiskPercent is not consulted by the checkResourceUsage
	// shown in this document; confirm whether a disk check is missing.
	CPUPercent, MemoryPercent, DiskPercent float64
	// ContainerCount is the number of listed containers above which an alert
	// is sent (annotated as 1000 in the original).
	ContainerCount int
}
// MonitorResources runs a resource check once per rm.checkInterval until ctx
// is done. It blocks, so callers normally run it in its own goroutine. A
// failed check is reported as a "MONITOR_ERROR" alert and does not stop the
// loop.
//
// A non-positive rm.checkInterval (for example on a zero-value monitor) would
// make time.NewTicker panic, so a fallback interval of 30 seconds is used in
// that case.
func (rm *ResourceMonitor) MonitorResources(ctx context.Context) {
	const fallbackCheckInterval = 30 * time.Second

	interval := rm.checkInterval
	if interval <= 0 {
		interval = fallbackCheckInterval
	}

	ticker := time.NewTicker(interval)
	defer ticker.Stop()

	for {
		select {
		case <-ctx.Done():
			return
		case <-ticker.C:
			if err := rm.checkResourceUsage(ctx); err != nil {
				rm.alertManager.SendAlert("MONITOR_ERROR", err.Error())
			}
		}
	}
}
// checkResourceUsage takes one sample of system statistics and of the Docker
// container list, and sends an alert for each value that is strictly above
// its configured threshold: CPU percentage, memory percentage and container
// count.
//
// It returns a wrapped error when the system statistics cannot be read or the
// containers cannot be listed. A statistics failure returns before the
// container count is checked. Exceeding a threshold only sends an alert; it
// is not reported as an error.
func (rm *ResourceMonitor) checkResourceUsage(ctx context.Context) error {
	usage, statsErr := rm.getSystemStats(ctx)
	if statsErr != nil {
		return fmt.Errorf("failed to get system stats: %w", statsErr)
	}

	// CPU
	if usage.CPUPercent > rm.thresholds.CPUPercent {
		cpuMsg := fmt.Sprintf("CPU usage: %.2f%%", usage.CPUPercent)
		rm.alertManager.SendAlert("HIGH_CPU_USAGE", cpuMsg)
	}

	// Memory
	if usage.MemoryPercent > rm.thresholds.MemoryPercent {
		memMsg := fmt.Sprintf("Memory usage: %.2f%%", usage.MemoryPercent)
		rm.alertManager.SendAlert("HIGH_MEMORY_USAGE", memMsg)
	}

	// Containers, listed with the default (empty) list options.
	listed, listErr := rm.dockerClient.ContainerList(ctx, types.ContainerListOptions{})
	if listErr != nil {
		return fmt.Errorf("failed to list containers: %w", listErr)
	}
	if count := len(listed); count > rm.thresholds.ContainerCount {
		rm.alertManager.SendAlert("HIGH_CONTAINER_COUNT", fmt.Sprintf("Container count: %d", count))
	}

	return nil
}
Retry Logic with Exponential Backoff
// RetryConfig describes an exponential-backoff retry policy.
type RetryConfig struct {
	// MaxAttempts is the total number of attempts, including the first one.
	MaxAttempts int
	// BaseDelay is the wait before the first retry; MaxDelay caps the wait
	// before any later retry.
	BaseDelay, MaxDelay time.Duration
	// Multiplier is the factor by which the wait grows after each failed attempt.
	Multiplier float64
}
// ExecuteWithRetry runs task and retries it with exponential backoff when it
// fails with a retryable error.
//
// Up to 3 attempts are made. The wait before a retry starts at 1s and doubles
// after each failed attempt, capped at 30s. Each failure is classified with
// ClassifyError; the classified error is then given the task ID, the
// underlying error text (if Details is empty) and a timestamp (if unset).
//
// Returns:
//   - nil as soon as one attempt succeeds;
//   - the classified error right away when it is not retryable;
//   - ctx.Err() when ctx is done before or during a backoff wait;
//   - otherwise an error wrapping the last classified error once all attempts
//     are used up.
func (ee *ExecutionEngine) ExecuteWithRetry(ctx context.Context, task *Task) error {
	config := RetryConfig{
		MaxAttempts: 3,
		BaseDelay:   1 * time.Second,
		MaxDelay:    30 * time.Second,
		Multiplier:  2.0,
	}

	var lastErr error
	for attempt := 1; attempt <= config.MaxAttempts; attempt++ {
		err := ee.executeTask(ctx, task)
		if err == nil {
			return nil
		}

		// Classify the failure and attach what only this layer knows.
		execErr := ClassifyError(err, "task_execution")
		// task.ID's type is not visible here, so format it instead of
		// assuming it is a string.
		execErr.TaskID = fmt.Sprint(task.ID)
		if execErr.Details == "" {
			// Keep the underlying cause; err itself is not wrapped.
			execErr.Details = err.Error()
		}
		if execErr.Timestamp.IsZero() {
			execErr.Timestamp = time.Now()
		}
		lastErr = execErr

		// Don't retry non-retryable errors.
		if !execErr.Retryable {
			return execErr
		}

		// Don't wait after the last attempt.
		if attempt == config.MaxAttempts {
			break
		}

		// A done context makes another attempt pointless; return before
		// logging a retry that will never happen.
		if ctxErr := ctx.Err(); ctxErr != nil {
			return ctxErr
		}

		// Exponential backoff: BaseDelay * Multiplier^(attempt-1), capped at MaxDelay.
		delay := time.Duration(float64(config.BaseDelay) * math.Pow(config.Multiplier, float64(attempt-1)))
		if delay > config.MaxDelay {
			delay = config.MaxDelay
		}

		ee.logger.WithFields(logrus.Fields{
			"task_id": task.ID,
			"attempt": attempt,
			"delay":   delay,
			"error":   err.Error(),
		}).Warn("Task execution failed, retrying")

		// Use an explicit timer so it can be stopped on cancellation;
		// time.After would stay pending until it fires.
		timer := time.NewTimer(delay)
		select {
		case <-ctx.Done():
			timer.Stop()
			return ctx.Err()
		case <-timer.C:
			// Continue to next attempt.
		}
	}

	return fmt.Errorf("task execution failed after %d attempts: %w", config.MaxAttempts, lastErr)
}
Graceful Degradation
- Circuit breaker pattern for external dependencies
- Queue size limits with rejection of new tasks
- Resource-aware task scheduling
- Priority-based task queuing during high load
- Health check endpoints for load balancers
Error Reporting and Metrics
- Structured error logging with correlation IDs
- Error rate metrics by type and code
- Container cleanup success/failure rates
- Resource usage trends and alerts
- Performance degradation indicators
Related Epic
Contributes to Epic #8: Container Execution Engine
Error Handling and Cleanup Mechanisms
User Story
As a system administrator, I want robust error handling and cleanup so that failed executions don't consume resources or leave the system in an inconsistent state.
Technical Requirements
Acceptance Criteria
Definition of Done
Implementation Guide
Error Classification System
Comprehensive Cleanup System
Resource Monitoring and Alerting
Retry Logic with Exponential Backoff
Graceful Degradation
Error Reporting and Metrics
Related Epic
Contributes to Epic #8: Container Execution Engine