container lifecycle management

This commit is contained in:
2026-03-12 15:13:38 -04:00
parent e99ef5d2dd
commit b9cc397e05
61 changed files with 6880 additions and 31 deletions

15
lifecycle-sidecar/.gitignore vendored Normal file
View File

@@ -0,0 +1,15 @@
# Binaries
lifecycle-sidecar
*.exe
*.dll
*.so
*.dylib
# Test binary
*.test
# Go workspace file
go.work
# Build output
dist/

View File

@@ -0,0 +1,40 @@
# Build stage: compile a fully static Go binary in a throwaway image.
FROM golang:1.22-alpine AS builder
WORKDIR /app
# Install build dependencies (git for module fetches, CA certs for HTTPS downloads)
RUN apk add --no-cache git ca-certificates
# Copy go mod files first so the dependency layer is cached across source-only changes
COPY go.mod go.sum ./
RUN go mod download
# Copy source
COPY main.go ./
# Build static binary: CGO disabled so it runs on bare Alpine; -w -s strips debug info
RUN CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build \
-ldflags="-w -s" \
-o lifecycle-sidecar \
main.go
# Runtime stage: minimal Alpine image with only what the sidecar shells out to
FROM alpine:3.19
# Install procps for process monitoring (pgrep, kill)
RUN apk add --no-cache procps ca-certificates
# Create non-root user and group (UID/GID 1000)
RUN addgroup -g 1000 sidecar && \
adduser -D -u 1000 -G sidecar sidecar
WORKDIR /app
# Copy binary from builder
COPY --from=builder /app/lifecycle-sidecar /app/lifecycle-sidecar
# Run as non-root
USER sidecar
ENTRYPOINT ["/app/lifecycle-sidecar"]

View File

@@ -0,0 +1,94 @@
# Lifecycle Sidecar
A lightweight Kubernetes sidecar that monitors the main agent container and handles cleanup when the container exits with a specific exit code indicating idle shutdown.
## Purpose
User agent containers self-manage their lifecycle by:
1. Tracking their own activity (MCP calls, trigger status)
2. Exiting with code `42` when idle (no triggers + no recent activity)
3. Delegating deployment cleanup to this sidecar
The sidecar watches the main container and:
- On exit code `42`: Deletes the deployment (and optionally PVC)
- On any other exit code: Allows Kubernetes restart policy to handle it
## Architecture
```
┌─────────────────────────────────────────────────┐
│ Pod │
│ ┌────────────────┐ ┌──────────────────┐ │
│ │ Agent Container│ │ Lifecycle Sidecar│ │
│ │ │ │ │ │
│ │ - Track activity │ - Monitor agent │ │
│ │ - Track triggers │ - Watch exit code│ │
│ │ - Exit 42 if idle │ - Delete if 42 │ │
│ └────────────────┘ └──────────────────┘ │
│ │ │ │
│ │ writes exit_code │ │
│ └─────────►/var/run/agent/exit_code │
│ │ │
└───────────────────────────────────┼─────────────┘
▼ k8s API
┌──────────────────────┐
│ Delete Deployment │
│ (+ PVC if anonymous)│
└──────────────────────┘
```
## Environment Variables
| Variable | Required | Description |
|----------|----------|-------------|
| `NAMESPACE` | Yes | Kubernetes namespace (injected via downward API) |
| `DEPLOYMENT_NAME` | Yes | Name of the deployment to delete (from pod label) |
| `USER_TYPE` | No | User license tier: `anonymous`, `free`, `paid`, `enterprise` |
| `MAIN_CONTAINER_PID` | No | PID of main container (for precise monitoring) |
## Exit Code Contract
The agent container uses exit codes to signal intent:
| Exit Code | Meaning | Sidecar Action |
|-----------|---------|----------------|
| `42` | Clean idle shutdown | Delete deployment + optional PVC |
| Any other | Error or normal restart | Allow Kubernetes to restart |
## RBAC Requirements
The sidecar requires a ServiceAccount with permission to delete its own deployment:
```yaml
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
rules:
- apiGroups: ["apps"]
resources: ["deployments"]
verbs: ["get", "delete"]
- apiGroups: [""]
resources: ["persistentvolumeclaims"]
verbs: ["get", "delete"]
```
See `deploy/k8s/base/lifecycle-sidecar-rbac.yaml` for the full RBAC configuration.
## Building
```bash
docker build -t ghcr.io/dexorder/lifecycle-sidecar:latest .
docker push ghcr.io/dexorder/lifecycle-sidecar:latest
```
## Example Usage
See `deploy/k8s/base/agent-deployment-example.yaml` for a complete example of how to configure an agent deployment with the lifecycle sidecar.
## Security Considerations
1. **Self-delete only**: The sidecar can only delete the deployment it's part of (enforced by label matching in admission policy)
2. **Non-privileged**: Runs as non-root user (UID 1000)
3. **Minimal permissions**: Only has `get` and `delete` on deployments/PVCs in the agents namespace
4. **No cross-namespace access**: Scoped to `dexorder-agents` namespace only
5. **Crash-safe**: Only triggers cleanup on exit code 42, never on crashes

16
lifecycle-sidecar/go.mod Normal file
View File

@@ -0,0 +1,16 @@
module github.com/dexorder/lifecycle-sidecar
go 1.22
require (
github.com/rs/zerolog v1.32.0
k8s.io/api v0.29.2
k8s.io/apimachinery v0.29.2
k8s.io/client-go v0.29.2
)
require (
github.com/mattn/go-colorable v0.1.13 // indirect
github.com/mattn/go-isatty v0.0.19 // indirect
golang.org/x/sys v0.17.0 // indirect
)

234
lifecycle-sidecar/main.go Normal file
View File

@@ -0,0 +1,234 @@
package main
import (
	"context"
	"fmt"
	"os"
	"os/exec"
	"os/signal"
	"syscall"
	"time"

	"github.com/rs/zerolog"
	"github.com/rs/zerolog/log"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/rest"
)
const (
	// ExitCodeIdleShutdown is the agent's exit code signaling a clean idle
	// shutdown; the sidecar deletes the deployment only for this value.
	ExitCodeIdleShutdown = 42
	// PollInterval is how often the sidecar checks whether the main
	// container's process is still alive.
	PollInterval = 5 * time.Second
)
// main wires up logging and the in-cluster Kubernetes client, waits for the
// agent container to exit, and either cleans up the deployment (exit code 42)
// or mirrors the agent's exit code so Kubernetes restart policy takes over.
func main() {
	// Human-readable console logging on stderr with unix timestamps.
	zerolog.TimeFieldFormat = zerolog.TimeFormatUnix
	log.Logger = log.Output(zerolog.ConsoleWriter{Out: os.Stderr})
	log.Info().Msg("Lifecycle sidecar starting")

	// Configuration comes entirely from the environment (see README table).
	var (
		namespace      = os.Getenv("NAMESPACE")
		deploymentName = os.Getenv("DEPLOYMENT_NAME")
		userType       = os.Getenv("USER_TYPE")
		mainPID        = os.Getenv("MAIN_CONTAINER_PID")
	)
	if namespace == "" || deploymentName == "" {
		log.Fatal().Msg("NAMESPACE and DEPLOYMENT_NAME environment variables are required")
	}
	log.Info().
		Str("namespace", namespace).
		Str("deployment", deploymentName).
		Str("userType", userType).
		Str("mainPID", mainPID).
		Msg("Configuration loaded")

	// Build the in-cluster client; the sidecar cannot function without it.
	config, err := rest.InClusterConfig()
	if err != nil {
		log.Fatal().Err(err).Msg("Failed to get in-cluster config")
	}
	clientset, err := kubernetes.NewForConfig(config)
	if err != nil {
		log.Fatal().Err(err).Msg("Failed to create Kubernetes client")
	}

	// Block until the main container terminates.
	exitCode := waitForMainContainer()
	log.Info().Int("exitCode", exitCode).Msg("Main container exited")

	if exitCode != ExitCodeIdleShutdown {
		// Any non-idle code is left to the pod's restart policy; the sidecar
		// simply mirrors the exit code and performs no cleanup.
		log.Info().
			Int("exitCode", exitCode).
			Msg("Non-idle exit code - allowing Kubernetes to handle restart")
		os.Exit(exitCode)
	}

	log.Info().Msg("Detected idle shutdown (exit code 42) - cleaning up deployment")
	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
	defer cancel()

	// Anonymous/temporary users lose their PVC along with the deployment.
	deletePVC := userType == "anonymous" || userType == "temporary"
	if err := cleanupDeployment(ctx, clientset, namespace, deploymentName, deletePVC); err != nil {
		log.Error().Err(err).Msg("Failed to cleanup deployment")
		os.Exit(1)
	}
	log.Info().Msg("Cleanup complete - sidecar exiting")
	os.Exit(0)
}
// waitForMainContainer blocks until the main container's process disappears
// and returns its (best-effort) exit code.
func waitForMainContainer() int {
	// Prefer precise monitoring when the agent's PID is exposed to us via the
	// environment; otherwise fall back to watching for a process named "agent".
	if pid := os.Getenv("MAIN_CONTAINER_PID"); pid != "" {
		return pollProcessExit(pid)
	}
	log.Info().Msg("MAIN_CONTAINER_PID not set, polling for 'agent' process")
	return pollProcessByName("agent")
}
// pollProcessExit waits for the process with the given PID to disappear,
// checking once per PollInterval, then returns the recorded exit code.
func pollProcessExit(pidStr string) int {
	log.Info().Str("pid", pidStr).Msg("Monitoring main container process")
	for {
		// `kill -0` probes for existence without delivering a signal; a
		// non-nil error means the PID is gone (or no longer visible to us).
		if err := exec.Command("kill", "-0", pidStr).Run(); err != nil {
			log.Info().Msg("Main container process exited")
			// The real exit status of a sibling container isn't directly
			// observable here, so consult the shared-file contract instead.
			return getContainerExitCode()
		}
		time.Sleep(PollInterval)
	}
}
// pollProcessByName waits until no process with the given exact name exists,
// checking once per PollInterval, then returns the recorded exit code.
func pollProcessByName(name string) int {
	log.Info().Str("name", name).Msg("Monitoring main container by name")
	for {
		// pgrep -x exits non-zero once no exact-name match remains.
		if err := exec.Command("pgrep", "-x", name).Run(); err != nil {
			log.Info().Msg("Main container process exited")
			return getContainerExitCode()
		}
		time.Sleep(PollInterval)
	}
}
// getContainerExitCode reads the exit code the agent wrote to the shared
// volume. Returns 0 when the file is absent or unparseable, which is safe:
// only the exact value 42 triggers cleanup, so 0 lets Kubernetes restart.
func getContainerExitCode() int {
	const exitCodeFile = "/var/run/agent/exit_code"
	if data, err := os.ReadFile(exitCodeFile); err == nil {
		var code int
		if _, scanErr := fmt.Sscanf(string(data), "%d", &code); scanErr == nil {
			log.Info().Int("exitCode", code).Msg("Read exit code from shared file")
			return code
		}
	}
	log.Warn().Msg("Could not determine exit code, defaulting to 0")
	return 0
}
// cleanupDeployment deletes the named deployment and, when deletePVC is set,
// the first PVC referenced by the deployment's pod template. PVC deletion is
// best-effort; only a failed deployment delete is reported as an error.
func cleanupDeployment(ctx context.Context, clientset *kubernetes.Clientset, namespace, deploymentName string, deletePVC bool) error {
	log.Info().
		Str("namespace", namespace).
		Str("deployment", deploymentName).
		Bool("deletePVC", deletePVC).
		Msg("Cleaning up deployment")

	// Resolve the PVC name from the pod template before the deployment
	// object disappears; a lookup failure downgrades to a warning.
	pvcName := ""
	if deletePVC {
		dep, err := clientset.AppsV1().Deployments(namespace).Get(ctx, deploymentName, metav1.GetOptions{})
		if err != nil {
			log.Warn().Err(err).Msg("Could not get deployment for PVC lookup")
		} else {
			for _, vol := range dep.Spec.Template.Spec.Volumes {
				if vol.PersistentVolumeClaim != nil {
					pvcName = vol.PersistentVolumeClaim.ClaimName
					break
				}
			}
		}
	}

	// Foreground propagation so dependents (ReplicaSets, Pods) are removed
	// before the deployment object itself is gone.
	propagation := metav1.DeletePropagationForeground
	log.Info().Str("deployment", deploymentName).Msg("Deleting deployment")
	if err := clientset.AppsV1().Deployments(namespace).Delete(ctx, deploymentName, metav1.DeleteOptions{
		PropagationPolicy: &propagation,
	}); err != nil {
		return fmt.Errorf("failed to delete deployment: %w", err)
	}
	log.Info().Msg("Deployment deleted successfully")

	if deletePVC && pvcName != "" {
		log.Info().Str("pvc", pvcName).Msg("Deleting PVC")
		if err := clientset.CoreV1().PersistentVolumeClaims(namespace).Delete(ctx, pvcName, metav1.DeleteOptions{}); err != nil {
			log.Warn().Err(err).Str("pvc", pvcName).Msg("Failed to delete PVC (non-fatal)")
		} else {
			log.Info().Msg("PVC deleted successfully")
		}
	}
	return nil
}
func init() {
// Register signal handler for graceful shutdown
// If sidecar receives SIGTERM, just exit cleanly
// Don't trigger deployment deletion on sidecar termination
go func() {
sigChan := make(chan os.Signal, 1)
syscall.Signal(syscall.SIGTERM)
<-sigChan
log.Info().Msg("Received SIGTERM - sidecar exiting without cleanup")
os.Exit(0)
}()
}