Files
ai/lifecycle-sidecar/main.go

236 lines
6.7 KiB
Go

package main
import (
"context"
"fmt"
"os"
"os/exec"
"os/signal"
"syscall"
"time"
"github.com/rs/zerolog"
"github.com/rs/zerolog/log"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/rest"
)
// Constants shared by the sidecar's exit handling and polling loops.
const (
// ExitCodeIdleShutdown is the main container exit code that main interprets
// as a clean idle shutdown; main responds to it by cleaning up the deployment.
ExitCodeIdleShutdown = 42
// PollInterval is the pause between successive liveness checks of the main
// container process in the polling loops.
PollInterval = 5 * time.Second
)
// main runs the lifecycle sidecar: it configures logging, reads its settings
// from the environment, builds an in-cluster Kubernetes client, blocks until
// the main container process is gone, and then acts on the exit code.
//
// Environment:
//   - NAMESPACE, DEPLOYMENT_NAME: required; the deployment to clean up.
//   - USER_TYPE: "anonymous" or "temporary" additionally requests PVC deletion.
//   - MAIN_CONTAINER_PID: optional; only logged here, consumed by
//     waitForMainContainer.
//
// Process exit status: 0 after a successful idle cleanup, 1 if the cleanup
// fails, otherwise the main container's own exit code.
func main() {
	// Setup logging
	zerolog.TimeFieldFormat = zerolog.TimeFormatUnix
	log.Logger = log.Output(zerolog.ConsoleWriter{Out: os.Stderr})

	log.Info().Msg("Lifecycle sidecar starting")

	// Get environment configuration
	namespace := os.Getenv("NAMESPACE")
	deploymentName := os.Getenv("DEPLOYMENT_NAME")
	userType := os.Getenv("USER_TYPE")
	mainContainerPID := os.Getenv("MAIN_CONTAINER_PID")

	if namespace == "" || deploymentName == "" {
		log.Fatal().Msg("NAMESPACE and DEPLOYMENT_NAME environment variables are required")
	}

	log.Info().
		Str("namespace", namespace).
		Str("deployment", deploymentName).
		Str("userType", userType).
		Str("mainPID", mainContainerPID).
		Msg("Configuration loaded")

	// Create the Kubernetes client up front so a misconfigured pod fails
	// immediately instead of after the main container has already exited.
	config, err := rest.InClusterConfig()
	if err != nil {
		log.Fatal().Err(err).Msg("Failed to get in-cluster config")
	}

	clientset, err := kubernetes.NewForConfig(config)
	if err != nil {
		log.Fatal().Err(err).Msg("Failed to create Kubernetes client")
	}

	// Wait for main container to exit
	exitCode := waitForMainContainer()
	log.Info().Int("exitCode", exitCode).Msg("Main container exited")

	// Any exit code other than the idle marker is passed through untouched so
	// the Kubernetes restart policy decides what happens next.
	if exitCode != ExitCodeIdleShutdown {
		log.Info().
			Int("exitCode", exitCode).
			Msg("Non-idle exit code - allowing Kubernetes to handle restart")
		os.Exit(exitCode)
	}

	log.Info().Msg("Detected idle shutdown (exit code 42) - cleaning up deployment")

	// Delete PVC if anonymous user
	deletePVC := userType == "anonymous" || userType == "temporary"

	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
	err = cleanupDeployment(ctx, clientset, namespace, deploymentName, deletePVC)
	// Release the context explicitly: every path below ends in os.Exit, which
	// does not run deferred functions.
	cancel()
	if err != nil {
		log.Error().Err(err).Msg("Failed to cleanup deployment")
		os.Exit(1)
	}

	log.Info().Msg("Cleanup complete - sidecar exiting")
	os.Exit(0)
}
// waitForMainContainer blocks until the main container process is gone and
// returns the exit code determined for it.
//
// When MAIN_CONTAINER_PID is set, that PID is polled directly; otherwise the
// function falls back to polling for a process named "agent".
func waitForMainContainer() int {
	// Preferred path: an explicit PID supplied through the environment.
	if pid := os.Getenv("MAIN_CONTAINER_PID"); pid != "" {
		return pollProcessExit(pid)
	}

	// Fallback path: locate the main process by its name.
	log.Info().Msg("MAIN_CONTAINER_PID not set, polling for 'agent' process")
	return pollProcessByName("agent")
}
// pollProcessExit blocks until the process identified by pidStr no longer
// exists, then returns the exit code reported by getContainerExitCode.
//
// Liveness is checked every PollInterval by sending signal 0 with
// syscall.Kill, which asks the kernel to validate the target without
// delivering a signal. This avoids forking an external "kill" binary on each
// poll and lets "permission denied" be told apart from "no such process".
//
// If pidStr is not a positive decimal integer the process cannot be
// monitored; that case is logged and handled like an exited process.
func pollProcessExit(pidStr string) int {
	log.Info().Str("pid", pidStr).Msg("Monitoring main container process")

	pid, err := parsePID(pidStr)
	if err != nil {
		log.Error().Err(err).Str("pid", pidStr).Msg("Invalid main container PID - cannot monitor process")
		return getContainerExitCode()
	}

	for processAlive(pid) {
		time.Sleep(PollInterval)
	}

	log.Info().Msg("Main container process exited")

	// The exit status of a process that is not our child is not available
	// here, so defer to the best-effort lookup.
	return getContainerExitCode()
}

// parsePID converts s into a process ID, accepting only the canonical decimal
// form of a positive integer.
func parsePID(s string) (int, error) {
	var pid int
	if _, err := fmt.Sscanf(s, "%d", &pid); err != nil {
		return 0, fmt.Errorf("parsing PID %q: %w", s, err)
	}
	// Sscanf stops at the first non-digit, so "12abc" would scan as 12; the
	// round-trip comparison rejects such input. Non-positive values are
	// rejected because kill(2) interprets them as process groups.
	if pid <= 0 || fmt.Sprint(pid) != s {
		return 0, fmt.Errorf("PID %q is not a positive decimal integer", s)
	}
	return pid, nil
}

// processAlive reports whether a process with the given PID currently exists.
func processAlive(pid int) bool {
	err := syscall.Kill(pid, 0)
	// EPERM means the process exists but is owned by another user.
	return err == nil || err == syscall.EPERM
}
// pollProcessByName blocks until "pgrep -x name" stops succeeding, then
// returns the exit code reported by getContainerExitCode.
//
// pgrep is re-run every PollInterval. Any error from running it, including a
// failure to start the command, is treated as the process being gone.
func pollProcessByName(name string) int {
	log.Info().Str("name", name).Msg("Monitoring main container by name")

	// Keep sleeping for as long as pgrep still reports success.
	for exec.Command("pgrep", "-x", name).Run() == nil {
		time.Sleep(PollInterval)
	}

	log.Info().Msg("Main container process exited")
	return getContainerExitCode()
}
// getContainerExitCode makes a best-effort attempt to recover the exit code
// of the main container.
//
// It reads /var/run/agent/exit_code and parses a leading decimal integer from
// its contents. If the file cannot be read or parsed, the cause is logged and
// 0 is returned. main treats every code other than ExitCodeIdleShutdown as a
// non-idle exit, so this fallback never triggers cleanup.
func getContainerExitCode() int {
	// Location where the main container is expected to leave its exit code.
	const exitCodeFile = "/var/run/agent/exit_code"

	data, err := os.ReadFile(exitCodeFile)
	if err == nil {
		var exitCode int
		if _, err = fmt.Sscanf(string(data), "%d", &exitCode); err == nil {
			log.Info().Int("exitCode", exitCode).Msg("Read exit code from shared file")
			return exitCode
		}
	}

	// err holds whichever step failed (read or parse); include it so the
	// warning explains why the fallback was used.
	log.Warn().
		Err(err).
		Str("file", exitCodeFile).
		Msg("Could not determine exit code, defaulting to 0")
	return 0
}
// cleanupDeployment deletes the named deployment with foreground propagation
// and, when deletePVC is true, also deletes the first PersistentVolumeClaim
// referenced by the deployment's pod template volumes.
//
// The claim name is looked up before the deployment is deleted. A failed
// lookup or a failed PVC deletion is only logged; the sole error returned is
// a failure to delete the deployment itself.
//
// NOTE(review): if this sidecar runs inside the deployment being deleted, the
// resulting SIGTERM could reach the handler registered in init before the PVC
// delete is issued - confirm that ordering is acceptable.
func cleanupDeployment(ctx context.Context, clientset *kubernetes.Clientset, namespace, deploymentName string, deletePVC bool) error {
	log.Info().
		Str("namespace", namespace).
		Str("deployment", deploymentName).
		Bool("deletePVC", deletePVC).
		Msg("Cleaning up deployment")

	deployments := clientset.AppsV1().Deployments(namespace)

	// Resolve the claim name first; the volume list is gone once the
	// deployment has been deleted.
	claim := ""
	if deletePVC {
		dep, getErr := deployments.Get(ctx, deploymentName, metav1.GetOptions{})
		if getErr != nil {
			log.Warn().Err(getErr).Msg("Could not get deployment for PVC lookup")
		} else {
			volumes := dep.Spec.Template.Spec.Volumes
			for i := range volumes {
				if src := volumes[i].PersistentVolumeClaim; src != nil {
					// Only the first PVC-backed volume is considered.
					claim = src.ClaimName
					break
				}
			}
		}
	}

	// Delete deployment
	propagation := metav1.DeletePropagationForeground
	log.Info().Str("deployment", deploymentName).Msg("Deleting deployment")
	if delErr := deployments.Delete(ctx, deploymentName, metav1.DeleteOptions{
		PropagationPolicy: &propagation,
	}); delErr != nil {
		return fmt.Errorf("failed to delete deployment: %w", delErr)
	}
	log.Info().Msg("Deployment deleted successfully")

	// Nothing more to do unless a PVC was requested and found.
	if !deletePVC || claim == "" {
		return nil
	}

	log.Info().Str("pvc", claim).Msg("Deleting PVC")
	if pvcErr := clientset.CoreV1().PersistentVolumeClaims(namespace).Delete(ctx, claim, metav1.DeleteOptions{}); pvcErr != nil {
		log.Warn().Err(pvcErr).Str("pvc", claim).Msg("Failed to delete PVC (non-fatal)")
		return nil
	}
	log.Info().Msg("PVC deleted successfully")
	return nil
}
// init installs the SIGTERM handler for the sidecar.
//
// On SIGTERM the sidecar logs and exits with status 0 without touching the
// deployment: termination of the sidecar itself must not trigger cleanup.
//
// signal.Notify is called synchronously so the handler is registered before
// main starts. Only the blocking wait runs in the goroutine, which lives
// until the process exits.
func init() {
	sigChan := make(chan os.Signal, 1)
	signal.Notify(sigChan, syscall.SIGTERM)

	go func() {
		<-sigChan
		log.Info().Msg("Received SIGTERM - sidecar exiting without cleanup")
		os.Exit(0)
	}()
}