container lifecycle management

This commit is contained in:
2026-03-12 15:13:38 -04:00
parent e99ef5d2dd
commit b9cc397e05
61 changed files with 6880 additions and 31 deletions

View File

@@ -0,0 +1,230 @@
"""
Container lifecycle manager for agent containers.
Tracks activity and triggers to determine when the container should shut down.
Exits with code 42 to signal clean idle shutdown to the lifecycle sidecar.
"""
import asyncio
import logging
import os
import signal
import sys
from datetime import datetime, timedelta
from pathlib import Path
from typing import Optional, Set
logger = logging.getLogger(__name__)

# Exit code that signals a clean idle shutdown to the lifecycle sidecar.
EXIT_CODE_IDLE_SHUTDOWN = 42

# File the exit code is written to so the sidecar can read it.
EXIT_CODE_FILE = Path("/var/run/agent/exit_code")


class LifecycleManager:
    """
    Manages container lifecycle based on activity and triggers.

    The container shuts itself down when all of the following hold:
        1. No active triggers (data subscriptions, CEP patterns, etc.)
        2. No recent user activity (MCP calls)
        3. Idle timeout has elapsed
    """

    def __init__(
        self,
        idle_timeout_minutes: int = 15,
        check_interval_seconds: int = 60,
        enable_shutdown: bool = True,
    ):
        """
        Initialize lifecycle manager.

        Args:
            idle_timeout_minutes: Minutes of inactivity before shutdown
            check_interval_seconds: Interval between idle checks
            enable_shutdown: If False, only log idle state without exiting
                (for testing)
        """
        self.idle_timeout = timedelta(minutes=idle_timeout_minutes)
        self.check_interval = check_interval_seconds
        self.enable_shutdown = enable_shutdown
        # NOTE(review): naive local wall-clock time; a clock adjustment shifts
        # the measured idle time. Consider a monotonic clock if that matters.
        self.last_activity: datetime = datetime.now()
        self.active_triggers: Set[str] = set()
        self._running = False
        self._check_task: Optional[asyncio.Task] = None
        logger.info(
            "Lifecycle manager initialized: idle_timeout=%dm, check_interval=%ds, shutdown_enabled=%s",
            idle_timeout_minutes,
            check_interval_seconds,
            enable_shutdown,
        )

    def record_activity(self) -> None:
        """
        Record user activity (called on MCP tool/resource/prompt invocations).

        Resets the idle timer.
        """
        self.last_activity = datetime.now()
        logger.debug("Activity recorded, idle timer reset")

    def update_triggers(self, triggers: Set[str]) -> None:
        """
        Replace the set of active triggers.

        The incoming set is copied, so later add_trigger()/remove_trigger()
        calls never mutate the caller's object.

        Args:
            triggers: Set of active trigger IDs (data subscriptions,
                CEP patterns, etc.)
        """
        new_triggers = set(triggers)  # defensive copy: do not alias caller state
        if new_triggers == self.active_triggers:
            return

        added = new_triggers - self.active_triggers
        removed = self.active_triggers - new_triggers
        if added:
            logger.info("Triggers added: %s", added)
        if removed:
            logger.info("Triggers removed: %s", removed)
        self.active_triggers = new_triggers
        logger.info("Active triggers: %d", len(self.active_triggers))

    def add_trigger(self, trigger_id: str) -> None:
        """Add a single trigger; a trigger that is already present is ignored."""
        if trigger_id not in self.active_triggers:
            self.active_triggers.add(trigger_id)
            logger.info("Trigger added: %s (total: %d)", trigger_id, len(self.active_triggers))

    def remove_trigger(self, trigger_id: str) -> None:
        """Remove a single trigger; an unknown trigger is ignored."""
        if trigger_id in self.active_triggers:
            self.active_triggers.remove(trigger_id)
            logger.info("Trigger removed: %s (total: %d)", trigger_id, len(self.active_triggers))

    def is_idle(self) -> bool:
        """
        Check if container is idle and should shut down.

        Returns:
            True if there are no triggers and the idle timeout is exceeded
        """
        if self.active_triggers:
            logger.debug("Not idle: has %d active triggers", len(self.active_triggers))
            return False

        idle_time = datetime.now() - self.last_activity
        if idle_time <= self.idle_timeout:
            logger.debug(
                "Not idle: last activity %s ago (timeout: %s)",
                idle_time,
                self.idle_timeout,
            )
            return False

        logger.info(
            "Container is idle: no triggers and %s since last activity", idle_time
        )
        return True

    async def start(self) -> None:
        """Start the lifecycle manager background task (no-op if already running)."""
        if self._running:
            logger.warning("Lifecycle manager already running")
            return
        self._running = True
        self._check_task = asyncio.create_task(self._check_loop())
        logger.info("Lifecycle manager started")

    async def stop(self) -> None:
        """Stop the lifecycle manager and wait for the background task to end."""
        self._running = False
        if self._check_task:
            self._check_task.cancel()
            try:
                await self._check_task
            except asyncio.CancelledError:
                pass
            # Drop the finished task so no stale handle is kept around.
            self._check_task = None
        logger.info("Lifecycle manager stopped")

    async def _check_loop(self) -> None:
        """Background task that periodically checks if container should shut down."""
        while self._running:
            try:
                await asyncio.sleep(self.check_interval)
                if not self.is_idle():
                    continue
                if not self.enable_shutdown:
                    logger.info(
                        "Container is idle but shutdown is disabled (testing mode)"
                    )
                    continue

                logger.info("Initiating idle shutdown (exit code %d)", EXIT_CODE_IDLE_SHUTDOWN)
                self._write_exit_code(EXIT_CODE_IDLE_SHUTDOWN)
                # Give sidecar a moment to see the exit code file
                await asyncio.sleep(1)
                # os._exit skips all interpreter cleanup, so flush log handlers
                # explicitly or the final shutdown messages may be lost.
                logging.shutdown()
                # Exit with special code
                os._exit(EXIT_CODE_IDLE_SHUTDOWN)
            except asyncio.CancelledError:
                logger.info("Check loop cancelled")
                raise
            except Exception as e:
                # Keep the loop alive: one failed check must not disable
                # idle detection for the rest of the container's life.
                logger.exception("Error in lifecycle check loop: %s", e)

    def _write_exit_code(self, code: int) -> None:
        """Write exit code to shared file for sidecar to read (best effort)."""
        try:
            EXIT_CODE_FILE.parent.mkdir(parents=True, exist_ok=True)
            EXIT_CODE_FILE.write_text(str(code))
            logger.debug("Wrote exit code %d to %s", code, EXIT_CODE_FILE)
        except Exception as e:
            # Deliberately broad: failing to write the file must not prevent
            # the caller from proceeding with the shutdown.
            logger.warning("Failed to write exit code file: %s", e)

    def setup_signal_handlers(self) -> None:
        """
        Setup signal handlers for graceful shutdown.

        On SIGTERM/SIGINT, exit normally (not with code 42) to allow restart.
        """

        def signal_handler(signum, frame):
            logger.info("Received signal %d, exiting normally", signum)
            sys.exit(0)

        signal.signal(signal.SIGTERM, signal_handler)
        signal.signal(signal.SIGINT, signal_handler)
# Singleton instance for easy access across the application
_lifecycle_manager: Optional[LifecycleManager] = None


def get_lifecycle_manager() -> LifecycleManager:
    """
    Get or create the global lifecycle manager instance.

    On first call the manager is configured from environment variables:
        IDLE_TIMEOUT_MINUTES: integer, default 15
        IDLE_CHECK_INTERVAL_SECONDS: integer, default 60
        ENABLE_IDLE_SHUTDOWN: boolean flag, default "true"; "1", "true",
            "yes" and "on" (any case, surrounding whitespace ignored)
            enable shutdown, anything else disables it

    Returns:
        The shared LifecycleManager instance

    Raises:
        ValueError: If a numeric environment variable is not a valid integer
    """
    global _lifecycle_manager
    if _lifecycle_manager is None:
        # Load configuration from environment
        idle_timeout = int(os.environ.get("IDLE_TIMEOUT_MINUTES", "15"))
        check_interval = int(os.environ.get("IDLE_CHECK_INTERVAL_SECONDS", "60"))
        # Accept the common truthy spellings so that e.g. "1" or " true "
        # does not silently turn idle shutdown off.
        raw_flag = os.environ.get("ENABLE_IDLE_SHUTDOWN", "true").strip().lower()
        enable_shutdown = raw_flag in {"1", "true", "yes", "on"}
        _lifecycle_manager = LifecycleManager(
            idle_timeout_minutes=idle_timeout,
            check_interval_seconds=check_interval,
            enable_shutdown=enable_shutdown,
        )
    return _lifecycle_manager
async def start_lifecycle_manager() -> LifecycleManager:
    """
    Fetch the shared lifecycle manager, install its signal handlers and
    start its background check task.

    Returns:
        The started LifecycleManager instance
    """
    lifecycle = get_lifecycle_manager()
    # Handlers are installed before the background task starts.
    lifecycle.setup_signal_handlers()
    await lifecycle.start()
    return lifecycle

View File

@@ -0,0 +1,43 @@
# openclaw/auth.py
class MCPAuthMiddleware:
"""Authenticates incoming MCP connections based on configured mode."""
def __init__(self, config: AuthConfig):
self.config = config
self._jwks_client = None # lazy-loaded for platform mode
async def authenticate(self, request) -> AuthContext:
match self.config.mode:
case "local":
# stdio transport or localhost-only binding
# No auth needed — if you can exec into the container,
# you're the user
return AuthContext(user_id=self.config.local_user_id,
source="local")
case "token":
# User-generated API key (standalone remote access)
token = extract_bearer_token(request)
if not verify_token_hash(token, self.config.tokens):
raise AuthError("Invalid API token")
return AuthContext(user_id=self.config.local_user_id,
source="api_key")
case "platform":
# JWT signed by the OpenClaw platform
token = extract_bearer_token(request)
claims = await self._verify_platform_jwt(token)
if claims["sub"] != self.config.expected_user_id:
raise AuthError("User ID mismatch")
return AuthContext(user_id=claims["sub"],
source="platform",
scopes=claims.get("scopes", []))
async def _verify_platform_jwt(self, token: str) -> dict:
if not self._jwks_client:
self._jwks_client = JWKSClient(self.config.platform_jwks_url)
signing_key = await self._jwks_client.get_signing_key_from_jwt(token)
return jwt.decode(token, signing_key.key,
algorithms=["RS256"],
audience="openclaw-mcp")