container lifecycle management
This commit is contained in:
230
client-py/dexorder/lifecycle_manager.py
Normal file
230
client-py/dexorder/lifecycle_manager.py
Normal file
@@ -0,0 +1,230 @@
|
||||
"""
|
||||
Container lifecycle manager for agent containers.
|
||||
|
||||
Tracks activity and triggers to determine when the container should shut down.
|
||||
Exits with code 42 to signal clean idle shutdown to the lifecycle sidecar.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import os
|
||||
import signal
|
||||
import sys
|
||||
from datetime import datetime, timedelta
|
||||
from pathlib import Path
|
||||
from typing import Optional, Set
|
||||
|
||||
# Module-level logger, named after this module.
logger: logging.Logger = logging.getLogger(__name__)

# Process exit status used to tell the lifecycle sidecar that the container
# stopped itself because it was idle (as opposed to crashing or being killed).
EXIT_CODE_IDLE_SHUTDOWN: int = 42

# Shared file where the exit status is written so the sidecar can read it.
EXIT_CODE_FILE: Path = Path("/var/run/agent/exit_code")
|
||||
|
||||
|
||||
class LifecycleManager:
    """
    Manages container lifecycle based on activity and triggers.

    The container shuts itself down when all of the following hold:
    1. No active triggers (data subscriptions, CEP patterns, etc.)
    2. No recent user activity (MCP calls)
    3. Idle timeout has elapsed

    A background asyncio task (see ``start``) re-evaluates these conditions
    every ``check_interval`` seconds.
    """

    def __init__(
        self,
        idle_timeout_minutes: int = 15,
        check_interval_seconds: int = 60,
        enable_shutdown: bool = True,
    ):
        """
        Initialize lifecycle manager.

        Args:
            idle_timeout_minutes: Minutes of inactivity before shutdown
            check_interval_seconds: Interval between idle checks
            enable_shutdown: If False, only log idle state without exiting (for testing)
        """
        self.idle_timeout = timedelta(minutes=idle_timeout_minutes)
        self.check_interval = check_interval_seconds
        self.enable_shutdown = enable_shutdown

        # NOTE(review): this is naive local wall-clock time, so a clock step
        # (NTP correction, DST change) shifts the measured idle period.
        # Kept as a datetime because the attribute is public.
        self.last_activity: datetime = datetime.now()
        self.active_triggers: Set[str] = set()
        self._running = False
        self._check_task: Optional[asyncio.Task] = None

        logger.info(
            "Lifecycle manager initialized: idle_timeout=%dm, check_interval=%ds, shutdown_enabled=%s",
            idle_timeout_minutes,
            check_interval_seconds,
            enable_shutdown,
        )

    def record_activity(self) -> None:
        """
        Record user activity (called on MCP tool/resource/prompt invocations).

        Resets the idle timer.
        """
        self.last_activity = datetime.now()
        logger.debug("Activity recorded, idle timer reset")

    def update_triggers(self, triggers: Set[str]) -> None:
        """
        Replace the set of active triggers, logging what changed.

        Args:
            triggers: Set of active trigger IDs (data subscriptions, CEP patterns, etc.)
        """
        # Take a private copy: storing the caller's set by reference would let
        # add_trigger/remove_trigger mutate the caller's object, and let the
        # caller change our state without the change being logged.
        new_triggers = set(triggers)
        if new_triggers == self.active_triggers:
            return

        added = new_triggers - self.active_triggers
        removed = self.active_triggers - new_triggers

        if added:
            logger.info("Triggers added: %s", added)
        if removed:
            logger.info("Triggers removed: %s", removed)

        self.active_triggers = new_triggers
        logger.info("Active triggers: %d", len(self.active_triggers))

    def add_trigger(self, trigger_id: str) -> None:
        """Add a single trigger; a trigger that is already active is ignored."""
        if trigger_id not in self.active_triggers:
            self.active_triggers.add(trigger_id)
            logger.info("Trigger added: %s (total: %d)", trigger_id, len(self.active_triggers))

    def remove_trigger(self, trigger_id: str) -> None:
        """Remove a single trigger; an unknown trigger is ignored."""
        if trigger_id in self.active_triggers:
            self.active_triggers.remove(trigger_id)
            logger.info("Trigger removed: %s (total: %d)", trigger_id, len(self.active_triggers))

    def is_idle(self) -> bool:
        """
        Check if container is idle and should shut down.

        Returns:
            True if no triggers and idle timeout exceeded
        """
        if self.active_triggers:
            logger.debug("Not idle: has %d active triggers", len(self.active_triggers))
            return False

        idle_time = datetime.now() - self.last_activity
        if idle_time <= self.idle_timeout:
            logger.debug(
                "Not idle: last activity %s ago (timeout: %s)",
                idle_time,
                self.idle_timeout,
            )
            return False

        logger.info(
            "Container is idle: no triggers and %s since last activity", idle_time
        )
        return True

    async def start(self) -> None:
        """Start the lifecycle manager background task (no-op if already running)."""
        if self._running:
            logger.warning("Lifecycle manager already running")
            return

        self._running = True
        self._check_task = asyncio.create_task(self._check_loop())
        logger.info("Lifecycle manager started")

    async def stop(self) -> None:
        """Stop the lifecycle manager and wait for the background task to finish."""
        self._running = False
        # Detach the task first so a finished/cancelled task is never left
        # referenced on the instance.
        task, self._check_task = self._check_task, None
        if task is not None:
            task.cancel()
            try:
                await task
            except asyncio.CancelledError:
                pass
        logger.info("Lifecycle manager stopped")

    async def _check_loop(self) -> None:
        """Background task that periodically checks if container should shut down."""
        while self._running:
            try:
                await asyncio.sleep(self.check_interval)

                if not self.is_idle():
                    continue

                if not self.enable_shutdown:
                    logger.info(
                        "Container is idle but shutdown is disabled (testing mode)"
                    )
                    continue

                logger.info("Initiating idle shutdown (exit code %d)", EXIT_CODE_IDLE_SHUTDOWN)
                self._write_exit_code(EXIT_CODE_IDLE_SHUTDOWN)

                # Give sidecar a moment to see the exit code file
                await asyncio.sleep(1)

                # os._exit skips interpreter cleanup, so flush log handlers
                # explicitly; the finally guarantees the exit still happens
                # even if flushing fails.
                try:
                    logging.shutdown()
                finally:
                    # Exit with special code
                    os._exit(EXIT_CODE_IDLE_SHUTDOWN)

            except asyncio.CancelledError:
                logger.info("Check loop cancelled")
                raise
            except Exception as e:
                # Keep the loop alive: one failed check must not disable idle shutdown.
                logger.exception("Error in lifecycle check loop: %s", e)

    def _write_exit_code(self, code: int) -> None:
        """
        Write exit code to shared file for sidecar to read.

        Best-effort: failures are logged as warnings and not raised.
        """
        try:
            EXIT_CODE_FILE.parent.mkdir(parents=True, exist_ok=True)
            EXIT_CODE_FILE.write_text(str(code))
            logger.debug("Wrote exit code %d to %s", code, EXIT_CODE_FILE)
        except Exception as e:
            logger.warning("Failed to write exit code file: %s", e)

    def setup_signal_handlers(self) -> None:
        """
        Setup signal handlers for graceful shutdown.

        On SIGTERM/SIGINT, exit normally (not with code 42) to allow restart.
        """

        def signal_handler(signum, frame):
            logger.info("Received signal %d, exiting normally", signum)
            sys.exit(0)

        signal.signal(signal.SIGTERM, signal_handler)
        signal.signal(signal.SIGINT, signal_handler)
|
||||
|
||||
|
||||
# Singleton instance for easy access across the application
_lifecycle_manager: Optional[LifecycleManager] = None


def _env_int(name: str, default: int) -> int:
    """Return environment variable *name* parsed as an int, or *default*.

    An unset or blank variable yields *default* silently; a value that is not
    a valid integer yields *default* and logs a warning instead of raising.
    """
    raw = os.environ.get(name)
    if raw is None or not raw.strip():
        return default
    try:
        return int(raw)
    except ValueError:
        logger.warning(
            "Invalid integer %r for %s; using default %d", raw, name, default
        )
        return default


def get_lifecycle_manager() -> LifecycleManager:
    """Get or create the global lifecycle manager instance.

    On first call the manager is configured from the environment:
    ``IDLE_TIMEOUT_MINUTES`` (default 15), ``IDLE_CHECK_INTERVAL_SECONDS``
    (default 60) and ``ENABLE_IDLE_SHUTDOWN`` (default "true"; only the value
    "true", case-insensitive, enables shutdown). Later calls return the same
    instance and do not re-read the environment.
    """
    global _lifecycle_manager
    if _lifecycle_manager is None:
        # Load configuration from environment; malformed numbers fall back to
        # the defaults rather than crashing container startup.
        idle_timeout = _env_int("IDLE_TIMEOUT_MINUTES", 15)
        check_interval = _env_int("IDLE_CHECK_INTERVAL_SECONDS", 60)
        flag = os.environ.get("ENABLE_IDLE_SHUTDOWN", "true")
        enable_shutdown = flag.strip().lower() == "true"

        _lifecycle_manager = LifecycleManager(
            idle_timeout_minutes=idle_timeout,
            check_interval_seconds=check_interval,
            enable_shutdown=enable_shutdown,
        )
    return _lifecycle_manager
|
||||
|
||||
|
||||
async def start_lifecycle_manager() -> LifecycleManager:
    """Obtain the shared lifecycle manager, install its signal handlers, and start it.

    Returns:
        The global LifecycleManager instance after ``start()`` has been awaited.
    """
    lifecycle = get_lifecycle_manager()
    lifecycle.setup_signal_handlers()
    await lifecycle.start()
    return lifecycle
|
||||
43
client-py/dexorder/mcp_auth_middleware.py
Normal file
43
client-py/dexorder/mcp_auth_middleware.py
Normal file
@@ -0,0 +1,43 @@
|
||||
# openclaw/auth.py
|
||||
|
||||
class MCPAuthMiddleware:
|
||||
"""Authenticates incoming MCP connections based on configured mode."""
|
||||
|
||||
def __init__(self, config: AuthConfig):
|
||||
self.config = config
|
||||
self._jwks_client = None # lazy-loaded for platform mode
|
||||
|
||||
async def authenticate(self, request) -> AuthContext:
|
||||
match self.config.mode:
|
||||
case "local":
|
||||
# stdio transport or localhost-only binding
|
||||
# No auth needed — if you can exec into the container,
|
||||
# you're the user
|
||||
return AuthContext(user_id=self.config.local_user_id,
|
||||
source="local")
|
||||
|
||||
case "token":
|
||||
# User-generated API key (standalone remote access)
|
||||
token = extract_bearer_token(request)
|
||||
if not verify_token_hash(token, self.config.tokens):
|
||||
raise AuthError("Invalid API token")
|
||||
return AuthContext(user_id=self.config.local_user_id,
|
||||
source="api_key")
|
||||
|
||||
case "platform":
|
||||
# JWT signed by the OpenClaw platform
|
||||
token = extract_bearer_token(request)
|
||||
claims = await self._verify_platform_jwt(token)
|
||||
if claims["sub"] != self.config.expected_user_id:
|
||||
raise AuthError("User ID mismatch")
|
||||
return AuthContext(user_id=claims["sub"],
|
||||
source="platform",
|
||||
scopes=claims.get("scopes", []))
|
||||
|
||||
async def _verify_platform_jwt(self, token: str) -> dict:
|
||||
if not self._jwks_client:
|
||||
self._jwks_client = JWKSClient(self.config.platform_jwks_url)
|
||||
signing_key = await self._jwks_client.get_signing_key_from_jwt(token)
|
||||
return jwt.decode(token, signing_key.key,
|
||||
algorithms=["RS256"],
|
||||
audience="openclaw-mcp")
|
||||
Reference in New Issue
Block a user