Files
ai/sandbox/dexorder/lifecycle_manager.py

241 lines
8.4 KiB
Python

"""
Container lifecycle manager for agent containers.
Tracks activity and triggers to determine when the container should shut down.
Exits with code 42 to signal clean idle shutdown to the lifecycle sidecar.
"""
import asyncio
import logging
import os
import signal
import sys
from datetime import datetime, timedelta
from pathlib import Path
from typing import Optional, Set
# Module-level logger; its name is this module's __name__.
logger = logging.getLogger(__name__)
# Process exit status used for an idle shutdown. Per the module docstring,
# the lifecycle sidecar reads this value as a clean, intentional stop.
EXIT_CODE_IDLE_SHUTDOWN = 42
# Path where the exit code is written as text for the sidecar to read.
EXIT_CODE_FILE = Path("/var/run/agent/exit_code")
class LifecycleManager:
    """
    Manages container lifecycle based on activity and triggers.

    The container shuts itself down when all of the following hold:
    1. No active triggers (data subscriptions, CEP patterns, etc.)
    2. No recent user activity (MCP calls)
    3. Idle timeout has elapsed
    """

    def __init__(
        self,
        idle_timeout_minutes: int = 15,
        check_interval_seconds: int = 60,
        enable_shutdown: bool = True,
    ):
        """
        Initialize lifecycle manager.

        Args:
            idle_timeout_minutes: Minutes of inactivity before shutdown
            check_interval_seconds: Interval between idle checks
            enable_shutdown: If False, only log idle state without exiting (for testing)
        """
        self.idle_timeout = timedelta(minutes=idle_timeout_minutes)
        self.check_interval = check_interval_seconds
        self.enable_shutdown = enable_shutdown
        # The idle timer starts at construction time.
        self.last_activity: datetime = datetime.now()
        self.active_triggers: Set[str] = set()
        self._running = False
        self._check_task: Optional[asyncio.Task] = None
        logger.info(
            "Lifecycle manager initialized: idle_timeout=%dm, check_interval=%ds, shutdown_enabled=%s",
            idle_timeout_minutes,
            check_interval_seconds,
            enable_shutdown,
        )

    def record_activity(self) -> None:
        """
        Record user activity (called on MCP tool/resource/prompt invocations).

        Resets the idle timer.
        """
        self.last_activity = datetime.now()
        logger.debug("Activity recorded, idle timer reset")

    def update_triggers(self, triggers: Set[str]) -> None:
        """
        Replace the set of active triggers.

        Nothing is logged or changed when ``triggers`` equals the current set.

        Args:
            triggers: Set of active trigger IDs (data subscriptions, CEP patterns, etc.)
        """
        if triggers != self.active_triggers:
            added = triggers - self.active_triggers
            removed = self.active_triggers - triggers
            if added:
                logger.info("Triggers added: %s", added)
            if removed:
                logger.info("Triggers removed: %s", removed)
            # Store a private copy: keeping the caller's object would let
            # add_trigger()/remove_trigger() mutate the caller's set (and let
            # the caller silently change our state). Copying also makes a
            # frozenset argument safe for later in-place updates.
            self.active_triggers = set(triggers)
            logger.info("Active triggers: %d", len(self.active_triggers))

    def add_trigger(self, trigger_id: str) -> None:
        """Add a single trigger; a no-op if it is already registered."""
        if trigger_id not in self.active_triggers:
            self.active_triggers.add(trigger_id)
            logger.info("Trigger added: %s (total: %d)", trigger_id, len(self.active_triggers))

    def remove_trigger(self, trigger_id: str) -> None:
        """Remove a single trigger; a no-op if it is not registered."""
        if trigger_id in self.active_triggers:
            self.active_triggers.remove(trigger_id)
            logger.info("Trigger removed: %s (total: %d)", trigger_id, len(self.active_triggers))

    def is_idle(self) -> bool:
        """
        Check if container is idle and should shut down.

        Returns:
            True if there are no triggers and the idle timeout has been exceeded
        """
        if self.active_triggers:
            logger.debug("Not idle: has %d active triggers", len(self.active_triggers))
            return False

        idle_time = datetime.now() - self.last_activity
        if not idle_time > self.idle_timeout:
            logger.debug(
                "Not idle: last activity %s ago (timeout: %s)",
                idle_time,
                self.idle_timeout,
            )
            return False

        logger.info(
            "Container is idle: no triggers and %s since last activity", idle_time
        )
        return True

    async def start(self) -> None:
        """Start the lifecycle manager background task (no-op if already running)."""
        if self._running:
            logger.warning("Lifecycle manager already running")
            return
        self._running = True
        self._check_task = asyncio.create_task(self._check_loop())
        logger.info("Lifecycle manager started")

    async def stop(self) -> None:
        """Stop the lifecycle manager and wait for the background task to finish."""
        self._running = False
        # Detach the task first so no stale reference to a cancelled task
        # survives this call.
        task, self._check_task = self._check_task, None
        if task is not None:
            task.cancel()
            try:
                await task
            except asyncio.CancelledError:
                pass
        logger.info("Lifecycle manager stopped")

    async def _check_loop(self) -> None:
        """Background task that periodically checks if container should shut down."""
        while self._running:
            try:
                await asyncio.sleep(self.check_interval)
                if not self.is_idle():
                    continue
                if not self.enable_shutdown:
                    logger.info(
                        "Container is idle but shutdown is disabled (testing mode)"
                    )
                    continue

                logger.info("Initiating idle shutdown (exit code %d)", EXIT_CODE_IDLE_SHUTDOWN)
                self._write_exit_code(EXIT_CODE_IDLE_SHUTDOWN)
                # Give sidecar a moment to see the exit code file
                await asyncio.sleep(1)
                # os._exit() skips atexit hooks, which is where logging would
                # normally be flushed; do it by hand so the records above are
                # not lost.
                logging.shutdown()
                # Exit with special code
                os._exit(EXIT_CODE_IDLE_SHUTDOWN)
            except asyncio.CancelledError:
                logger.info("Check loop cancelled")
                raise
            except Exception as e:
                # Keep the loop alive: a failed check must not stop future checks.
                logger.exception("Error in lifecycle check loop: %s", e)

    def _write_exit_code(self, code: int) -> None:
        """
        Write exit code to shared file for sidecar to read.

        Best effort: any failure is logged as a warning and otherwise ignored.
        """
        try:
            EXIT_CODE_FILE.parent.mkdir(parents=True, exist_ok=True)
            EXIT_CODE_FILE.write_text(str(code))
            logger.debug("Wrote exit code %d to %s", code, EXIT_CODE_FILE)
        except Exception as e:
            logger.warning("Failed to write exit code file: %s", e)

    def setup_signal_handlers(self) -> None:
        """
        Setup signal handlers for graceful shutdown.

        On SIGTERM/SIGINT, exit normally (not with code 42) to allow restart.
        """

        def signal_handler(signum, frame):
            logger.info("Received signal %d, exiting normally", signum)
            sys.exit(0)

        signal.signal(signal.SIGTERM, signal_handler)
        signal.signal(signal.SIGINT, signal_handler)
# Process-wide instance, created lazily by get_lifecycle_manager()
_lifecycle_manager: Optional[LifecycleManager] = None


def get_lifecycle_manager(
    idle_timeout_minutes: Optional[int] = None,
    enable_shutdown: Optional[bool] = None,
) -> LifecycleManager:
    """
    Return the global lifecycle manager, creating it on first use.

    Configuration is resolved only when the instance is created; once it
    exists, the arguments are ignored and the existing instance is returned.

    Args:
        idle_timeout_minutes: Idle timeout in minutes. When None, read from
            IDLE_TIMEOUT_MINUTES (default "15").
        enable_shutdown: Whether idle shutdown is enabled. When None, read
            from ENABLE_IDLE_SHUTDOWN (default "true"); only the
            case-insensitive value "true" enables it.

    The check interval always comes from IDLE_CHECK_INTERVAL_SECONDS
    (default "60").
    """
    global _lifecycle_manager

    if _lifecycle_manager is not None:
        return _lifecycle_manager

    # Explicit arguments win; the environment is consulted only as a fallback.
    if idle_timeout_minutes is None:
        timeout_minutes = int(os.getenv("IDLE_TIMEOUT_MINUTES", "15"))
    else:
        timeout_minutes = idle_timeout_minutes

    interval_seconds = int(os.getenv("IDLE_CHECK_INTERVAL_SECONDS", "60"))

    if enable_shutdown is None:
        shutdown_enabled = os.getenv("ENABLE_IDLE_SHUTDOWN", "true").lower() == "true"
    else:
        shutdown_enabled = enable_shutdown

    _lifecycle_manager = LifecycleManager(
        idle_timeout_minutes=timeout_minutes,
        check_interval_seconds=interval_seconds,
        enable_shutdown=shutdown_enabled,
    )
    return _lifecycle_manager
async def start_lifecycle_manager(
    user_id: Optional[str] = None,
    idle_timeout_minutes: Optional[int] = None,
    enable_idle_shutdown: Optional[bool] = None,
) -> LifecycleManager:
    """
    Obtain the global lifecycle manager, install its signal handlers and start it.

    Args:
        user_id: Accepted but not used by this function.
        idle_timeout_minutes: Forwarded to get_lifecycle_manager().
        enable_idle_shutdown: Forwarded to get_lifecycle_manager() as
            ``enable_shutdown``.

    Returns:
        The started lifecycle manager.
    """
    lifecycle = get_lifecycle_manager(
        enable_shutdown=enable_idle_shutdown,
        idle_timeout_minutes=idle_timeout_minutes,
    )
    # Handlers go in before the background check loop is launched.
    lifecycle.setup_signal_handlers()
    await lifecycle.start()
    return lifecycle