241 lines
8.4 KiB
Python
241 lines
8.4 KiB
Python
"""
|
|
Container lifecycle manager for agent containers.
|
|
|
|
Tracks activity and triggers to determine when the container should shut down.
|
|
Exits with code 42 to signal clean idle shutdown to the lifecycle sidecar.
|
|
"""
|
|
|
|
import asyncio
|
|
import logging
|
|
import os
|
|
import signal
|
|
import sys
|
|
from datetime import datetime, timedelta
|
|
from pathlib import Path
|
|
from typing import Optional, Set
|
|
|
|
# Module-level logger; handlers and levels are configured by the application.
logger = logging.getLogger(__name__)

# Process exit code that signals a clean idle shutdown to the sidecar.
EXIT_CODE_IDLE_SHUTDOWN = 42

# File the exit code is written to so the sidecar can read it.
EXIT_CODE_FILE = Path("/var/run/agent/exit_code")
|
|
|
|
|
|
class LifecycleManager:
    """
    Manages container lifecycle based on activity and triggers.

    The container shuts itself down when:
    1. No active triggers (data subscriptions, CEP patterns, etc.)
    2. No recent user activity (MCP calls)
    3. Idle timeout has elapsed
    """

    def __init__(
        self,
        idle_timeout_minutes: int = 15,
        check_interval_seconds: int = 60,
        enable_shutdown: bool = True,
    ):
        """
        Initialize lifecycle manager.

        Args:
            idle_timeout_minutes: Minutes of inactivity before shutdown
            check_interval_seconds: Interval between idle checks
            enable_shutdown: If False, only log idle state without exiting (for testing)
        """
        self.idle_timeout = timedelta(minutes=idle_timeout_minutes)
        self.check_interval = check_interval_seconds
        self.enable_shutdown = enable_shutdown

        # Naive local wall-clock timestamp of the most recent activity.
        self.last_activity: datetime = datetime.now()
        self.active_triggers: Set[str] = set()
        self._running = False
        self._check_task: Optional[asyncio.Task] = None

        logger.info(
            "Lifecycle manager initialized: idle_timeout=%dm, check_interval=%ds, shutdown_enabled=%s",
            idle_timeout_minutes,
            check_interval_seconds,
            enable_shutdown,
        )

    def record_activity(self) -> None:
        """
        Record user activity (called on MCP tool/resource/prompt invocations).

        Resets the idle timer.
        """
        self.last_activity = datetime.now()
        logger.debug("Activity recorded, idle timer reset")

    def update_triggers(self, triggers: Set[str]) -> None:
        """
        Replace the set of active triggers.

        The manager keeps its own copy, so later add_trigger()/remove_trigger()
        calls never mutate the collection passed in by the caller.

        Args:
            triggers: Active trigger IDs (data subscriptions, CEP patterns, etc.).
                Any iterable of strings is accepted.
        """
        new_triggers = set(triggers)
        if new_triggers == self.active_triggers:
            return

        added = new_triggers - self.active_triggers
        removed = self.active_triggers - new_triggers

        if added:
            logger.info("Triggers added: %s", added)
        if removed:
            logger.info("Triggers removed: %s", removed)

        self.active_triggers = new_triggers
        logger.info("Active triggers: %d", len(self.active_triggers))

    def add_trigger(self, trigger_id: str) -> None:
        """Add a single trigger; a trigger that is already active is ignored."""
        if trigger_id not in self.active_triggers:
            self.active_triggers.add(trigger_id)
            logger.info("Trigger added: %s (total: %d)", trigger_id, len(self.active_triggers))

    def remove_trigger(self, trigger_id: str) -> None:
        """Remove a single trigger; an unknown trigger is ignored."""
        if trigger_id in self.active_triggers:
            self.active_triggers.remove(trigger_id)
            logger.info("Trigger removed: %s (total: %d)", trigger_id, len(self.active_triggers))

    def is_idle(self) -> bool:
        """
        Check if container is idle and should shut down.

        Returns:
            True if there are no active triggers and more than ``idle_timeout``
            has elapsed since the last recorded activity.
        """
        if self.active_triggers:
            logger.debug("Not idle: has %d active triggers", len(self.active_triggers))
            return False

        idle_time = datetime.now() - self.last_activity
        if idle_time <= self.idle_timeout:
            logger.debug(
                "Not idle: last activity %s ago (timeout: %s)",
                idle_time,
                self.idle_timeout,
            )
            return False

        logger.info(
            "Container is idle: no triggers and %s since last activity", idle_time
        )
        return True

    async def start(self) -> None:
        """Start the background idle-check task; a second call while running is a no-op."""
        if self._running:
            logger.warning("Lifecycle manager already running")
            return

        self._running = True
        self._check_task = asyncio.create_task(self._check_loop())
        logger.info("Lifecycle manager started")

    async def stop(self) -> None:
        """Stop the lifecycle manager and wait for the background task to finish."""
        self._running = False
        # Drop the reference first so a repeated stop() does not re-await a
        # task that has already been cancelled.
        task, self._check_task = self._check_task, None
        if task is not None:
            task.cancel()
            try:
                await task
            except asyncio.CancelledError:
                pass
        logger.info("Lifecycle manager stopped")

    async def _check_loop(self) -> None:
        """Background task that periodically checks if container should shut down."""
        while self._running:
            try:
                await asyncio.sleep(self.check_interval)

                if not self.is_idle():
                    continue

                if not self.enable_shutdown:
                    logger.info(
                        "Container is idle but shutdown is disabled (testing mode)"
                    )
                    continue

                logger.info("Initiating idle shutdown (exit code %d)", EXIT_CODE_IDLE_SHUTDOWN)
                self._write_exit_code(EXIT_CODE_IDLE_SHUTDOWN)

                # Give sidecar a moment to see the exit code file
                await asyncio.sleep(1)

                # os._exit() skips all interpreter cleanup, including flushing
                # buffered log handlers, so flush them explicitly first.
                try:
                    logging.shutdown()
                finally:
                    # Exit with special code
                    os._exit(EXIT_CODE_IDLE_SHUTDOWN)

            except asyncio.CancelledError:
                logger.info("Check loop cancelled")
                raise
            except Exception as e:
                # Keep the loop alive: one failed check must not disable idle shutdown.
                logger.error("Error in lifecycle check loop: %s", e, exc_info=True)

    def _write_exit_code(self, code: int) -> None:
        """
        Write exit code to shared file for sidecar to read.

        Best effort: filesystem errors are logged and otherwise ignored.
        """
        try:
            EXIT_CODE_FILE.parent.mkdir(parents=True, exist_ok=True)
            EXIT_CODE_FILE.write_text(str(code))
            logger.debug("Wrote exit code %d to %s", code, EXIT_CODE_FILE)
        except OSError as e:
            logger.warning("Failed to write exit code file: %s", e)

    def setup_signal_handlers(self) -> None:
        """
        Setup signal handlers for graceful shutdown.

        On SIGTERM/SIGINT, exit normally (not with code 42) to allow restart.

        Raises:
            ValueError: If called from a thread other than the main thread
                (a restriction of ``signal.signal``).
        """

        def signal_handler(signum, frame):
            logger.info("Received signal %d, exiting normally", signum)
            sys.exit(0)

        signal.signal(signal.SIGTERM, signal_handler)
        signal.signal(signal.SIGINT, signal_handler)
|
|
|
|
|
|
# Process-wide singleton, created lazily by get_lifecycle_manager().
_lifecycle_manager: Optional[LifecycleManager] = None
|
|
|
|
|
|
def get_lifecycle_manager(
    idle_timeout_minutes: Optional[int] = None,
    enable_shutdown: Optional[bool] = None,
) -> LifecycleManager:
    """
    Get or create the global lifecycle manager instance.

    Configuration is applied only when the instance is first created. Explicit
    arguments take precedence over the environment variables
    IDLE_TIMEOUT_MINUTES (default 15) and ENABLE_IDLE_SHUTDOWN (default
    "true"); IDLE_CHECK_INTERVAL_SECONDS (default 60) is read from the
    environment only.

    Args:
        idle_timeout_minutes: Minutes of inactivity before shutdown, or None
            to read IDLE_TIMEOUT_MINUTES.
        enable_shutdown: Whether idle shutdown actually exits the process, or
            None to read ENABLE_IDLE_SHUTDOWN (enabled only when the value is
            "true", case-insensitively).

    Returns:
        The shared LifecycleManager instance.

    Raises:
        ValueError: If an integer environment variable holds a non-integer value.
    """
    global _lifecycle_manager

    def _int_from_env(name: str, default: int) -> int:
        # Parse an integer setting, naming the variable when the value is malformed.
        raw = os.environ.get(name)
        if raw is None:
            return default
        try:
            return int(raw)
        except ValueError as err:
            raise ValueError(f"{name} must be an integer, got {raw!r}") from err

    if _lifecycle_manager is None:
        # Load configuration from environment or use provided values
        if idle_timeout_minutes is not None:
            idle_timeout = idle_timeout_minutes
        else:
            idle_timeout = _int_from_env("IDLE_TIMEOUT_MINUTES", 15)

        check_interval = _int_from_env("IDLE_CHECK_INTERVAL_SECONDS", 60)

        if enable_shutdown is not None:
            enable = enable_shutdown
        else:
            enable = os.environ.get("ENABLE_IDLE_SHUTDOWN", "true").lower() == "true"

        _lifecycle_manager = LifecycleManager(
            idle_timeout_minutes=idle_timeout,
            check_interval_seconds=check_interval,
            enable_shutdown=enable,
        )
    elif idle_timeout_minutes is not None or enable_shutdown is not None:
        # The singleton is configured once; later overrides have no effect.
        logger.debug(
            "Lifecycle manager already initialized; ignoring provided configuration"
        )
    return _lifecycle_manager
|
|
|
|
|
|
async def start_lifecycle_manager(
    user_id: Optional[str] = None,
    idle_timeout_minutes: Optional[int] = None,
    enable_idle_shutdown: Optional[bool] = None,
) -> LifecycleManager:
    """
    Initialize and start the lifecycle manager.

    Obtains the global instance, installs its SIGTERM/SIGINT handlers, and
    starts the background idle-check task.

    Args:
        user_id: Accepted for interface compatibility; not used by this function.
        idle_timeout_minutes: Forwarded to get_lifecycle_manager().
        enable_idle_shutdown: Forwarded to get_lifecycle_manager() as
            ``enable_shutdown``.

    Returns:
        The started global LifecycleManager instance.
    """
    manager = get_lifecycle_manager(
        idle_timeout_minutes=idle_timeout_minutes,
        enable_shutdown=enable_idle_shutdown,
    )
    manager.setup_signal_handlers()
    await manager.start()
    return manager
|