Files
ai/sandbox/dexorder/memory_guard.py

86 lines
2.9 KiB
Python

"""
Memory guard for sandbox containers.
Sets the soft RLIMIT_AS limit to the process's startup virtual size (VmSize)
plus a configurable fraction of the cgroup memory limit, so Python raises
MemoryError before the kernel's OOM killer fires. The MCP session survives;
only the tool call fails.
"""
import gc
import logging
import resource
from pathlib import Path
log = logging.getLogger(__name__)
def _read_cgroup_limit_bytes() -> int | None:
"""Read container memory.max from cgroup v2. Returns bytes or None."""
try:
val = Path("/sys/fs/cgroup/memory.max").read_text().strip()
if val == "max":
return None
return int(val)
except Exception:
return None
def setup_memory_limit(fraction: float) -> None:
    """
    Set the RLIMIT_AS soft limit to baseline VmSize + allowed growth.

    RLIMIT_AS caps total virtual address space, which includes shared libraries
    and memory-mapped files that don't consume physical RAM. The baseline VmSize
    at startup can be 3+ GB even when RSS is only ~200 MB. Setting the limit to
    a flat cgroup fraction would crash immediately.

    Instead: limit = current VmSize + (cgroup_limit * fraction)

    This allows `fraction` worth of new allocations (numpy arrays, pandas
    dataframes, etc.) above the startup baseline before raising MemoryError.

    Behaviour in edge cases:
      * No cgroup limit available: a warning is logged and no limit is set.
      * VmSize cannot be read: a warning is logged and the baseline is
        treated as 0.
      * Computed limit exceeds a finite RLIMIT_AS hard limit: the soft limit
        is clamped to the hard limit, since the kernel rejects soft > hard.

    Args:
        fraction: Proportion of cgroup memory.max to allow as new growth, e.g. 0.85.
    """
    cgroup_bytes = _read_cgroup_limit_bytes()

    # Read baseline VmSize (total virtual address space at startup)
    vmsize_bytes: int | None = None
    try:
        for line in Path("/proc/self/status").read_text().splitlines():
            if line.startswith("VmSize:"):
                vmsize_bytes = int(line.split()[1]) * 1024  # kB → bytes
                log.info("Memory baseline: %s", line.strip())
            elif line.startswith("VmRSS:"):
                log.info("Memory baseline: %s", line.strip())
    except (OSError, ValueError, IndexError) as exc:
        # Best-effort read: keep going, but do not hide the failure.
        log.warning("Could not read /proc/self/status: %s", exc)

    if cgroup_bytes is None:
        log.warning("cgroup memory.max is unlimited; RLIMIT_AS not set")
        return

    if vmsize_bytes is None:
        log.warning(
            "Baseline VmSize unknown; RLIMIT_AS will be based on the cgroup fraction alone"
        )

    allowed_growth_bytes = int(cgroup_bytes * fraction)
    baseline = vmsize_bytes if vmsize_bytes is not None else 0
    limit_bytes = baseline + allowed_growth_bytes

    _, hard = resource.getrlimit(resource.RLIMIT_AS)
    # setrlimit() raises ValueError when the soft limit exceeds a finite hard
    # limit, so never request more than the hard limit permits.
    if hard != resource.RLIM_INFINITY and limit_bytes > hard:
        log.warning(
            "Requested RLIMIT_AS of %d MB exceeds hard limit of %d MB; clamping",
            limit_bytes // (1024 * 1024),
            hard // (1024 * 1024),
        )
        limit_bytes = hard

    resource.setrlimit(resource.RLIMIT_AS, (limit_bytes, hard))
    log.info(
        "RLIMIT_AS soft limit set to %d MB (baseline %d MB + allowed growth %d MB, %.0f%% of cgroup %d MB)",
        limit_bytes // (1024 * 1024),
        baseline // (1024 * 1024),
        allowed_growth_bytes // (1024 * 1024),
        fraction * 100,
        cgroup_bytes // (1024 * 1024),
    )
def cleanup_memory() -> None:
    """
    Recovery hook to run once a tool execution thread has caught MemoryError.

    Logs a warning, then forces a garbage-collection pass so that objects
    left behind by the failed script can be reclaimed. Further recovery
    strategies (cache eviction, etc.) can be added here later.
    """
    notice = "MemoryError in tool thread — running gc.collect()"
    log.warning(notice)
    gc.collect()