86 lines
2.9 KiB
Python
86 lines
2.9 KiB
Python
"""
|
|
Memory guard for sandbox containers.
|
|
|
|
Sets a soft RLIMIT_AS limit derived from the cgroup memory limit at a
|
|
configurable fraction, so Python raises MemoryError before the kernel's
|
|
OOM killer fires. The MCP session survives; only the tool call fails.
|
|
"""
|
|
|
|
import gc
|
|
import logging
|
|
import resource
|
|
from pathlib import Path
|
|
|
|
# Module-level logger named after this module (__name__); the functions in
# this file emit their info/warning messages through it.
log = logging.getLogger(__name__)
|
|
|
|
|
|
def _read_cgroup_limit_bytes() -> int | None:
|
|
"""Read container memory.max from cgroup v2. Returns bytes or None."""
|
|
try:
|
|
val = Path("/sys/fs/cgroup/memory.max").read_text().strip()
|
|
if val == "max":
|
|
return None
|
|
return int(val)
|
|
except Exception:
|
|
return None
|
|
|
|
|
|
def setup_memory_limit(fraction: float) -> None:
    """
    Set RLIMIT_AS soft limit to baseline VmSize + allowed growth.

    RLIMIT_AS caps total virtual address space, which includes shared libraries
    and memory-mapped files that don't consume physical RAM. The baseline VmSize
    at startup can be 3+ GB even when RSS is only ~200 MB. Setting the limit to
    a flat cgroup fraction would crash immediately.

    Instead: limit = current VmSize + (cgroup_limit * fraction)
    This allows `fraction` worth of new allocations (numpy arrays, pandas
    dataframes, etc.) above the startup baseline before raising MemoryError.

    The limit is left untouched, with a warning logged, when:
      * `_read_cgroup_limit_bytes()` returns None (no cgroup limit available),
      * `fraction` is not a positive finite number, or
      * the baseline VmSize cannot be read from /proc/self/status (without a
        baseline the computed value would be the flat cgroup fraction that the
        paragraph above warns against).

    A computed limit above the existing hard RLIMIT_AS is clamped to the hard
    limit, because setrlimit rejects a soft limit greater than the hard one.

    Args:
        fraction: Proportion of cgroup memory.max to allow as new growth, e.g. 0.85.
    """
    cgroup_bytes = _read_cgroup_limit_bytes()

    # Read baseline VmSize (total virtual address space at startup)
    vmsize_bytes: int | None = None
    try:
        status_text = Path("/proc/self/status").read_text()
    except (OSError, ValueError) as exc:
        # ValueError covers a decoding failure; OSError covers a missing or
        # unreadable procfs entry.
        log.warning("Could not read /proc/self/status: %s", exc)
    else:
        for line in status_text.splitlines():
            if line.startswith("VmSize:"):
                try:
                    vmsize_bytes = int(line.split()[1]) * 1024  # kB → bytes
                except (IndexError, ValueError):
                    log.warning("Could not parse VmSize from %r", line)
                log.info("Memory baseline: %s", line.strip())
            elif line.startswith("VmRSS:"):
                log.info("Memory baseline: %s", line.strip())

    if cgroup_bytes is None:
        log.warning("cgroup memory.max is unlimited; RLIMIT_AS not set")
        return

    # Rejects zero, negatives, NaN (every comparison is False) and infinity,
    # all of which would yield an unusable limit or make int() raise.
    if not 0 < fraction < float("inf"):
        log.warning("Invalid memory fraction %r; RLIMIT_AS not set", fraction)
        return

    if vmsize_bytes is None:
        # Fail open: a limit computed without the baseline may sit below the
        # address space already mapped, making every later allocation fail.
        log.warning("Baseline VmSize unavailable; RLIMIT_AS not set")
        return

    allowed_growth_bytes = int(cgroup_bytes * fraction)
    baseline = vmsize_bytes
    limit_bytes = baseline + allowed_growth_bytes

    _, hard = resource.getrlimit(resource.RLIMIT_AS)
    # RLIM_INFINITY is a sentinel, not an ordinary number, so it is checked
    # explicitly before the numeric comparison.
    if hard != resource.RLIM_INFINITY and limit_bytes > hard:
        log.warning(
            "Computed RLIMIT_AS of %d bytes exceeds hard limit of %d bytes; clamping",
            limit_bytes,
            hard,
        )
        limit_bytes = hard

    resource.setrlimit(resource.RLIMIT_AS, (limit_bytes, hard))
    log.info(
        "RLIMIT_AS soft limit set to %d MB (baseline %d MB + allowed growth %d MB, %.0f%% of cgroup %d MB)",
        limit_bytes // (1024 * 1024),
        baseline // (1024 * 1024),
        allowed_growth_bytes // (1024 * 1024),
        fraction * 100,
        cgroup_bytes // (1024 * 1024),
    )
|
|
|
|
|
|
def cleanup_memory() -> None:
    """
    Recovery hook to run once a tool execution thread has caught MemoryError.

    Logs a warning and then forces a full garbage-collection pass so that
    objects left behind by the failed script can be reclaimed. Additional
    recovery strategies (cache eviction, etc.) can be added here later.
    """
    notice = "MemoryError in tool thread — running gc.collect()"
    log.warning(notice)
    gc.collect()
|