bugfixes; research subproc; higher sandbox limits

2026-04-16 18:11:26 -04:00
parent f80c943dc3
commit 3153e89d4f
54 changed files with 1947 additions and 498 deletions

View File

@@ -510,3 +510,44 @@ def sync_packages(data_dir: Path, environment_yml: Optional[Path] = None) -> dic
log.info(f"Conda package sync complete: {len(result['removed'])} packages removed")
return result
# =============================================================================
# Async wrappers — non-blocking equivalents for use from asyncio contexts
# =============================================================================
import asyncio as _asyncio
async def get_installed_packages_async() -> Set[str]:
"""Non-blocking wrapper around get_installed_packages()."""
return await _asyncio.to_thread(get_installed_packages)
async def install_packages_async(
packages: list[str],
data_dir: Optional[Path] = None,
) -> dict:
"""Non-blocking wrapper around install_packages()."""
return await _asyncio.to_thread(install_packages, packages, data_dir)
async def remove_packages_async(packages: list[str]) -> dict:
"""Non-blocking wrapper around remove_packages()."""
return await _asyncio.to_thread(remove_packages, packages)
async def cleanup_extra_packages_async(
data_dir: Path,
environment_yml: Optional[Path] = None,
) -> dict:
"""Non-blocking wrapper around cleanup_extra_packages()."""
return await _asyncio.to_thread(cleanup_extra_packages, data_dir, environment_yml)
async def sync_packages_async(
data_dir: Path,
environment_yml: Optional[Path] = None,
) -> dict:
"""Non-blocking wrapper around sync_packages()."""
return await _asyncio.to_thread(sync_packages, data_dir, environment_yml)
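
A usage sketch, assuming an async MCP tool handler (the handler name and data_dir value are illustrative, not part of this commit):

# Hypothetical caller of the wrappers above.
from pathlib import Path
from dexorder.conda_manager import install_packages_async, sync_packages_async

async def handle_install_tool(packages: list[str]) -> dict:
    data_dir = Path("/app/data")  # assumed sandbox data root
    # Runs conda in a worker thread; the event loop stays responsive.
    result = await install_packages_async(packages, data_dir)
    if result.get("success"):
        # Optional re-sync against environment.yml (assumption).
        await sync_packages_async(data_dir)
    return result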

View File

@@ -0,0 +1,54 @@
"""
Thread-safe asyncio.run() for the sandbox.
Installs a global replacement for asyncio.run() that, when called from a
non-async thread while uvicorn's event loop is running, dispatches the
coroutine to that loop via run_coroutine_threadsafe(). The calling thread
blocks on future.result() — releasing the GIL — so uvicorn's loop runs
freely (health checks, MCP requests, etc.).
Usage:
from dexorder.event_loop import install_thread_safe_asyncio_run
install_thread_safe_asyncio_run(asyncio.get_running_loop()) # call once at startup
"""
import asyncio
import logging
log = logging.getLogger(__name__)
_main_loop: asyncio.AbstractEventLoop | None = None
_original_asyncio_run = asyncio.run
def install_thread_safe_asyncio_run(loop: asyncio.AbstractEventLoop) -> None:
"""
Patch asyncio.run globally to cooperate with uvicorn's event loop.
Call once from the lifespan startup (main thread, loop already running).
"""
global _main_loop
_main_loop = loop
def _thread_safe_run(coro, *, debug=None):
# Detect if we're in a thread (no running loop in this thread)
try:
asyncio.get_running_loop()
# We're already inside an async context — asyncio.run() is not
# valid here regardless; let it raise the normal error.
raise RuntimeError(
"asyncio.run() cannot be called when another event loop is running "
"in the same thread."
)
except RuntimeError as exc:
if "cannot be called" in str(exc):
raise
# No running loop in this thread — safe to dispatch to main loop.
if _main_loop is not None and _main_loop.is_running():
log.debug("asyncio.run() from thread → run_coroutine_threadsafe")
return asyncio.run_coroutine_threadsafe(coro, _main_loop).result()
# Fallback: main loop not available (e.g., called before startup or in tests)
return _original_asyncio_run(coro, debug=debug)
asyncio.run = _thread_safe_run
log.info("Installed thread-safe asyncio.run()")

View File

@@ -5,6 +5,8 @@ Tickers use Nautilus format: "BTC/USDT.BINANCE"
All timestamps are nanoseconds since epoch.
"""
import tracemalloc
from pathlib import Path
from typing import Optional, List, Tuple
import pandas as pd
import logging
@@ -19,6 +21,19 @@ from pyiceberg.expressions import (
log = logging.getLogger(__name__)
def _rss_mb() -> str:
"""Return current VmRSS and VmPeak from /proc/self/status as a short string."""
try:
info = {}
for line in Path("/proc/self/status").read_text().splitlines():
for key in ("VmRSS", "VmPeak", "VmSize"):
if line.startswith(f"{key}:"):
info[key] = int(line.split()[1]) // 1024 # kB → MB
return f"RSS={info.get('VmRSS','?')}MB peak={info.get('VmPeak','?')}MB virt={info.get('VmSize','?')}MB"
except Exception:
return "?"
class IcebergClient:
"""
Client for querying OHLC data from Iceberg warehouse (Iceberg 1.10.1).
@@ -114,8 +129,21 @@ class IcebergClient:
if fetch_columns is not None:
scan = scan.select(*fetch_columns)
if not tracemalloc.is_tracing():
tracemalloc.start()
tm_before = tracemalloc.take_snapshot()
log.info("MEM before scan.to_pandas(): %s", _rss_mb())
df = scan.to_pandas()
log.info("MEM after scan.to_pandas(): %s | rows=%d cols=%s mem=%dMB",
_rss_mb(), len(df), list(df.columns),
df.memory_usage(deep=True).sum() // (1024 * 1024))
tm_after = tracemalloc.take_snapshot()
top = tm_after.compare_to(tm_before, "lineno")
for stat in top[:5]:
log.info("TRACEMALLOC: %s", stat)
if not df.empty:
# Deduplicate: keep the most-recently-ingested row per timestamp.
if "ingested_at" in df.columns:
@@ -123,6 +151,7 @@ class IcebergClient:
df.sort_values("ingested_at", ascending=False)
.drop_duplicates(subset=["timestamp"])
)
log.info("MEM after dedup: %s | rows=%d", _rss_mb(), len(df))
# Drop ingested_at if the caller did not ask for it
if columns is not None and "ingested_at" not in columns and "ingested_at" in df.columns:
df = df.drop(columns=["ingested_at"])

View File

@@ -0,0 +1,85 @@
"""
Memory guard for sandbox containers.
Sets a soft RLIMIT_AS limit derived from the cgroup memory limit at a
configurable fraction, so Python raises MemoryError before the kernel's
OOM killer fires. The MCP session survives; only the tool call fails.
"""
import gc
import logging
import resource
from pathlib import Path
log = logging.getLogger(__name__)
def _read_cgroup_limit_bytes() -> int | None:
"""Read container memory.max from cgroup v2. Returns bytes or None."""
try:
val = Path("/sys/fs/cgroup/memory.max").read_text().strip()
if val == "max":
return None
return int(val)
except Exception:
return None
def setup_memory_limit(fraction: float) -> None:
"""
Set RLIMIT_AS soft limit to baseline VmSize + allowed growth.
RLIMIT_AS caps total virtual address space, which includes shared libraries
and memory-mapped files that don't consume physical RAM. The baseline VmSize
at startup can be 3+ GB even when RSS is only ~200 MB. Setting the limit to
a flat fraction of the cgroup limit would therefore fail allocations immediately.
Instead: limit = current VmSize + (cgroup_limit * fraction)
This allows `fraction` worth of new allocations (numpy arrays, pandas
dataframes, etc.) above the startup baseline before raising MemoryError.
Args:
fraction: Proportion of cgroup memory.max to allow as new growth, e.g. 0.85.
"""
cgroup_bytes = _read_cgroup_limit_bytes()
# Read baseline VmSize (total virtual address space at startup)
vmsize_bytes: int | None = None
try:
for line in Path("/proc/self/status").read_text().splitlines():
if line.startswith("VmSize:"):
vmsize_bytes = int(line.split()[1]) * 1024 # kB → bytes
log.info("Memory baseline: %s", line.strip())
elif line.startswith("VmRSS:"):
log.info("Memory baseline: %s", line.strip())
except Exception:
pass
if cgroup_bytes is None:
log.warning("cgroup memory.max is unlimited; RLIMIT_AS not set")
return
allowed_growth_bytes = int(cgroup_bytes * fraction)
baseline = vmsize_bytes or 0
limit_bytes = baseline + allowed_growth_bytes
_, hard = resource.getrlimit(resource.RLIMIT_AS)
resource.setrlimit(resource.RLIMIT_AS, (limit_bytes, hard))
log.info(
"RLIMIT_AS soft limit set to %d MB (baseline %d MB + allowed growth %d MB, %.0f%% of cgroup %d MB)",
limit_bytes // (1024 * 1024),
baseline // (1024 * 1024),
allowed_growth_bytes // (1024 * 1024),
fraction * 100,
cgroup_bytes // (1024 * 1024),
)
def cleanup_memory() -> None:
"""
Called after a MemoryError is caught in a tool execution thread.
Runs gc.collect() to free objects held by the failed script.
Hook here for future recovery strategies (cache eviction, etc.).
"""
log.warning("MemoryError in tool thread — running gc.collect()")
gc.collect()
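
Worked example of the limit arithmetic, with illustrative numbers: an 8192 MB cgroup limit at fraction 0.85 allows about 6963 MB of growth; with a 3072 MB VmSize baseline the soft limit lands at roughly 10035 MB of address space. A minimal sketch of the startup call and the failure path (the fraction and allocation size are assumptions):

from dexorder.memory_guard import setup_memory_limit, cleanup_memory

setup_memory_limit(0.85)  # assumed fraction; called once at sandbox startup

try:
    buf = bytearray(64 * 1024**3)  # deliberately oversized allocation
except MemoryError:
    cleanup_memory()  # gc.collect(); the process itself survives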

View File

@@ -0,0 +1,186 @@
#!/usr/bin/env python3
"""
backtest_harness — runs a strategy backtest as a subprocess.
Reads a JSON config from stdin:
{
"strategy_name": str,
"feeds": [{"symbol": str, "period_seconds": int}, ...],
"from_time": ...,
"to_time": ...,
"initial_capital": float,
"paper": bool
}
Outputs JSON to stdout on success:
{
"strategy_name": str,
"feeds": [...],
"initial_capital": float,
"paper": bool,
"total_candles": int,
... (metrics from run_backtest)
}
On error:
{"error": str}
"""
import asyncio
import json
import os
import sys
import traceback
from pathlib import Path
# Ensure dexorder package is importable when run as a subprocess
sys.path.insert(0, str(Path(__file__).resolve().parent.parent.parent))
_OHLC_EXTRA_COLUMNS = [
"volume", "buy_vol", "sell_vol",
"open_time", "high_time", "low_time", "close_time",
"open_interest",
]
async def _run(cfg: dict) -> dict:
strategy_name = cfg["strategy_name"]
feeds = cfg["feeds"]
from_time = cfg.get("from_time")
to_time = cfg.get("to_time")
initial_capital = float(cfg.get("initial_capital", 10_000.0))
paper = bool(cfg.get("paper", True))
# -------------------------------------------------------------------------
# Initialize API
# -------------------------------------------------------------------------
try:
import yaml
config_path = os.environ.get("CONFIG_PATH", "/app/config/config.yaml")
secrets_path = os.environ.get("SECRETS_PATH", "/app/config/secrets.yaml")
config_data = {}
secrets_data = {}
if Path(config_path).exists():
with open(config_path) as f:
config_data = yaml.safe_load(f) or {}
if Path(secrets_path).exists():
with open(secrets_path) as f:
secrets_data = yaml.safe_load(f) or {}
data_cfg = config_data.get("data", {})
iceberg_cfg = data_cfg.get("iceberg", {})
relay_cfg = data_cfg.get("relay", {})
from dexorder.api import set_api, API
from dexorder.impl.charting_api_impl import ChartingAPIImpl
from dexorder.impl.data_api_impl import DataAPIImpl
data_api = DataAPIImpl(
iceberg_catalog_uri=iceberg_cfg.get("catalog_uri", "http://iceberg-catalog:8181"),
relay_endpoint=relay_cfg.get("endpoint", "tcp://relay:5559"),
notification_endpoint=relay_cfg.get("notification_endpoint", "tcp://relay:5558"),
namespace=iceberg_cfg.get("namespace", "trading"),
s3_endpoint=iceberg_cfg.get("s3_endpoint") or secrets_data.get("s3_endpoint"),
s3_access_key=iceberg_cfg.get("s3_access_key") or secrets_data.get("s3_access_key"),
s3_secret_key=iceberg_cfg.get("s3_secret_key") or secrets_data.get("s3_secret_key"),
s3_region=iceberg_cfg.get("s3_region") or secrets_data.get("s3_region"),
request_timeout=240.0,
)
set_api(API(charting=ChartingAPIImpl(), data=data_api))
except Exception as e:
return {"error": f"API initialization failed: {e}"}
# -------------------------------------------------------------------------
# Locate strategy
# -------------------------------------------------------------------------
data_dir = Path(os.environ.get("DATA_DIR", "/app/data"))
try:
from dexorder.tools.python_tools import get_category_manager, sanitize_name
category_manager = get_category_manager(data_dir)
safe_name = sanitize_name(strategy_name)
impl_path = category_manager.src_dir / "strategy" / safe_name / "implementation.py"
if not impl_path.exists():
return {"error": f"Strategy '{strategy_name}' not found (looked at {impl_path})"}
except Exception as exc:
return {"error": f"Failed to locate strategy: {exc}"}
# -------------------------------------------------------------------------
# Register custom indicators and load strategy class
# -------------------------------------------------------------------------
try:
from dexorder.nautilus.backtest_runner import _setup_custom_indicators
_setup_custom_indicators(category_manager.src_dir)
except Exception as exc:
sys.stderr.write(f"WARNING: custom indicator setup failed: {exc}\n")
try:
from dexorder.nautilus.backtest_runner import _load_strategy_class
strategy_class = _load_strategy_class(impl_path)
except Exception:
return {"error": f"Strategy load failed:\n{traceback.format_exc()}"}
# -------------------------------------------------------------------------
# Fetch OHLC data
# -------------------------------------------------------------------------
from dexorder.api import get_api
from dexorder.nautilus.pandas_strategy import make_feed_key
api = get_api()
parsed_feeds = [(f["symbol"], int(f.get("period_seconds", 3600))) for f in feeds]  # 3600s default, matching the MCP side
ohlc_dfs = {}
total_candles = 0
for ticker, period_seconds in parsed_feeds:
feed_key = make_feed_key(ticker, period_seconds)
try:
df = await api.data.historical_ohlc(
ticker=ticker,
period_seconds=period_seconds,
start_time=from_time,
end_time=to_time,
extra_columns=_OHLC_EXTRA_COLUMNS,
)
except Exception as exc:
return {"error": f"OHLC fetch failed for {feed_key}: {exc}"}
if df.empty:
return {"error": f"No OHLC data for {feed_key} in the requested range"}
ohlc_dfs[feed_key] = df
total_candles += len(df)
# -------------------------------------------------------------------------
# Run backtest (synchronous)
# -------------------------------------------------------------------------
try:
from dexorder.nautilus.backtest_runner import run_backtest
metrics = run_backtest(
strategy_class=strategy_class,
feeds=parsed_feeds,
ohlc_dfs=ohlc_dfs,
initial_capital=initial_capital,
paper=paper,
)
except Exception:
return {"error": f"Backtest failed:\n{traceback.format_exc()}"}
return {
"strategy_name": strategy_name,
"feeds": [{"symbol": t, "period_seconds": p} for t, p in parsed_feeds],
"initial_capital": initial_capital,
"paper": paper,
"total_candles": total_candles,
**metrics,
}
def main():
cfg = json.loads(sys.stdin.read())
result = asyncio.run(_run(cfg))
print(json.dumps(result))
if __name__ == "__main__":
main()
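
A sketch of driving the harness by hand (config values and the relative path are illustrative):

import json
import subprocess
import sys

cfg = {
    "strategy_name": "my_strategy",  # hypothetical strategy name
    "feeds": [{"symbol": "BTC/USDT.BINANCE", "period_seconds": 3600}],
    "from_time": None,   # passed through to historical_ohlc (assumed open-ended)
    "to_time": None,
    "initial_capital": 10_000.0,
    "paper": True,
}
proc = subprocess.run(
    [sys.executable, "backtest_harness.py"],
    input=json.dumps(cfg).encode(),
    capture_output=True,
    timeout=600,
)
print(json.loads(proc.stdout))  # metrics dict, or {"error": ...}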

View File

@@ -1,25 +1,21 @@
"""
backtest_strategy — run a PandasStrategy against historical OHLC data.
Called directly from the MCP server's async handle_tool_call.
Returns a JSON payload with backtest metrics and equity curve, following the
same pattern as evaluate_indicator.py.
Spawns backtest_harness.py as a subprocess so user strategy code is isolated
from the MCP server process. The harness handles API init, data fetch, and
the synchronous BacktestEngine internally.
"""
import asyncio
import json
import logging
import sys
from pathlib import Path
from typing import Any
log = logging.getLogger(__name__)
# All OHLC+ columns to request from the DataAPI
_OHLC_EXTRA_COLUMNS = [
"volume", "buy_vol", "sell_vol",
"open_time", "high_time", "low_time", "close_time",
"open_interest",
]
_BACKTEST_HARNESS = Path(__file__).parent / "backtest_harness.py"
async def backtest_strategy(
@@ -42,23 +38,8 @@ async def backtest_strategy(
paper: Always True for historical backtest (flag reserved for forward testing)
Returns:
list[TextContent] with JSON payload:
{
"strategy_name": str,
"feeds": [...],
"initial_capital": float,
"paper": bool,
"total_candles": int,
"total_return": float, # fractional (0.15 = +15%)
"sharpe_ratio": float,
"max_drawdown": float, # fractional (0.10 = 10% drawdown)
"win_rate": float,
"trade_count": int,
"equity_curve": [{"timestamp": int, "equity": float}, ...]
}
On error:
{"error": str}
list[TextContent] with JSON payload containing backtest metrics.
On error: [TextContent] with {"error": str}
"""
from mcp.types import TextContent
@@ -66,102 +47,52 @@ async def backtest_strategy(
log.error("backtest_strategy '%s': %s", strategy_name, msg)
return [TextContent(type="text", text=json.dumps({"error": msg}))]
# --- 1. Validate feeds input ---
if not feeds:
return _err("feeds list is empty — provide at least one {symbol, period_seconds} entry")
parsed_feeds: list[tuple[str, int]] = []
for f in feeds:
sym = f.get("symbol", "")
ps = f.get("period_seconds", 3600)
if not sym:
if not f.get("symbol"):
return _err(f"Feed entry missing 'symbol': {f}")
parsed_feeds.append((sym, int(ps)))
# --- 2. Resolve strategy implementation file ---
try:
from dexorder.tools.python_tools import get_category_manager, sanitize_name
category_manager = get_category_manager()
safe_name = sanitize_name(strategy_name)
impl_path = category_manager.src_dir / "strategy" / safe_name / "implementation.py"
if not impl_path.exists():
return _err(f"Strategy '{strategy_name}' not found (looked at {impl_path})")
except Exception as exc:
return _err(f"Failed to locate strategy: {exc}")
# --- 3. Register custom indicators with pandas-ta ---
try:
from dexorder.nautilus.backtest_runner import _setup_custom_indicators
_setup_custom_indicators(category_manager.src_dir)
except Exception as exc:
log.warning("backtest_strategy: custom indicator setup failed: %s", exc)
# --- 4. Load strategy class ---
try:
from dexorder.nautilus.backtest_runner import _load_strategy_class
strategy_class = _load_strategy_class(impl_path)
except Exception as exc:
log.exception("backtest_strategy: strategy load failed")
return _err(f"Strategy load failed: {exc}")
# --- 5. Fetch OHLC+ data for each feed ---
try:
from dexorder.api import get_api
api = get_api()
except Exception as exc:
return _err(f"API not available: {exc}")
ohlc_dfs: dict[str, Any] = {}
total_candles = 0
for ticker, period_seconds in parsed_feeds:
from dexorder.nautilus.pandas_strategy import make_feed_key
feed_key = make_feed_key(ticker, period_seconds)
try:
df = await api.data.historical_ohlc(
ticker=ticker,
period_seconds=period_seconds,
start_time=from_time,
end_time=to_time,
extra_columns=_OHLC_EXTRA_COLUMNS,
)
except Exception as exc:
log.exception("backtest_strategy: OHLC fetch failed for %s", feed_key)
return _err(f"OHLC fetch failed for {feed_key}: {exc}")
if df.empty:
return _err(f"No OHLC data for {feed_key} in the requested range")
ohlc_dfs[feed_key] = df
total_candles += len(df)
# --- 6. Run backtest in thread executor (BacktestEngine is synchronous) ---
try:
import asyncio
from dexorder.nautilus.backtest_runner import run_backtest
loop = asyncio.get_event_loop()
metrics = await loop.run_in_executor(
None,
lambda: run_backtest(
strategy_class=strategy_class,
feeds=parsed_feeds,
ohlc_dfs=ohlc_dfs,
initial_capital=initial_capital,
paper=paper,
),
)
except Exception as exc:
log.exception("backtest_strategy: backtest run failed")
return _err(f"Backtest failed: {exc}")
# --- 7. Return results ---
payload = {
"strategy_name": strategy_name,
"feeds": [{"symbol": t, "period_seconds": p} for t, p in parsed_feeds],
cfg = {
"strategy_name": strategy_name,
"feeds": feeds,
"from_time": from_time,
"to_time": to_time,
"initial_capital": initial_capital,
"paper": paper,
"total_candles": total_candles,
**metrics, # keys: summary, statistics, trades, equity_curve
"paper": paper,
}
try:
proc = await asyncio.create_subprocess_exec(
sys.executable, str(_BACKTEST_HARNESS),
stdin=asyncio.subprocess.PIPE,
stdout=asyncio.subprocess.PIPE,
stderr=asyncio.subprocess.PIPE,
)
stdout, stderr = await asyncio.wait_for(
proc.communicate(json.dumps(cfg).encode()),
timeout=600,
)
except asyncio.TimeoutError:
return _err("Backtest timed out (10 minutes)")
except Exception as exc:
return _err(f"Failed to launch backtest harness: {exc}")
if proc.returncode != 0:
err_text = stderr.decode(errors="replace")
log.error("backtest_strategy '%s': harness exited %d: %s", strategy_name, proc.returncode, err_text[:500])
return _err(f"Backtest harness failed:\n{err_text}")
if stderr:
log.warning("backtest_strategy '%s' stderr: %s", strategy_name, stderr.decode(errors="replace")[:500])
try:
payload = json.loads(stdout.decode())
except json.JSONDecodeError:
return _err(f"Harness produced invalid JSON: {stdout.decode(errors='replace')[:200]}")
if "error" in payload:
return _err(payload["error"])
return [TextContent(type="text", text=json.dumps(payload))]

View File

@@ -18,51 +18,32 @@ After write/edit operations, a category-specific test harness runs to validate
the code and capture errors/output for agent feedback.
"""
import concurrent.futures
import json
import logging
import re
import subprocess
import sys
import traceback
from dataclasses import dataclass, asdict
from enum import Enum
from pathlib import Path
from typing import Any, Optional
from dexorder.tools.subprocess_runner import run_subprocess_argv, run_in_thread
log = logging.getLogger(__name__)
def _run_inprocess(fn, *args, timeout: int) -> dict:
"""
Run fn(*args) in a one-shot thread and return its result dict.
Uses a thread so the calling coroutine is not blocked and the calling
process does not fork a new Python interpreter. All already-loaded
libraries (numpy, pandas, matplotlib, etc.) are shared with the thread.
On timeout returns a dict with _timeout=True. On unexpected exception
returns a dict with error=True and the traceback in stderr.
"""
with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
future = executor.submit(fn, *args)
try:
return future.result(timeout=timeout)
except concurrent.futures.TimeoutError:
return {"_timeout": True, "error": True,
"stdout": "", "stderr": "", "images": []}
except Exception:
return {"error": True, "stdout": "",
"stderr": traceback.format_exc(), "images": []}
# Paths to harness scripts run as subprocesses
_RESEARCH_HARNESS = Path(__file__).parent / "research_harness.py"
_STRATEGY_HARNESS = Path(__file__).parent / "strategy_harness.py"
# Import conda manager for package installation and tracking
try:
from dexorder.conda_manager import install_packages, cleanup_extra_packages
from dexorder.conda_manager import install_packages_async, cleanup_extra_packages_async
except ImportError:
log.warning("conda_manager not available - package installation disabled")
install_packages = None
cleanup_extra_packages = None
install_packages_async = None
cleanup_extra_packages_async = None
# =============================================================================
@@ -355,6 +336,39 @@ class GitManager:
except Exception:
pass
# ------------------------------------------------------------------
# Async variants — delegate to the sync methods via asyncio.to_thread
# so the event loop stays responsive during git operations.
# ------------------------------------------------------------------
async def commit_async(self, message: str) -> Optional[str]:
import asyncio
return await asyncio.to_thread(self.commit, message)
async def log_async(self, path: Optional[Path] = None, n: int = 20) -> list[dict]:
import asyncio
return await asyncio.to_thread(self.log, path, n)
async def restore_async(self, revision: str, path: Optional[Path] = None) -> Optional[str]:
import asyncio
return await asyncio.to_thread(self.restore, revision, path)
async def head_short_hash_async(self) -> str:
import asyncio
return await asyncio.to_thread(self.head_short_hash)
async def create_worktree_async(self, worktree_path: Path, revision: str = "HEAD") -> str:
import asyncio
return await asyncio.to_thread(self.create_worktree, worktree_path, revision)
async def remove_worktree_async(self, worktree_path: Path) -> None:
import asyncio
return await asyncio.to_thread(self.remove_worktree, worktree_path)
async def prune_worktrees_async(self) -> None:
import asyncio
return await asyncio.to_thread(self.prune_worktrees)
# =============================================================================
# Custom Indicator Setup
@@ -484,7 +498,7 @@ class CategoryFileManager:
"""Root of the versioned category code (git repo root)."""
return self.data_dir / "src"
def write(
async def write(
self,
category: str,
name: str,
@@ -547,7 +561,7 @@ class CategoryFileManager:
return {"success": False, "error": f"Failed to write metadata: {e}"}
# Run validation harness
validation = self._validate(cat, item_dir)
validation = await self._validate(cat, item_dir)
result = {
"success": validation["success"],
@@ -559,19 +573,19 @@ class CategoryFileManager:
if validation["success"]:
if cat == Category.RESEARCH:
log.info(f"Auto-executing research script: {name}")
result["execution"] = self.execute_research(name)
result["execution"] = await self.execute_research(name)
elif cat == Category.INDICATOR:
log.info(f"Auto-executing indicator test: {name}")
result["execution"] = self._execute_indicator(item_dir)
result["execution"] = await self._execute_indicator(item_dir)
# Commit to git
commit_hash = self.git.commit(f"create({category}): {name}")
commit_hash = await self.git.commit_async(f"create({category}): {name}")
if commit_hash:
result["revision"] = commit_hash
return result
def edit(
async def edit(
self,
category: str,
name: str,
@@ -671,7 +685,7 @@ class CategoryFileManager:
# Run validation harness if code was updated
validation = None
if code is not None:
validation = self._validate(cat, item_dir)
validation = await self._validate(cat, item_dir)
result = {
"success": True,
@@ -685,15 +699,15 @@ class CategoryFileManager:
if code is not None and result["success"]:
if cat == Category.RESEARCH:
log.info(f"Auto-executing research script after edit: {name}")
result["execution"] = self.execute_research(name)
result["execution"] = await self.execute_research(name)
elif cat == Category.INDICATOR:
log.info(f"Auto-executing indicator test after edit: {name}")
result["execution"] = self._execute_indicator(item_dir)
result["execution"] = await self._execute_indicator(item_dir)
# Commit to git if code changed
if code is not None and result["success"]:
action = "patch" if patches is not None else "edit"
commit_hash = self.git.commit(f"{action}({category}): {name}")
commit_hash = await self.git.commit_async(f"{action}({category}): {name}")
if commit_hash:
result["revision"] = commit_hash
@@ -776,7 +790,7 @@ class CategoryFileManager:
return {"items": items}
def _validate(self, category: Category, item_dir: Path) -> dict[str, Any]:
async def _validate(self, category: Category, item_dir: Path) -> dict[str, Any]:
"""
Run category-specific validation harness.
@@ -793,13 +807,13 @@ class CategoryFileManager:
# Install required packages before validation
packages_installed = []
if install_packages and meta_path.exists():
if install_packages_async and meta_path.exists():
try:
metadata = json.loads(meta_path.read_text())
conda_packages = metadata.get("conda_packages", [])
if conda_packages:
log.info(f"Installing packages for validation: {conda_packages}")
install_result = install_packages(conda_packages, data_dir=self.data_dir)
install_result = await install_packages_async(conda_packages, data_dir=self.data_dir)
if install_result.get("success"):
packages_installed = install_result.get("installed", [])
if packages_installed:
@@ -811,11 +825,11 @@ class CategoryFileManager:
# Run validation
if category == Category.STRATEGY:
result = self._validate_strategy(impl_path)
result = await self._validate_strategy(impl_path)
elif category == Category.INDICATOR:
result = self._validate_indicator(impl_path)
result = await self._validate_indicator(impl_path)
elif category == Category.RESEARCH:
result = self._validate_research(impl_path, item_dir)
result = await self._validate_research(impl_path, item_dir)
else:
result = {"success": False, "error": f"No validator for category {category}"}
@@ -825,19 +839,18 @@ class CategoryFileManager:
return result
def _validate_strategy(self, impl_path: Path) -> dict[str, Any]:
async def _validate_strategy(self, impl_path: Path) -> dict[str, Any]:
"""
Validate a strategy by running it against synthetic OHLC data.
Runs strategy_harness.py in-process via a thread. Catches import errors,
Runs strategy_harness.py as a subprocess. Catches import errors,
runtime errors in evaluate(), and wrong class hierarchy — not just syntax.
"""
meta_path = impl_path.parent / "metadata.json"
return self._execute_strategy(impl_path.parent, timeout=45)
return await self._execute_strategy(impl_path.parent, timeout=45)
def _execute_strategy(self, item_dir: Path, timeout: int = 45) -> dict[str, Any]:
async def _execute_strategy(self, item_dir: Path, timeout: int = 45) -> dict[str, Any]:
"""
Run a strategy against synthetic OHLC data in-process via a thread.
Run a strategy against synthetic OHLC data via strategy_harness.py subprocess.
Returns:
dict with success, output (human-readable summary), trade_count, error
@@ -850,24 +863,26 @@ class CategoryFileManager:
if not meta_path.exists():
return {"success": False, "error": "metadata.json not found"}
from dexorder.tools.strategy_harness import run as _strategy_run
result = _run_inprocess(_strategy_run, impl_path, meta_path, timeout=timeout)
if result.get("_timeout"):
data = await run_subprocess_argv(
sys.executable, str(_STRATEGY_HARNESS), str(impl_path), str(meta_path),
timeout=timeout,
)
if data.get("_timeout"):
return {"success": False, "error": f"Strategy test timed out after {timeout}s"}
return result
if data.get("error") and not data.get("success"):
return {"success": False, "error": data.get("stderr") or "Harness failed"}
return data
def _validate_indicator(self, impl_path: Path) -> dict[str, Any]:
async def _validate_indicator(self, impl_path: Path) -> dict[str, Any]:
"""
Validate an indicator by running it against synthetic OHLC data.
Runs indicator_harness.py in-process via a thread. Catches import errors,
runtime errors, and wrong return types — not just syntax.
Runs indicator_harness.py in-process via a thread in the main process. Catches
import errors, runtime errors, and wrong return types — not just syntax.
"""
meta_path = impl_path.parent / "metadata.json"
return self._execute_indicator(impl_path.parent, timeout=30)
return await self._execute_indicator(impl_path.parent, timeout=30)
def _execute_indicator(self, item_dir: Path, timeout: int = 30) -> dict[str, Any]:
async def _execute_indicator(self, item_dir: Path, timeout: int = 30) -> dict[str, Any]:
"""
Run an indicator against synthetic OHLC data in-process via a thread.
@@ -883,29 +898,32 @@ class CategoryFileManager:
return {"success": False, "error": "metadata.json not found"}
from dexorder.tools.indicator_harness import run as _indicator_run
result = _run_inprocess(_indicator_run, impl_path, meta_path, timeout=timeout)
result = await run_in_thread(_indicator_run, impl_path, meta_path, timeout=timeout)
if result.get("_timeout"):
return {"success": False, "error": f"Indicator test timed out after {timeout}s"}
return result
def _run_research_harness(self, impl_path: Path, item_dir: Path, timeout: int = 300) -> dict[str, Any]:
async def _run_research_harness(self, impl_path: Path, item_dir: Path, timeout: int = 300) -> dict[str, Any]:
"""
Run a research script in-process via a thread and return captured results.
Run a research script via research_harness.py subprocess and return captured results.
Returns:
dict with stdout, stderr, images, error fields — or an error dict.
dict with stdout, stderr, images, error fields.
"""
from dexorder.tools.research_harness import run as _research_run
return _run_inprocess(_research_run, impl_path, item_dir, timeout=timeout)
return await run_subprocess_argv(
sys.executable, str(_RESEARCH_HARNESS), str(impl_path),
timeout=timeout,
cwd=item_dir,
)
def _validate_research(self, impl_path: Path, item_dir: Path) -> dict[str, Any]:
async def _validate_research(self, impl_path: Path, item_dir: Path) -> dict[str, Any]:
"""
Validate a research script.
Runs the script via the harness and captures output + pyplot images.
"""
data = self._run_research_harness(impl_path, item_dir, timeout=300)
data = await self._run_research_harness(impl_path, item_dir, timeout=300)
if data.get("_timeout"):
return {"success": False, "error": "Research script timeout"}
@@ -923,7 +941,7 @@ class CategoryFileManager:
"images": data["images"],
}
def execute_research(self, name: str) -> dict[str, Any]:
async def execute_research(self, name: str) -> dict[str, Any]:
"""
Execute a research script and return structured content with images.
@@ -944,7 +962,7 @@ class CategoryFileManager:
if not impl_path.exists():
return {"error": f"Implementation file not found for '{name}'"}
data = self._run_research_harness(impl_path, item_dir, timeout=300)
data = await self._run_research_harness(impl_path, item_dir, timeout=300)
if data.get("_timeout"):
log.error(f"execute_research '{name}': timeout")
@@ -995,7 +1013,7 @@ class CategoryFileManager:
return {"content": content}
def delete(self, category: str, name: str) -> dict[str, Any]:
async def delete(self, category: str, name: str) -> dict[str, Any]:
"""
Delete a category script directory and commit the removal to git.
@@ -1031,13 +1049,13 @@ class CategoryFileManager:
except Exception as e:
return {"success": False, "error": f"Failed to delete: {e}"}
commit_hash = self.git.commit(f"delete({category}): {name}")
commit_hash = await self.git.commit_async(f"delete({category}): {name}")
result: dict[str, Any] = {"success": True, "category": category, "name": name}
if commit_hash:
result["revision"] = commit_hash
return result
def git_log(
async def git_log(
self,
category: Optional[str] = None,
name: Optional[str] = None,
@@ -1061,10 +1079,10 @@ class CategoryFileManager:
path = get_category_path(self.src_dir, cat, name)
else:
path = self.src_dir / cat.value
entries = self.git.log(path=path, n=limit)
entries = await self.git.log_async(path=path, n=limit)
return {"success": True, "commits": entries}
def git_revert(self, revision: str, category: str, name: str) -> dict[str, Any]:
async def git_revert(self, revision: str, category: str, name: str) -> dict[str, Any]:
"""
Restore a category item to a previous git revision (creates a new commit).
@@ -1085,11 +1103,11 @@ class CategoryFileManager:
return {"success": False, "error": f"Item '{name}' not found in '{category}'"}
try:
commit_hash = self.git.restore(revision, path=item_dir)
commit_hash = await self.git.restore_async(revision, path=item_dir)
except RuntimeError as e:
return {"success": False, "error": str(e)}
validation = self._validate(cat, item_dir)
validation = await self._validate(cat, item_dir)
return {
"success": validation["success"],
"revision": commit_hash,

View File

@@ -119,11 +119,39 @@ def run(impl_path: Path, item_dir: Path) -> dict:
stdout_buf = io.StringIO()
stderr_buf = io.StringIO()
# Eagerly capture figures when user scripts call plt.close() so images are
# not lost when a script closes its figures before the end-of-run capture.
captured_images: list[dict] = []
def _capture_fig(fig) -> None:
buf = io.BytesIO()
fig.savefig(buf, format='png', dpi=100, bbox_inches='tight')
buf.seek(0)
captured_images.append({"format": "png", "data": base64.b64encode(buf.read()).decode('utf-8')})
buf.close()
_orig_plt_close = plt.close
def _patched_close(fig=None):
if fig is None:
# plt.close() with no argument closes only the current figure, so
# snapshot just that one before it is destroyed.
if plt.get_fignums():
_capture_fig(plt.gcf())
elif fig == 'all':
for fn in plt.get_fignums():
_capture_fig(plt.figure(fn))
else:
try:
_capture_fig(fig if hasattr(fig, 'savefig') else plt.figure(fig))
except Exception:
pass
_orig_plt_close(fig)
error_occurred = False
old_stdout, old_stderr = sys.stdout, sys.stderr
old_cwd = os.getcwd()
sys.stdout = stdout_buf
sys.stderr = stderr_buf
plt.close = _patched_close
try:
os.chdir(impl_path.parent)
@@ -136,22 +164,26 @@ def run(impl_path: Path, item_dir: Path) -> dict:
sys.stdout = old_stdout
sys.stderr = old_stderr
os.chdir(old_cwd)
plt.close = _orig_plt_close
stdout_output = stdout_buf.getvalue()
stderr_output = stderr_buf.getvalue()
# ---------------------------------------------------------------------------
# Capture matplotlib figures
# Capture any figures still open after script completion
# ---------------------------------------------------------------------------
images = []
images = captured_images
if not error_occurred:
already_seen = {img["data"] for img in images}
for fig_num in plt.get_fignums():
fig = plt.figure(fig_num)
buf = io.BytesIO()
fig.savefig(buf, format='png', dpi=100, bbox_inches='tight')
buf.seek(0)
images.append({"format": "png", "data": base64.b64encode(buf.read()).decode('utf-8')})
data = base64.b64encode(buf.read()).decode('utf-8')
buf.close()
if data not in already_seen:
images.append({"format": "png", "data": data})
plt.close('all')
return {
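
The effect on user scripts, sketched (the script body is illustrative): a figure closed mid-script is still returned as an image.

import matplotlib
matplotlib.use("Agg")  # headless backend, as assumed for the sandbox
import matplotlib.pyplot as plt

fig, ax = plt.subplots()
ax.plot([1, 2, 3], [1, 4, 9])
fig.savefig("curve.png")
plt.close(fig)  # the patched close snapshots the figure before destroying it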

View File

@@ -0,0 +1,182 @@
"""
subprocess_runner — non-blocking subprocess primitives for the MCP sandbox.
All three entrypoints return the same dict shape as the legacy _run_inprocess():
{
"error": bool,
"stdout": str,
"stderr": str,
"images": list, # always [] for non-research invocations
"_timeout": bool # present and True only on timeout
}
Callers can therefore pattern-match on {"_timeout", "error", "stdout", "stderr"}
uniformly regardless of whether the work ran in a subprocess or a thread.
"""
import asyncio
import json
import traceback
from pathlib import Path
from typing import Any, Callable
# ---------------------------------------------------------------------------
# Internal helpers
# ---------------------------------------------------------------------------
def _normalise(data: dict, stderr_fallback: str = "") -> dict:
"""Ensure the standard shape keys are present in a harness result dict."""
data.setdefault("error", False)
data.setdefault("stdout", "")
data.setdefault("stderr", stderr_fallback)
data.setdefault("images", [])
return data
def _err_dict(stderr: str = "", stdout: str = "") -> dict:
return {"error": True, "stdout": stdout, "stderr": stderr, "images": []}
def _timeout_dict() -> dict:
return {"_timeout": True, "error": True, "stdout": "", "stderr": "", "images": []}
# ---------------------------------------------------------------------------
# Primitive 1: run_subprocess_argv
#
# Non-blocking equivalent of:
# subprocess.run([sys.executable, harness, arg1, arg2, ...],
# capture_output=True, text=True, timeout=N, cwd=cwd)
#
# Used by: _execute_strategy, _run_research_harness
# ---------------------------------------------------------------------------
async def run_subprocess_argv(
*cmd: str,
timeout: int,
cwd: Path | None = None,
) -> dict:
"""
Spawn cmd as a subprocess, await completion, and return a normalised result dict.
stdout is expected to contain a JSON object written by the harness. It is
decoded and normalised to the standard shape. On JSON decode failure the
raw stdout text is preserved in "stdout" and error is set to True.
"""
try:
proc = await asyncio.create_subprocess_exec(
*cmd,
stdout=asyncio.subprocess.PIPE,
stderr=asyncio.subprocess.PIPE,
cwd=str(cwd) if cwd else None,
)
stdout_bytes, stderr_bytes = await asyncio.wait_for(
proc.communicate(), timeout=timeout
)
except asyncio.TimeoutError:
return _timeout_dict()
except Exception as exc:
return _err_dict(stderr=f"Harness launch failed: {exc}")
stdout_text = stdout_bytes.decode(errors="replace")
stderr_text = stderr_bytes.decode(errors="replace")
if proc.returncode != 0:
return _err_dict(
stderr=f"Harness exited {proc.returncode}:\n{stderr_text}",
stdout=stdout_text,
)
try:
data = json.loads(stdout_text)
return _normalise(data, stderr_fallback=stderr_text)
except json.JSONDecodeError:
return {"error": True, "stdout": stdout_text, "stderr": stderr_text, "images": []}
# ---------------------------------------------------------------------------
# Primitive 2: run_subprocess_stdin
#
# Non-blocking equivalent of the backtest pattern — JSON config fed via stdin.
# ---------------------------------------------------------------------------
async def run_subprocess_stdin(
*cmd: str,
stdin_data: bytes,
timeout: int,
) -> dict:
"""
Spawn cmd, write stdin_data to its stdin, await completion.
Returns the same normalised dict shape as run_subprocess_argv.
"""
try:
proc = await asyncio.create_subprocess_exec(
*cmd,
stdin=asyncio.subprocess.PIPE,
stdout=asyncio.subprocess.PIPE,
stderr=asyncio.subprocess.PIPE,
)
stdout_bytes, stderr_bytes = await asyncio.wait_for(
proc.communicate(stdin_data), timeout=timeout
)
except asyncio.TimeoutError:
return _timeout_dict()
except Exception as exc:
return _err_dict(stderr=f"Harness launch failed: {exc}")
stdout_text = stdout_bytes.decode(errors="replace")
stderr_text = stderr_bytes.decode(errors="replace")
if proc.returncode != 0:
return _err_dict(
stderr=f"Harness exited {proc.returncode}:\n{stderr_text}",
stdout=stdout_text,
)
try:
data = json.loads(stdout_text)
return _normalise(data, stderr_fallback=stderr_text)
except json.JSONDecodeError:
return {"error": True, "stdout": stdout_text, "stderr": stderr_text, "images": []}
# ---------------------------------------------------------------------------
# Primitive 3: run_in_thread
#
# Async wrapper around asyncio.to_thread so the event loop stays responsive
# while CPU-bound or blocking-IO callables run in a worker thread.
#
# Used by: _execute_indicator (in-process indicator harness)
# ---------------------------------------------------------------------------
async def run_in_thread(
fn: Callable,
*args: Any,
timeout: int,
) -> dict:
"""
Run fn(*args) in a thread pool worker and yield to the event loop while waiting.
On timeout the worker thread is abandoned (it keeps running in the background
until fn returns) and _timeout_dict() is returned.
On MemoryError or unexpected exception a standard error dict is returned.
The returned dict is normalised to the standard shape.
"""
from dexorder.memory_guard import cleanup_memory
try:
result = await asyncio.wait_for(
asyncio.to_thread(fn, *args),
timeout=timeout,
)
return _normalise(result)
except asyncio.TimeoutError:
return _timeout_dict()
except MemoryError:
cleanup_memory()
return _err_dict(
stderr="Script exceeded memory limit. Try reducing the data range or batch size."
)
except Exception:
return _err_dict(stderr=traceback.format_exc())
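
A sketch of the three primitives from an async caller (the harness path and callable are illustrative):

import asyncio
import sys
from dexorder.tools.subprocess_runner import (
    run_subprocess_argv, run_subprocess_stdin, run_in_thread,
)

def blocking_work(n: int) -> dict:
    return {"stdout": str(sum(range(n))), "error": False}

async def main() -> None:
    # argv-style harness (hypothetical script path)
    r1 = await run_subprocess_argv(sys.executable, "harness.py", timeout=30)
    # stdin-style harness fed a JSON config
    r2 = await run_subprocess_stdin(sys.executable, "harness.py",
                                    stdin_data=b"{}", timeout=30)
    # blocking callable in a worker thread, normalised to the same shape
    r3 = await run_in_thread(blocking_work, 1_000_000, timeout=10)
    for r in (r1, r2, r3):
        print(r.get("_timeout", False), r["error"], r["stdout"][:40])

asyncio.run(main())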