data pipeline refactor and fix
This commit is contained in:
4
.gitignore
vendored
4
.gitignore
vendored
@@ -1,6 +1,7 @@
|
||||
/backend.old/data
|
||||
/backend.old/uploads/
|
||||
chat/
|
||||
bin/create-all-users
|
||||
|
||||
# Environment variables
|
||||
.env
|
||||
@@ -114,6 +115,9 @@ deploy/k8s/prod/secrets/*.yaml
|
||||
# Dev environment image tags
|
||||
.dev-image-tag
|
||||
|
||||
# Dev gateway-config is generated from gateway-config.yaml.tpl by bin/dev
|
||||
deploy/k8s/dev/configs/gateway-config.yaml
|
||||
|
||||
# Protobuf copies (canonical files are in /protobuf/)
|
||||
flink/protobuf/
|
||||
relay/protobuf/
|
||||
|
||||
@@ -21,6 +21,10 @@ usage() {
|
||||
}
|
||||
|
||||
ENV="${1:-dev}"
|
||||
ARG_EMAIL="${2:-}"
|
||||
ARG_PASSWORD="${3:-}"
|
||||
ARG_NAME="${4:-}"
|
||||
ARG_LICENSE="${5:-}"
|
||||
|
||||
if [[ "$ENV" != "dev" && "$ENV" != "prod" ]]; then
|
||||
echo -e "${RED}Error: Environment must be 'dev' or 'prod'${NC}"
|
||||
@@ -44,16 +48,36 @@ if [ -z "$PG_POD" ]; then
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Prompt for credentials
|
||||
read -p "Email: " USER_EMAIL
|
||||
read -rs -p "Password (min 8 chars): " USER_PASSWORD
|
||||
echo ""
|
||||
# Get credentials — from args or interactively
|
||||
if [[ -n "$ARG_EMAIL" ]]; then
|
||||
USER_EMAIL="$ARG_EMAIL"
|
||||
else
|
||||
read -p "Email: " USER_EMAIL
|
||||
fi
|
||||
|
||||
if [[ -n "$ARG_PASSWORD" ]]; then
|
||||
USER_PASSWORD="$ARG_PASSWORD"
|
||||
else
|
||||
read -rs -p "Password (min 8 chars): " USER_PASSWORD
|
||||
echo ""
|
||||
fi
|
||||
|
||||
if [[ ${#USER_PASSWORD} -lt 8 ]]; then
|
||||
echo -e "${RED}✗ Password must be at least 8 characters${NC}"
|
||||
exit 1
|
||||
fi
|
||||
read -p "Display name: " USER_NAME
|
||||
read -p "License type [free|pro|enterprise] (default: pro): " LICENSE_TYPE
|
||||
|
||||
if [[ -n "$ARG_NAME" ]]; then
|
||||
USER_NAME="$ARG_NAME"
|
||||
else
|
||||
read -p "Display name: " USER_NAME
|
||||
fi
|
||||
|
||||
if [[ -n "$ARG_LICENSE" ]]; then
|
||||
LICENSE_TYPE="$ARG_LICENSE"
|
||||
else
|
||||
read -p "License type [free|pro|enterprise] (default: pro): " LICENSE_TYPE
|
||||
fi
|
||||
LICENSE_TYPE="${LICENSE_TYPE:-pro}"
|
||||
|
||||
# Check if user already exists
|
||||
|
||||
@@ -43,7 +43,7 @@ if [ "$PROJECT" == "dev" ]; then
|
||||
fi
|
||||
|
||||
if [ "$DEV" == "1" ]; then
|
||||
TAG="dev`date +%Y%m%d%H%M%S`"
|
||||
TAG="dev`date -u +%Y%m%d%H%M%S`"
|
||||
if [ "$1" != "" ]; then
|
||||
CONFIG=$1
|
||||
shift
|
||||
|
||||
158
bin/deploy-all
Executable file
158
bin/deploy-all
Executable file
@@ -0,0 +1,158 @@
|
||||
#!/usr/bin/env bash
|
||||
set -e
|
||||
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
ROOT_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"
|
||||
|
||||
# Colors
|
||||
RED='\033[0;31m'
|
||||
GREEN='\033[0;32m'
|
||||
YELLOW='\033[1;33m'
|
||||
BLUE='\033[0;34m'
|
||||
NC='\033[0m' # No Color
|
||||
|
||||
KUBECTL="kubectl --context=prod"
|
||||
CLEAR_SANDBOXES=0
|
||||
|
||||
usage() {
|
||||
echo "Usage: $0 [--sandboxes]"
|
||||
echo ""
|
||||
echo "Deploy all services to production. Does NOT update secrets (use bin/secret-update)."
|
||||
echo ""
|
||||
echo "Steps performed:"
|
||||
echo " 1. Apply base kustomize manifests (namespaces, RBAC, policies)"
|
||||
echo " 2. Apply infrastructure.yaml (statefulsets, deployments)"
|
||||
echo " 3. Run bin/config-update prod"
|
||||
echo " 4. Build and deploy all application images"
|
||||
echo " 5. Wait for rollouts"
|
||||
echo ""
|
||||
echo "Options:"
|
||||
echo " --sandboxes Delete sandbox Deployments and Services (PVCs are retained)."
|
||||
echo " The gateway will recreate sandboxes on next user login."
|
||||
echo ""
|
||||
exit 1
|
||||
}
|
||||
|
||||
for arg in "$@"; do
|
||||
case "$arg" in
|
||||
--sandboxes)
|
||||
CLEAR_SANDBOXES=1
|
||||
;;
|
||||
--help|-h)
|
||||
usage
|
||||
;;
|
||||
*)
|
||||
echo -e "${RED}Unknown argument: $arg${NC}"
|
||||
usage
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
echo -e "${YELLOW}╔══════════════════════════════════════════╗${NC}"
|
||||
echo -e "${YELLOW}║ PRODUCTION FULL DEPLOY ║${NC}"
|
||||
echo -e "${YELLOW}╚══════════════════════════════════════════╝${NC}"
|
||||
echo ""
|
||||
echo -e "${YELLOW}⚠️ This will update ALL production services.${NC}"
|
||||
echo -e "${YELLOW} Secrets are NOT updated (run bin/secret-update prod separately).${NC}"
|
||||
if [ "$CLEAR_SANDBOXES" == "1" ]; then
|
||||
echo -e "${YELLOW} Sandbox deployments will be DELETED (PVCs retained).${NC}"
|
||||
fi
|
||||
echo ""
|
||||
read -p "Are you sure you want to continue? (yes/no): " confirm
|
||||
if [[ "$confirm" != "yes" ]]; then
|
||||
echo "Aborted."
|
||||
exit 0
|
||||
fi
|
||||
|
||||
step() {
|
||||
echo ""
|
||||
echo -e "${BLUE}━━━ $1 ━━━${NC}"
|
||||
}
|
||||
|
||||
ok() {
|
||||
echo -e "${GREEN}✓${NC} $1"
|
||||
}
|
||||
|
||||
fail() {
|
||||
echo -e "${RED}✗ $1${NC}"
|
||||
exit 1
|
||||
}
|
||||
|
||||
# ── Step 1: Base kustomize manifests ─────────────────────────────────────────
|
||||
step "Step 1/5: Applying base kustomize manifests"
|
||||
cd "$ROOT_DIR"
|
||||
$KUBECTL apply -k deploy/k8s/prod/
|
||||
ok "Base manifests applied (namespaces, RBAC, policies, quotas)"
|
||||
|
||||
# ── Step 2: Infrastructure ────────────────────────────────────────────────────
|
||||
step "Step 2/5: Applying infrastructure.yaml"
|
||||
$KUBECTL -n ai apply -f deploy/k8s/prod/infrastructure.yaml
|
||||
ok "Infrastructure applied"
|
||||
|
||||
# ── Step 3: Configs ───────────────────────────────────────────────────────────
|
||||
step "Step 3/5: Updating configs"
|
||||
# config-update prod will prompt for confirmation; we already confirmed above,
|
||||
# so feed "yes" automatically via stdin.
|
||||
echo "yes" | "$SCRIPT_DIR/config-update" prod
|
||||
ok "Configs updated"
|
||||
|
||||
# ── Step 4: Build and deploy all application images ───────────────────────────
|
||||
step "Step 4/5: Building and deploying application images"
|
||||
echo ""
|
||||
|
||||
SERVICES=(gateway web sandbox lifecycle-sidecar flink relay ingestor)
|
||||
|
||||
for service in "${SERVICES[@]}"; do
|
||||
echo -e "${GREEN}→${NC} Deploying $service..."
|
||||
"$SCRIPT_DIR/deploy" "$service" prod
|
||||
ok "$service deployed"
|
||||
echo ""
|
||||
done
|
||||
|
||||
# ── Step 4b: Optionally clear sandbox deployments ─────────────────────────────
|
||||
if [ "$CLEAR_SANDBOXES" == "1" ]; then
|
||||
step "Step 4b: Clearing sandbox deployments"
|
||||
SANDBOX_DEPLOYS=$($KUBECTL -n sandbox get deployments -o name 2>/dev/null || true)
|
||||
SANDBOX_SVCS=$($KUBECTL -n sandbox get services -o name 2>/dev/null || true)
|
||||
|
||||
if [ -z "$SANDBOX_DEPLOYS" ]; then
|
||||
echo " No sandbox deployments found."
|
||||
else
|
||||
echo " Deleting sandbox deployments..."
|
||||
echo "$SANDBOX_DEPLOYS" | xargs $KUBECTL -n sandbox delete
|
||||
ok "Sandbox deployments deleted"
|
||||
fi
|
||||
|
||||
if [ -n "$SANDBOX_SVCS" ]; then
|
||||
echo " Deleting sandbox services..."
|
||||
echo "$SANDBOX_SVCS" | xargs $KUBECTL -n sandbox delete
|
||||
ok "Sandbox services deleted"
|
||||
fi
|
||||
|
||||
echo -e "${YELLOW} PVCs retained — gateway will recreate sandboxes on next login.${NC}"
|
||||
fi
|
||||
|
||||
# ── Step 5: Wait for rollouts ─────────────────────────────────────────────────
|
||||
step "Step 5/5: Waiting for rollouts"
|
||||
|
||||
ROLLOUTS=(
|
||||
"deployment/gateway"
|
||||
"deployment/ai-web"
|
||||
"deployment/relay"
|
||||
"deployment/ingestor"
|
||||
"deployment/flink-jobmanager"
|
||||
"deployment/flink-taskmanager"
|
||||
)
|
||||
|
||||
for r in "${ROLLOUTS[@]}"; do
|
||||
echo -e "${GREEN}→${NC} Waiting for $r..."
|
||||
$KUBECTL -n ai rollout status "$r" --timeout=180s || echo -e "${YELLOW} ⚠ $r did not become ready within 3 minutes${NC}"
|
||||
done
|
||||
|
||||
echo ""
|
||||
echo -e "${GREEN}╔══════════════════════════════════════════╗${NC}"
|
||||
echo -e "${GREEN}║ Deploy complete! ║${NC}"
|
||||
echo -e "${GREEN}╚══════════════════════════════════════════╝${NC}"
|
||||
echo ""
|
||||
echo " Verify: curl -I https://dexorder.ai/api/health"
|
||||
echo ""
|
||||
46
bin/dev
46
bin/dev
@@ -99,6 +99,12 @@ start_minikube() {
|
||||
fi
|
||||
}
|
||||
|
||||
generate_gateway_config_dev() {
|
||||
sed "s|SANDBOX_IMAGE_TAG|dexorder/ai-sandbox:$SANDBOX_TAG|g; s|SIDECAR_IMAGE_TAG|dexorder/ai-lifecycle-sidecar:$SIDECAR_TAG|g" \
|
||||
"$ROOT_DIR/deploy/k8s/dev/configs/gateway-config.yaml.tpl" \
|
||||
> "$ROOT_DIR/deploy/k8s/dev/configs/gateway-config.yaml"
|
||||
}
|
||||
|
||||
rebuild_images() {
|
||||
local service="${1:-all}"
|
||||
echo -e "${BLUE}Building custom images...${NC}"
|
||||
@@ -221,12 +227,7 @@ deploy_services() {
|
||||
# Update configs
|
||||
echo -e "${GREEN}→${NC} Updating configs..."
|
||||
|
||||
# Template gateway-config.yaml with actual image tags (backup first for safe restore)
|
||||
local _gw_bak
|
||||
_gw_bak=$(mktemp)
|
||||
cp "$ROOT_DIR/deploy/k8s/dev/configs/gateway-config.yaml" "$_gw_bak"
|
||||
sed -i "s|sandbox_image: dexorder/ai-sandbox:.*|sandbox_image: dexorder/ai-sandbox:$SANDBOX_TAG|g" "$ROOT_DIR/deploy/k8s/dev/configs/gateway-config.yaml"
|
||||
sed -i "s|sidecar_image: dexorder/ai-lifecycle-sidecar:.*|sidecar_image: dexorder/ai-lifecycle-sidecar:$SIDECAR_TAG|g" "$ROOT_DIR/deploy/k8s/dev/configs/gateway-config.yaml"
|
||||
generate_gateway_config_dev
|
||||
|
||||
"$SCRIPT_DIR/config-update" dev
|
||||
|
||||
@@ -264,10 +265,6 @@ EOF
|
||||
# Clean up the appended image tags from kustomization.yaml
|
||||
sed -i '/# Image tags (added by bin\/dev)/,$d' kustomization.yaml
|
||||
|
||||
# Restore gateway-config.yaml from backup
|
||||
cp "$_gw_bak" "$ROOT_DIR/deploy/k8s/dev/configs/gateway-config.yaml"
|
||||
rm "$_gw_bak"
|
||||
|
||||
echo -e "${GREEN}✓ Services deployed${NC}"
|
||||
|
||||
echo ""
|
||||
@@ -525,6 +522,9 @@ deep_restart() {
|
||||
;;
|
||||
esac
|
||||
|
||||
echo -e "${GREEN}→${NC} Rebuilding application images..."
|
||||
rebuild_images
|
||||
|
||||
echo -e "${GREEN}→${NC} Redeploying services..."
|
||||
deploy_services
|
||||
|
||||
@@ -589,11 +589,7 @@ deploy_service() {
|
||||
gateway)
|
||||
image_name="dexorder/ai-gateway"
|
||||
image_tag="$GATEWAY_TAG"
|
||||
# Also need to template gateway-config.yaml (backup for safe restore)
|
||||
_gw_bak_single=$(mktemp)
|
||||
cp "$ROOT_DIR/deploy/k8s/dev/configs/gateway-config.yaml" "$_gw_bak_single"
|
||||
sed -i "s|sandbox_image: dexorder/ai-sandbox:.*|sandbox_image: dexorder/ai-sandbox:$SANDBOX_TAG|g" "$ROOT_DIR/deploy/k8s/dev/configs/gateway-config.yaml"
|
||||
sed -i "s|sidecar_image: dexorder/ai-lifecycle-sidecar:.*|sidecar_image: dexorder/ai-lifecycle-sidecar:$SIDECAR_TAG|g" "$ROOT_DIR/deploy/k8s/dev/configs/gateway-config.yaml"
|
||||
generate_gateway_config_dev
|
||||
"$SCRIPT_DIR/config-update" dev
|
||||
;;
|
||||
web)
|
||||
@@ -623,12 +619,6 @@ EOF
|
||||
# Clean up the appended image tags from kustomization.yaml
|
||||
sed -i '/# Image tags (added by bin\/dev)/,$d' kustomization.yaml
|
||||
|
||||
# Restore gateway-config.yaml from backup if we modified it
|
||||
if [ "$service" == "gateway" ]; then
|
||||
cp "$_gw_bak_single" "$ROOT_DIR/deploy/k8s/dev/configs/gateway-config.yaml"
|
||||
rm "$_gw_bak_single"
|
||||
fi
|
||||
|
||||
echo -e "${GREEN}✓ $service deployed${NC}"
|
||||
}
|
||||
|
||||
@@ -713,15 +703,10 @@ case "$COMMAND" in
|
||||
|
||||
cd "$ROOT_DIR/deploy/k8s/dev"
|
||||
|
||||
# Template gateway-config if gateway is in the list (backup for safe restore)
|
||||
_ms_gw_bak=""
|
||||
# Regenerate gateway-config if gateway is in the list
|
||||
for svc in "${deploy_services_list[@]}"; do
|
||||
if [ "$svc" == "gateway" ]; then
|
||||
_ms_gw_bak=$(mktemp)
|
||||
cp "$ROOT_DIR/deploy/k8s/dev/configs/gateway-config.yaml" "$_ms_gw_bak"
|
||||
sed -i "s|sandbox_image: dexorder/ai-sandbox:.*|sandbox_image: dexorder/ai-sandbox:$SANDBOX_TAG|g" "$ROOT_DIR/deploy/k8s/dev/configs/gateway-config.yaml"
|
||||
sed -i "s|sidecar_image: dexorder/ai-lifecycle-sidecar:.*|sidecar_image: dexorder/ai-lifecycle-sidecar:$SIDECAR_TAG|g" "$ROOT_DIR/deploy/k8s/dev/configs/gateway-config.yaml"
|
||||
"$SCRIPT_DIR/config-update" dev
|
||||
generate_gateway_config_dev
|
||||
break
|
||||
fi
|
||||
done
|
||||
@@ -744,11 +729,6 @@ case "$COMMAND" in
|
||||
|
||||
sed -i '/# Image tags (added by bin\/dev)/,$d' kustomization.yaml
|
||||
|
||||
# Restore gateway-config from backup if we modified it
|
||||
if [ -n "$_ms_gw_bak" ]; then
|
||||
cp "$_ms_gw_bak" "$ROOT_DIR/deploy/k8s/dev/configs/gateway-config.yaml"
|
||||
rm "$_ms_gw_bak"
|
||||
fi
|
||||
fi
|
||||
|
||||
# Handle sandbox separately
|
||||
|
||||
23
bin/init
23
bin/init
@@ -45,6 +45,29 @@ else
|
||||
MCP_URL="http://localhost:8080/mcp"
|
||||
fi
|
||||
|
||||
# ---------- MinIO Bucket Initialization ----------
|
||||
|
||||
echo ""
|
||||
echo -e "${BLUE}=== MinIO Storage Setup ===${NC}"
|
||||
echo ""
|
||||
|
||||
echo -e "${BLUE}Waiting for MinIO pod...${NC}"
|
||||
$KUBECTL wait --for=condition=ready --timeout=120s pod -l app=minio 2>/dev/null || {
|
||||
echo -e "${YELLOW}⚠️ MinIO not ready after 120s, skipping bucket setup${NC}"
|
||||
}
|
||||
|
||||
MINIO_POD=$($KUBECTL get pods -l app=minio -o jsonpath='{.items[0].metadata.name}' 2>/dev/null)
|
||||
if [ -n "$MINIO_POD" ]; then
|
||||
echo -e "${GREEN}→${NC} Ensuring warehouse bucket exists..."
|
||||
MINIO_USER=$($KUBECTL exec "$MINIO_POD" -- sh -c 'echo $MINIO_ROOT_USER' 2>/dev/null | tr -d '\r')
|
||||
MINIO_PASS=$($KUBECTL exec "$MINIO_POD" -- sh -c 'echo $MINIO_ROOT_PASSWORD' 2>/dev/null | tr -d '\r')
|
||||
$KUBECTL exec "$MINIO_POD" -- mc alias set local http://localhost:9000 "$MINIO_USER" "$MINIO_PASS" > /dev/null 2>&1
|
||||
$KUBECTL exec "$MINIO_POD" -- mc mb --ignore-existing local/warehouse > /dev/null 2>&1
|
||||
echo -e "${GREEN}✓ Warehouse bucket ready${NC}"
|
||||
else
|
||||
echo -e "${YELLOW}⚠️ MinIO pod not found, skipping bucket setup${NC}"
|
||||
fi
|
||||
|
||||
# ---------- Schema Initialization ----------
|
||||
|
||||
echo ""
|
||||
|
||||
@@ -56,8 +56,8 @@ data:
|
||||
namespace: sandbox
|
||||
service_namespace: default
|
||||
in_cluster: true
|
||||
sandbox_image: dexorder/ai-sandbox:dev20260409143116
|
||||
sidecar_image: dexorder/ai-lifecycle-sidecar:dev20260408103634
|
||||
sandbox_image: SANDBOX_IMAGE_TAG
|
||||
sidecar_image: SIDECAR_IMAGE_TAG
|
||||
storage_class: standard
|
||||
image_pull_policy: Never # For minikube dev - use local images
|
||||
|
||||
@@ -1,9 +1,8 @@
|
||||
# CCXT Ingestor Configuration
|
||||
|
||||
# Relay ZMQ endpoints (relay is the well-known gateway)
|
||||
flink_hostname: relay
|
||||
ingestor_work_port: 5555 # SUB - receives DataRequest with exchange prefix
|
||||
# Note: No response port needed - async architecture via Kafka!
|
||||
# Flink IngestorBroker (ROUTER) endpoint
|
||||
flink_hostname: flink-jobmanager
|
||||
ingestor_broker_port: 5567
|
||||
|
||||
# Supported exchanges (subscribe to these prefixes)
|
||||
supported_exchanges:
|
||||
|
||||
@@ -455,6 +455,14 @@ spec:
|
||||
protocol: TCP
|
||||
port: 5561
|
||||
targetPort: 5561
|
||||
- name: zmq-client-pull
|
||||
protocol: TCP
|
||||
port: 5566
|
||||
targetPort: 5566
|
||||
- name: zmq-ingestor-broker
|
||||
protocol: TCP
|
||||
port: 5567
|
||||
targetPort: 5567
|
||||
type: ClusterIP
|
||||
---
|
||||
apiVersion: apps/v1
|
||||
@@ -583,14 +591,6 @@ spec:
|
||||
selector:
|
||||
app: relay
|
||||
ports:
|
||||
- name: work-queue
|
||||
protocol: TCP
|
||||
port: 5555
|
||||
targetPort: 5555
|
||||
- name: responses
|
||||
protocol: TCP
|
||||
port: 5556
|
||||
targetPort: 5556
|
||||
- name: market-data
|
||||
protocol: TCP
|
||||
port: 5558
|
||||
@@ -620,10 +620,6 @@ spec:
|
||||
image: dexorder/ai-relay
|
||||
imagePullPolicy: Never
|
||||
ports:
|
||||
- containerPort: 5555
|
||||
name: work-queue
|
||||
- containerPort: 5556
|
||||
name: responses
|
||||
- containerPort: 5558
|
||||
name: market-data
|
||||
- containerPort: 5559
|
||||
@@ -657,9 +653,9 @@ spec:
|
||||
app: ingestor
|
||||
spec:
|
||||
initContainers:
|
||||
- name: wait-for-relay
|
||||
- name: wait-for-flink
|
||||
image: busybox:1.36
|
||||
command: ['sh', '-c', 'until nc -z relay 5555; do echo waiting for relay; sleep 2; done;']
|
||||
command: ['sh', '-c', 'until nc -z flink-jobmanager 5567; do echo waiting for flink broker; sleep 2; done;']
|
||||
- name: wait-for-kafka
|
||||
image: busybox:1.36
|
||||
command: ['sh', '-c', 'until nc -z kafka 9092; do echo waiting for kafka; sleep 2; done;']
|
||||
|
||||
@@ -30,6 +30,7 @@ data:
|
||||
namespace: "trading"
|
||||
# S3 endpoint for MinIO in default namespace
|
||||
s3_endpoint: "http://minio.default.svc.cluster.local:9000"
|
||||
s3_region: "us-east-1"
|
||||
|
||||
relay:
|
||||
endpoint: "tcp://relay.default.svc.cluster.local:5559"
|
||||
|
||||
@@ -1,9 +1,8 @@
|
||||
# CCXT Ingestor Configuration
|
||||
|
||||
# Relay ZMQ endpoints (relay is the well-known gateway)
|
||||
flink_hostname: relay
|
||||
ingestor_work_port: 5555 # SUB - receives DataRequest with exchange prefix
|
||||
# Note: No response port needed - async architecture via Kafka!
|
||||
# Flink IngestorBroker (ROUTER) endpoint
|
||||
flink_hostname: flink-jobmanager
|
||||
ingestor_broker_port: 5567
|
||||
|
||||
# Supported exchanges (subscribe to these prefixes)
|
||||
supported_exchanges:
|
||||
|
||||
@@ -451,6 +451,14 @@ spec:
|
||||
protocol: TCP
|
||||
port: 5561
|
||||
targetPort: 5561
|
||||
- name: zmq-client-pull
|
||||
protocol: TCP
|
||||
port: 5566
|
||||
targetPort: 5566
|
||||
- name: zmq-ingestor-broker
|
||||
protocol: TCP
|
||||
port: 5567
|
||||
targetPort: 5567
|
||||
type: ClusterIP
|
||||
---
|
||||
apiVersion: apps/v1
|
||||
@@ -579,14 +587,6 @@ spec:
|
||||
selector:
|
||||
app: relay
|
||||
ports:
|
||||
- name: work-queue
|
||||
protocol: TCP
|
||||
port: 5555
|
||||
targetPort: 5555
|
||||
- name: responses
|
||||
protocol: TCP
|
||||
port: 5556
|
||||
targetPort: 5556
|
||||
- name: market-data
|
||||
protocol: TCP
|
||||
port: 5558
|
||||
@@ -616,10 +616,6 @@ spec:
|
||||
image: dexorder/ai-relay
|
||||
imagePullPolicy: Always
|
||||
ports:
|
||||
- containerPort: 5555
|
||||
name: work-queue
|
||||
- containerPort: 5556
|
||||
name: responses
|
||||
- containerPort: 5558
|
||||
name: market-data
|
||||
- containerPort: 5559
|
||||
@@ -653,9 +649,9 @@ spec:
|
||||
app: ingestor
|
||||
spec:
|
||||
initContainers:
|
||||
- name: wait-for-relay
|
||||
- name: wait-for-flink
|
||||
image: busybox:1.36
|
||||
command: ['sh', '-c', 'until nc -z relay 5555; do echo waiting for relay; sleep 2; done;']
|
||||
command: ['sh', '-c', 'until nc -z flink-jobmanager 5567; do echo waiting for flink broker; sleep 2; done;']
|
||||
- name: wait-for-kafka
|
||||
image: busybox:1.36
|
||||
command: ['sh', '-c', 'until nc -z kafka 9092; do echo waiting for kafka; sleep 2; done;']
|
||||
|
||||
@@ -22,6 +22,7 @@ data:
|
||||
catalog_uri: "http://iceberg-catalog.ai.svc.cluster.local:8181"
|
||||
namespace: "trading"
|
||||
s3_endpoint: "http://minio.ai.svc.cluster.local:9000"
|
||||
s3_region: "us-east-1"
|
||||
|
||||
relay:
|
||||
endpoint: "tcp://relay.ai.svc.cluster.local:5559"
|
||||
|
||||
@@ -2,7 +2,8 @@ package com.dexorder.flink;
|
||||
|
||||
import com.dexorder.flink.config.AppConfig;
|
||||
import com.dexorder.flink.iceberg.SchemaInitializer;
|
||||
import com.dexorder.flink.ingestor.IngestorWorkQueue;
|
||||
import com.dexorder.flink.ingestor.IngestorBroker;
|
||||
import com.dexorder.flink.ingestor.RealtimeSubscriptionManager;
|
||||
import com.dexorder.flink.kafka.TopicManager;
|
||||
import com.dexorder.flink.publisher.HistoryNotificationForwarder;
|
||||
import com.dexorder.flink.publisher.HistoryNotificationFunction;
|
||||
@@ -10,6 +11,11 @@ import com.dexorder.flink.publisher.OHLCBatchWrapper;
|
||||
import com.dexorder.flink.publisher.OHLCBatchDeserializer;
|
||||
import com.dexorder.flink.publisher.MarketWrapper;
|
||||
import com.dexorder.flink.publisher.MarketDeserializer;
|
||||
import com.dexorder.flink.publisher.RealtimeBar;
|
||||
import com.dexorder.flink.publisher.RealtimeBarFunction;
|
||||
import com.dexorder.flink.publisher.RealtimeBarPublisher;
|
||||
import com.dexorder.flink.publisher.TickWrapper;
|
||||
import com.dexorder.flink.publisher.TickDeserializer;
|
||||
import com.dexorder.flink.sink.HistoricalBatchWriter;
|
||||
import com.dexorder.flink.sink.SymbolMetadataWriter;
|
||||
import com.dexorder.flink.zmq.ZmqChannelManager;
|
||||
@@ -83,11 +89,16 @@ public class TradingFlinkApp {
|
||||
catalogProps
|
||||
);
|
||||
|
||||
String warehouse = config.getString("iceberg_warehouse", "s3://warehouse/");
|
||||
String warehouseBucket = warehouse.replaceFirst("^s3://", "").split("/")[0];
|
||||
|
||||
org.apache.iceberg.catalog.Catalog catalog = catalogLoader.loadCatalog();
|
||||
try {
|
||||
SchemaInitializer schemaInitializer = new SchemaInitializer(
|
||||
catalog,
|
||||
config.getIcebergNamespace()
|
||||
config.getIcebergNamespace(),
|
||||
config.getString("s3_endpoint", "http://minio:9000"),
|
||||
warehouseBucket
|
||||
);
|
||||
schemaInitializer.initializeSchemas();
|
||||
} finally {
|
||||
@@ -107,20 +118,28 @@ public class TradingFlinkApp {
|
||||
zmqManager.initializeChannels();
|
||||
LOG.info("ZeroMQ channels initialized");
|
||||
|
||||
// Initialize history notification forwarder (runs in job manager)
|
||||
// Binds PULL socket to receive notifications from task managers, forwards to MARKET_DATA_PUB
|
||||
// Initialize ingestor broker — manages ROUTER/DEALER work queue for all ingestors
|
||||
IngestorBroker broker = new IngestorBroker(zmqManager);
|
||||
broker.start();
|
||||
LOG.info("IngestorBroker started");
|
||||
|
||||
// Initialize realtime subscription manager — owns MARKET_DATA_PUB socket exclusively,
|
||||
// detects XPUB subscription events, and calls broker for realtime job lifecycle.
|
||||
// Other components publish via subscriptionManager.enqueuePublish() (thread-safe).
|
||||
RealtimeSubscriptionManager subscriptionManager = new RealtimeSubscriptionManager(zmqManager, broker);
|
||||
subscriptionManager.start();
|
||||
LOG.info("RealtimeSubscriptionManager started");
|
||||
|
||||
// Initialize history notification forwarder (runs in job manager).
|
||||
// Binds PULL socket to receive notifications from task managers, enqueues them for
|
||||
// publication via RealtimeSubscriptionManager (sole owner of MARKET_DATA_PUB).
|
||||
HistoryNotificationForwarder notificationForwarder = new HistoryNotificationForwarder(
|
||||
config.getNotificationPullPort(),
|
||||
zmqManager.getSocket(ZmqChannelManager.Channel.MARKET_DATA_PUB)
|
||||
subscriptionManager::enqueuePublish
|
||||
);
|
||||
notificationForwarder.start();
|
||||
LOG.info("History notification forwarder started on port {}", config.getNotificationPullPort());
|
||||
|
||||
// Initialize ingestor work queue
|
||||
IngestorWorkQueue workQueue = new IngestorWorkQueue(zmqManager);
|
||||
workQueue.start();
|
||||
LOG.info("Ingestor work queue started");
|
||||
|
||||
// Set up Flink streaming environment
|
||||
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
|
||||
|
||||
@@ -224,8 +243,37 @@ public class TradingFlinkApp {
|
||||
|
||||
LOG.info("Symbol metadata pipeline configured: SymbolMetadataWriter -> Iceberg -> METADATA_UPDATE notification");
|
||||
|
||||
// Realtime tick pipeline: Kafka market-tick → OHLC bars → ZMQ notify → clients
|
||||
KafkaSource<TickWrapper> tickSource = KafkaSource.<TickWrapper>builder()
|
||||
.setBootstrapServers(config.getKafkaBootstrapServers())
|
||||
.setTopics(config.getKafkaTickTopic())
|
||||
.setGroupId("flink-tick-consumer")
|
||||
.setStartingOffsets(OffsetsInitializer.latest())
|
||||
.setValueOnlyDeserializer(new TickDeserializer())
|
||||
.build();
|
||||
|
||||
DataStream<TickWrapper> tickStream = env
|
||||
.fromSource(tickSource, WatermarkStrategy.noWatermarks(), "Tick Kafka Source")
|
||||
.filter(t -> t != null)
|
||||
.setParallelism(1);
|
||||
|
||||
// Aggregate ticks into OHLC bars for each configured period.
|
||||
// keyBy ticker so all ticks for a ticker land on the same slot and accumulate together.
|
||||
int[] periods = config.getRealtimePeriods();
|
||||
|
||||
DataStream<RealtimeBar> barStream = tickStream
|
||||
.keyBy(TickWrapper::getTicker)
|
||||
.flatMap(new RealtimeBarFunction(periods))
|
||||
.setParallelism(1);
|
||||
|
||||
barStream.addSink(new RealtimeBarPublisher(notificationEndpoint))
|
||||
.setParallelism(1)
|
||||
.name("RealtimeBarPublisher");
|
||||
|
||||
LOG.info("Realtime tick pipeline configured: market-tick → OHLC bars → clients (periods={})",
|
||||
java.util.Arrays.toString(periods));
|
||||
|
||||
// TODO: Set up CEP patterns and triggers
|
||||
// TODO: Set up realtime tick processing
|
||||
|
||||
LOG.info("Flink job configured, starting execution");
|
||||
|
||||
@@ -233,15 +281,10 @@ public class TradingFlinkApp {
|
||||
Runtime.getRuntime().addShutdownHook(new Thread(() -> {
|
||||
LOG.info("Shutting down Trading Flink Application");
|
||||
try {
|
||||
// Stop work queue
|
||||
workQueue.stop();
|
||||
|
||||
// Stop notification forwarder
|
||||
notificationForwarder.close();
|
||||
|
||||
// Close ZMQ channels
|
||||
subscriptionManager.stop();
|
||||
broker.stop();
|
||||
zmqManager.close();
|
||||
|
||||
LOG.info("Shutdown complete");
|
||||
} catch (Exception e) {
|
||||
LOG.error("Error during shutdown", e);
|
||||
|
||||
@@ -91,14 +91,20 @@ public class AppConfig {
|
||||
}
|
||||
|
||||
// ZMQ port getters
|
||||
public int getIngestorWorkQueuePort() {
|
||||
return getInt("zmq_ingestor_work_queue_port", 5555);
|
||||
}
|
||||
|
||||
public int getMarketDataPubPort() {
|
||||
return getInt("zmq_market_data_pub_port", 5558);
|
||||
}
|
||||
|
||||
/** Port where Flink's IngestorBroker binds a PULL socket to receive requests from relay PUSH */
|
||||
public int getFlinkRequestPullPort() {
|
||||
return getInt("zmq_flink_request_pull_port", 5566);
|
||||
}
|
||||
|
||||
/** Port where Flink's IngestorBroker binds a ROUTER for ingestor DEALER connections */
|
||||
public int getIngestorBrokerPort() {
|
||||
return getInt("zmq_ingestor_broker_port", 5567);
|
||||
}
|
||||
|
||||
public String getBindAddress() {
|
||||
return getString("zmq_bind_address", "tcp://*");
|
||||
}
|
||||
@@ -112,6 +118,20 @@ public class AppConfig {
|
||||
return getString("kafka_tick_topic", "market-tick");
|
||||
}
|
||||
|
||||
/**
|
||||
* Comma-separated OHLC period lengths in seconds for realtime bar computation.
|
||||
* Default covers common chart periods: 1m, 5m, 15m, 1h, 4h, 1d.
|
||||
*/
|
||||
public int[] getRealtimePeriods() {
|
||||
String raw = getString("realtime_periods", "60,300,900,3600,14400,86400");
|
||||
String[] parts = raw.split(",");
|
||||
int[] periods = new int[parts.length];
|
||||
for (int i = 0; i < parts.length; i++) {
|
||||
periods[i] = Integer.parseInt(parts[i].trim());
|
||||
}
|
||||
return periods;
|
||||
}
|
||||
|
||||
public String getKafkaOhlcTopic() {
|
||||
return getString("kafka_ohlc_topic", "market-ohlc");
|
||||
}
|
||||
|
||||
@@ -9,8 +9,16 @@ import org.apache.iceberg.catalog.TableIdentifier;
|
||||
import org.apache.iceberg.types.Types;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import software.amazon.awssdk.auth.credentials.DefaultCredentialsProvider;
|
||||
import software.amazon.awssdk.regions.Region;
|
||||
import software.amazon.awssdk.services.s3.S3Client;
|
||||
import software.amazon.awssdk.services.s3.S3Configuration;
|
||||
import software.amazon.awssdk.services.s3.model.CreateBucketRequest;
|
||||
import software.amazon.awssdk.services.s3.model.HeadBucketRequest;
|
||||
import software.amazon.awssdk.services.s3.model.NoSuchBucketException;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.net.URI;
|
||||
|
||||
import static org.apache.iceberg.types.Types.NestedField.optional;
|
||||
import static org.apache.iceberg.types.Types.NestedField.required;
|
||||
@@ -26,10 +34,14 @@ public class SchemaInitializer {
|
||||
|
||||
private final Catalog catalog;
|
||||
private final String namespace;
|
||||
private final String s3Endpoint;
|
||||
private final String warehouseBucket;
|
||||
|
||||
public SchemaInitializer(Catalog catalog, String namespace) {
|
||||
public SchemaInitializer(Catalog catalog, String namespace, String s3Endpoint, String warehouseBucket) {
|
||||
this.catalog = catalog;
|
||||
this.namespace = namespace;
|
||||
this.s3Endpoint = s3Endpoint;
|
||||
this.warehouseBucket = warehouseBucket;
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -40,6 +52,9 @@ public class SchemaInitializer {
|
||||
public void initializeSchemas() throws IOException {
|
||||
LOG.info("Initializing Iceberg schemas in namespace: {}", namespace);
|
||||
|
||||
// Ensure S3 bucket exists before attempting to create tables
|
||||
ensureS3BucketExists();
|
||||
|
||||
// Ensure namespace exists
|
||||
ensureNamespaceExists();
|
||||
|
||||
@@ -52,6 +67,36 @@ public class SchemaInitializer {
|
||||
LOG.info("Schema initialization completed successfully");
|
||||
}
|
||||
|
||||
/**
|
||||
* Ensure the S3 warehouse bucket exists, creating it if necessary.
|
||||
* Runs before any table creation so a fresh MinIO deployment doesn't crash Flink.
|
||||
*/
|
||||
private void ensureS3BucketExists() {
|
||||
if (s3Endpoint == null || warehouseBucket == null || warehouseBucket.isEmpty()) {
|
||||
LOG.warn("S3 endpoint or warehouse bucket not configured, skipping bucket check");
|
||||
return;
|
||||
}
|
||||
LOG.info("Ensuring S3 bucket '{}' exists at {}", warehouseBucket, s3Endpoint);
|
||||
try (S3Client s3 = S3Client.builder()
|
||||
.endpointOverride(URI.create(s3Endpoint))
|
||||
.region(Region.of("us-east-1"))
|
||||
.serviceConfiguration(S3Configuration.builder().pathStyleAccessEnabled(true).build())
|
||||
.credentialsProvider(DefaultCredentialsProvider.create())
|
||||
.build()) {
|
||||
try {
|
||||
s3.headBucket(HeadBucketRequest.builder().bucket(warehouseBucket).build());
|
||||
LOG.info("S3 bucket '{}' already exists", warehouseBucket);
|
||||
} catch (NoSuchBucketException e) {
|
||||
LOG.warn("S3 bucket '{}' not found — creating it now", warehouseBucket);
|
||||
s3.createBucket(CreateBucketRequest.builder().bucket(warehouseBucket).build());
|
||||
LOG.info("Created S3 bucket '{}'", warehouseBucket);
|
||||
}
|
||||
} catch (Exception e) {
|
||||
LOG.error("Failed to ensure S3 bucket '{}' exists at {}", warehouseBucket, s3Endpoint, e);
|
||||
throw new RuntimeException("S3 bucket initialization failed for: " + warehouseBucket, e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Ensure the namespace exists in the catalog.
|
||||
*/
|
||||
|
||||
@@ -0,0 +1,503 @@
|
||||
package com.dexorder.flink.ingestor;
|
||||
|
||||
import com.dexorder.flink.zmq.ZmqChannelManager;
|
||||
import com.dexorder.proto.DataRequest;
|
||||
import com.dexorder.proto.RealtimeParams;
|
||||
import com.dexorder.proto.SubmitHistoricalRequest;
|
||||
import com.dexorder.proto.WorkComplete;
|
||||
import com.dexorder.proto.WorkHeartbeat;
|
||||
import com.dexorder.proto.WorkReject;
|
||||
import com.dexorder.proto.WorkStop;
|
||||
import com.dexorder.proto.WorkerReady;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.zeromq.ZMQ;
|
||||
|
||||
import java.util.ArrayDeque;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Deque;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Queue;
|
||||
import java.util.Set;
|
||||
import java.util.UUID;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
import java.util.concurrent.ConcurrentLinkedQueue;
|
||||
|
||||
/**
|
||||
* LRU-style work broker for ingestors.
|
||||
*
|
||||
* Ingestors connect via DEALER to the ROUTER socket on port 5567. They register with READY,
|
||||
* are dispatched WORK messages, and respond with COMPLETE (historical) or HEARTBEAT (realtime).
|
||||
* If a heartbeat times out the job is re-queued and dispatched to another available worker.
|
||||
*
|
||||
* Also receives SubmitHistoricalRequest messages forwarded by the relay on the PULL socket (5566).
|
||||
*
|
||||
* Message type IDs (ZMQ framing, not Kafka):
|
||||
* 0x10 SubmitHistoricalRequest (relay → Flink via PULL, same as client wire type)
|
||||
* 0x20 WorkerReady (ingestor → Flink)
|
||||
* 0x21 WorkComplete (ingestor → Flink)
|
||||
* 0x22 WorkHeartbeat (ingestor → Flink)
|
||||
* 0x23 WorkReject (ingestor → Flink)
|
||||
* 0x01 DataRequest/WorkAssign (Flink → ingestor via ROUTER)
|
||||
* 0x25 WorkStop (Flink → ingestor via ROUTER)
|
||||
*/
|
||||
public class IngestorBroker implements AutoCloseable {
|
||||
private static final Logger LOG = LoggerFactory.getLogger(IngestorBroker.class);
|
||||
|
||||
private static final byte PROTOCOL_VERSION = 0x01;
|
||||
private static final byte MSG_TYPE_SUBMIT_REQUEST = 0x10;
|
||||
private static final byte MSG_TYPE_WORKER_READY = 0x20;
|
||||
private static final byte MSG_TYPE_WORK_COMPLETE = 0x21;
|
||||
private static final byte MSG_TYPE_WORK_HEARTBEAT = 0x22;
|
||||
private static final byte MSG_TYPE_WORK_REJECT = 0x23;
|
||||
private static final byte MSG_TYPE_WORK_ASSIGN = 0x01; // DataRequest type on wire
|
||||
private static final byte MSG_TYPE_WORK_STOP = 0x25;
|
||||
|
||||
/** Re-queue realtime job if no heartbeat received within this window (ms) */
|
||||
private static final long HEARTBEAT_TIMEOUT_MS = 25_000;
|
||||
/** Re-queue historical job if not completed within this window (ms) */
|
||||
private static final long HISTORICAL_TIMEOUT_MS = 60_000;
|
||||
|
||||
private final ZmqChannelManager zmqManager;
|
||||
private volatile boolean running;
|
||||
private Thread brokerThread;
|
||||
|
||||
// ── Worker tracking ──────────────────────────────────────────────────────
|
||||
|
||||
/** Workers ready to accept a job, in LRU order (head = least recently used) */
|
||||
private final Deque<WorkerInfo> freeWorkers = new ArrayDeque<>();
|
||||
|
||||
/** Jobs waiting for a compatible free worker */
|
||||
private final Queue<DataRequest> pendingJobs = new ArrayDeque<>();
|
||||
|
||||
/** Jobs currently executing on a worker */
|
||||
private final Map<String, ActiveJob> activeJobs = new ConcurrentHashMap<>();
|
||||
|
||||
/** Worker identity → supported exchanges (set once on READY) */
|
||||
private final Map<String, WorkerInfo> knownWorkers = new ConcurrentHashMap<>();
|
||||
|
||||
// ── Thread-safe inbound queue from RealtimeSubscriptionManager ───────────
|
||||
|
||||
private final Queue<DataRequest> externalSubmissions = new ConcurrentLinkedQueue<>();
|
||||
|
||||
public IngestorBroker(ZmqChannelManager zmqManager) {
|
||||
this.zmqManager = zmqManager;
|
||||
}
|
||||
|
||||
public void start() {
|
||||
if (running) {
|
||||
LOG.warn("IngestorBroker already running");
|
||||
return;
|
||||
}
|
||||
running = true;
|
||||
brokerThread = new Thread(this::brokerLoop, "IngestorBroker-Thread");
|
||||
brokerThread.setDaemon(false);
|
||||
brokerThread.start();
|
||||
LOG.info("IngestorBroker started");
|
||||
}
|
||||
|
||||
public void stop() {
|
||||
running = false;
|
||||
if (brokerThread != null) {
|
||||
brokerThread.interrupt();
|
||||
try {
|
||||
brokerThread.join(5000);
|
||||
} catch (InterruptedException e) {
|
||||
Thread.currentThread().interrupt();
|
||||
}
|
||||
}
|
||||
LOG.info("IngestorBroker stopped");
|
||||
}
|
||||
|
||||
/**
|
||||
* Submit a realtime data request from outside the broker thread (thread-safe).
|
||||
* Called by RealtimeSubscriptionManager when subscription ref count goes 0→1.
|
||||
*/
|
||||
public void submitRealtimeRequest(String ticker) {
|
||||
String jobId = UUID.randomUUID().toString();
|
||||
DataRequest request = DataRequest.newBuilder()
|
||||
.setRequestId(jobId)
|
||||
.setJobId(jobId)
|
||||
.setType(DataRequest.RequestType.REALTIME_TICKS)
|
||||
.setTicker(ticker)
|
||||
.setRealtime(RealtimeParams.newBuilder()
|
||||
.setIncludeTicks(true)
|
||||
.setIncludeOhlc(false)
|
||||
.build())
|
||||
.build();
|
||||
externalSubmissions.add(request);
|
||||
LOG.info("Enqueued realtime request: ticker={}, jobId={}", ticker, jobId);
|
||||
}
|
||||
|
||||
/**
|
||||
* Stop all realtime jobs for a ticker (called when last subscriber leaves).
|
||||
* Thread-safe — posts a stop marker via externalSubmissions is complex; instead we
|
||||
* directly find and stop active jobs. Protected by ConcurrentHashMap.
|
||||
*/
|
||||
public void stopRealtimeJobsForTicker(String ticker) {
|
||||
List<String> toStop = new ArrayList<>();
|
||||
for (Map.Entry<String, ActiveJob> entry : activeJobs.entrySet()) {
|
||||
if (entry.getValue().ticker.equals(ticker) &&
|
||||
entry.getValue().type == DataRequest.RequestType.REALTIME_TICKS) {
|
||||
toStop.add(entry.getKey());
|
||||
}
|
||||
}
|
||||
for (String jobId : toStop) {
|
||||
ActiveJob job = activeJobs.remove(jobId);
|
||||
if (job != null) {
|
||||
sendStop(job.workerIdentity, jobId);
|
||||
LOG.info("Sent STOP to ingestor: ticker={}, jobId={}", ticker, jobId);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ── Broker loop ──────────────────────────────────────────────────────────
|
||||
|
||||
private void brokerLoop() {
|
||||
ZMQ.Socket pullSocket = zmqManager.getSocket(ZmqChannelManager.Channel.CLIENT_REQUEST);
|
||||
ZMQ.Socket routerSocket = zmqManager.getSocket(ZmqChannelManager.Channel.INGESTOR_BROKER);
|
||||
|
||||
ZMQ.Poller poller = zmqManager.createPoller(2);
|
||||
poller.register(pullSocket, ZMQ.Poller.POLLIN);
|
||||
poller.register(routerSocket, ZMQ.Poller.POLLIN);
|
||||
|
||||
LOG.info("IngestorBroker loop running");
|
||||
|
||||
while (running) {
|
||||
try {
|
||||
// Drain external submissions (realtime requests from subscription manager)
|
||||
DataRequest ext;
|
||||
while ((ext = externalSubmissions.poll()) != null) {
|
||||
enqueueJob(ext);
|
||||
}
|
||||
|
||||
// Poll sockets (100ms timeout)
|
||||
poller.poll(100);
|
||||
|
||||
if (poller.pollin(0)) {
|
||||
handleClientRequest(pullSocket);
|
||||
}
|
||||
|
||||
if (poller.pollin(1)) {
|
||||
handleWorkerMessage(routerSocket);
|
||||
}
|
||||
|
||||
// Check for heartbeat / completion timeouts
|
||||
checkTimeouts();
|
||||
|
||||
} catch (Exception e) {
|
||||
if (running) {
|
||||
LOG.error("Error in broker loop", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
LOG.info("IngestorBroker loop exited");
|
||||
}
|
||||
|
||||
/** Receive a SubmitHistoricalRequest forwarded by relay and enqueue it. */
|
||||
private void handleClientRequest(ZMQ.Socket pullSocket) {
|
||||
byte[] versionFrame = pullSocket.recv(ZMQ.DONTWAIT);
|
||||
if (versionFrame == null) return;
|
||||
if (!pullSocket.hasReceiveMore()) return;
|
||||
byte[] messageFrame = pullSocket.recv(0);
|
||||
if (messageFrame == null || messageFrame.length < 2) return;
|
||||
|
||||
if (versionFrame.length != 1 || versionFrame[0] != PROTOCOL_VERSION) {
|
||||
LOG.warn("Bad protocol version on PULL socket");
|
||||
return;
|
||||
}
|
||||
|
||||
byte msgType = messageFrame[0];
|
||||
byte[] payload = Arrays.copyOfRange(messageFrame, 1, messageFrame.length);
|
||||
|
||||
if (msgType != MSG_TYPE_SUBMIT_REQUEST) {
|
||||
LOG.warn("Unexpected message type on PULL socket: 0x{}", Integer.toHexString(msgType & 0xFF));
|
||||
return;
|
||||
}
|
||||
|
||||
try {
|
||||
SubmitHistoricalRequest req = SubmitHistoricalRequest.parseFrom(payload);
|
||||
String jobId = UUID.randomUUID().toString();
|
||||
DataRequest dataRequest = DataRequest.newBuilder()
|
||||
.setRequestId(req.getRequestId())
|
||||
.setJobId(jobId)
|
||||
.setType(DataRequest.RequestType.HISTORICAL_OHLC)
|
||||
.setTicker(req.getTicker())
|
||||
.setHistorical(com.dexorder.proto.HistoricalParams.newBuilder()
|
||||
.setStartTime(req.getStartTime())
|
||||
.setEndTime(req.getEndTime())
|
||||
.setPeriodSeconds(req.getPeriodSeconds())
|
||||
.build())
|
||||
.setClientId(req.hasClientId() ? req.getClientId() : "")
|
||||
.build();
|
||||
enqueueJob(dataRequest);
|
||||
LOG.info("Received historical request from relay: request_id={}, ticker={}", req.getRequestId(), req.getTicker());
|
||||
} catch (Exception e) {
|
||||
LOG.error("Failed to parse SubmitHistoricalRequest from relay", e);
|
||||
}
|
||||
}
|
||||
|
||||
/** Receive and dispatch a message from an ingestor DEALER. */
|
||||
private void handleWorkerMessage(ZMQ.Socket routerSocket) {
|
||||
// ROUTER frame layout: [identity][empty][version][type+payload]
|
||||
byte[] identity = routerSocket.recv(ZMQ.DONTWAIT);
|
||||
if (identity == null) return;
|
||||
if (!routerSocket.hasReceiveMore()) return;
|
||||
routerSocket.recv(0); // empty delimiter
|
||||
if (!routerSocket.hasReceiveMore()) return;
|
||||
byte[] versionFrame = routerSocket.recv(0);
|
||||
if (!routerSocket.hasReceiveMore()) return;
|
||||
byte[] messageFrame = routerSocket.recv(0);
|
||||
|
||||
if (versionFrame == null || versionFrame.length != 1 || versionFrame[0] != PROTOCOL_VERSION) {
|
||||
LOG.warn("Bad protocol version from ingestor");
|
||||
return;
|
||||
}
|
||||
if (messageFrame == null || messageFrame.length < 1) return;
|
||||
|
||||
byte msgType = messageFrame[0];
|
||||
byte[] payload = Arrays.copyOfRange(messageFrame, 1, messageFrame.length);
|
||||
String identityKey = bytesToHex(identity);
|
||||
|
||||
try {
|
||||
switch (msgType & 0xFF) {
|
||||
case 0x20: handleWorkerReady(identity, identityKey, payload); break;
|
||||
case 0x21: handleWorkComplete(identityKey, payload); break;
|
||||
case 0x22: handleWorkHeartbeat(identityKey, payload); break;
|
||||
case 0x23: handleWorkReject(identityKey, payload); break;
|
||||
default:
|
||||
LOG.warn("Unknown message type from ingestor: 0x{}", Integer.toHexString(msgType & 0xFF));
|
||||
}
|
||||
} catch (Exception e) {
|
||||
LOG.error("Error handling worker message type 0x{}", Integer.toHexString(msgType & 0xFF), e);
|
||||
}
|
||||
}
|
||||
|
||||
private void handleWorkerReady(byte[] identity, String identityKey, byte[] payload) throws Exception {
|
||||
WorkerReady ready = WorkerReady.parseFrom(payload);
|
||||
Set<String> exchanges = new HashSet<>(ready.getExchangesList());
|
||||
|
||||
WorkerInfo worker = knownWorkers.computeIfAbsent(identityKey,
|
||||
k -> new WorkerInfo(identity, identityKey, exchanges));
|
||||
worker.exchanges = exchanges; // update in case re-READY with different config
|
||||
worker.identity = identity;
|
||||
|
||||
if (!freeWorkers.contains(worker)) {
|
||||
freeWorkers.addLast(worker);
|
||||
}
|
||||
LOG.info("Ingestor READY: id={}, exchanges={}, freeWorkers={}", identityKey, exchanges, freeWorkers.size());
|
||||
|
||||
dispatchPending();
|
||||
}
|
||||
|
||||
private void handleWorkComplete(String identityKey, byte[] payload) throws Exception {
|
||||
WorkComplete complete = WorkComplete.parseFrom(payload);
|
||||
String jobId = complete.getJobId();
|
||||
|
||||
ActiveJob job = activeJobs.remove(jobId);
|
||||
if (job == null) {
|
||||
LOG.warn("COMPLETE for unknown jobId={}", jobId);
|
||||
} else {
|
||||
LOG.info("Job COMPLETE: jobId={}, ticker={}, success={}", jobId, job.ticker, complete.getSuccess());
|
||||
}
|
||||
|
||||
// Worker is free again
|
||||
WorkerInfo worker = knownWorkers.get(identityKey);
|
||||
if (worker != null) {
|
||||
freeWorkers.addLast(worker);
|
||||
dispatchPending();
|
||||
}
|
||||
}
|
||||
|
||||
private void handleWorkHeartbeat(String identityKey, byte[] payload) throws Exception {
|
||||
WorkHeartbeat hb = WorkHeartbeat.parseFrom(payload);
|
||||
String jobId = hb.getJobId();
|
||||
|
||||
ActiveJob job = activeJobs.get(jobId);
|
||||
if (job != null) {
|
||||
job.lastHeartbeat = System.currentTimeMillis();
|
||||
} else {
|
||||
LOG.warn("HEARTBEAT for unknown jobId={} from worker={}", jobId, identityKey);
|
||||
}
|
||||
}
|
||||
|
||||
private void handleWorkReject(String identityKey, byte[] payload) throws Exception {
|
||||
WorkReject reject = WorkReject.parseFrom(payload);
|
||||
String jobId = reject.getJobId();
|
||||
LOG.warn("Job REJECTED by worker={}: jobId={}, reason={}", identityKey, jobId, reject.getReason());
|
||||
|
||||
ActiveJob job = activeJobs.remove(jobId);
|
||||
if (job != null) {
|
||||
// Re-queue with fresh job_id so a different ingestor may pick it up
|
||||
DataRequest requeued = job.request.toBuilder()
|
||||
.setJobId(UUID.randomUUID().toString())
|
||||
.build();
|
||||
pendingJobs.add(requeued);
|
||||
}
|
||||
|
||||
// Worker is still free (it rejected, not crashed)
|
||||
WorkerInfo worker = knownWorkers.get(identityKey);
|
||||
if (worker != null) {
|
||||
freeWorkers.addLast(worker);
|
||||
dispatchPending();
|
||||
}
|
||||
}
|
||||
|
||||
// ── Dispatch ─────────────────────────────────────────────────────────────
|
||||
|
||||
private void enqueueJob(DataRequest request) {
|
||||
// Check if we can immediately dispatch
|
||||
WorkerInfo worker = findFreeWorker(exchangeOf(request.getTicker()));
|
||||
if (worker != null) {
|
||||
dispatch(worker, request);
|
||||
} else {
|
||||
pendingJobs.add(request);
|
||||
LOG.debug("No free worker for {}, queued (pendingJobs={})", request.getTicker(), pendingJobs.size());
|
||||
}
|
||||
}
|
||||
|
||||
private void dispatchPending() {
|
||||
Queue<DataRequest> remaining = new ArrayDeque<>();
|
||||
DataRequest job;
|
||||
while ((job = pendingJobs.poll()) != null) {
|
||||
WorkerInfo worker = findFreeWorker(exchangeOf(job.getTicker()));
|
||||
if (worker != null) {
|
||||
dispatch(worker, job);
|
||||
} else {
|
||||
remaining.add(job);
|
||||
}
|
||||
}
|
||||
pendingJobs.addAll(remaining);
|
||||
}
|
||||
|
||||
private void dispatch(WorkerInfo worker, DataRequest request) {
|
||||
freeWorkers.remove(worker);
|
||||
|
||||
try {
|
||||
byte[] protoBytes = request.toByteArray();
|
||||
boolean sent = zmqManager.sendToWorker(worker.identity, PROTOCOL_VERSION, MSG_TYPE_WORK_ASSIGN, protoBytes);
|
||||
if (!sent) {
|
||||
LOG.error("Failed to dispatch job to worker={}, re-queuing", worker.identityKey);
|
||||
freeWorkers.addLast(worker);
|
||||
pendingJobs.add(request);
|
||||
return;
|
||||
}
|
||||
|
||||
ActiveJob active = new ActiveJob(worker.identity, worker.identityKey,
|
||||
request, request.getTicker(), request.getType());
|
||||
activeJobs.put(request.getJobId(), active);
|
||||
|
||||
LOG.info("Dispatched job: jobId={}, ticker={}, type={}, worker={}",
|
||||
request.getJobId(), request.getTicker(), request.getType(), worker.identityKey);
|
||||
} catch (Exception e) {
|
||||
LOG.error("Error dispatching job", e);
|
||||
freeWorkers.addLast(worker);
|
||||
}
|
||||
}
|
||||
|
||||
private void sendStop(byte[] workerIdentity, String jobId) {
|
||||
try {
|
||||
WorkStop stop = WorkStop.newBuilder().setJobId(jobId).build();
|
||||
zmqManager.sendToWorker(workerIdentity, PROTOCOL_VERSION, MSG_TYPE_WORK_STOP, stop.toByteArray());
|
||||
} catch (Exception e) {
|
||||
LOG.error("Error sending STOP for jobId={}", jobId, e);
|
||||
}
|
||||
}
|
||||
|
||||
// ── Timeout checking ─────────────────────────────────────────────────────
|
||||
|
||||
private void checkTimeouts() {
|
||||
long now = System.currentTimeMillis();
|
||||
List<String> timedOut = new ArrayList<>();
|
||||
|
||||
for (Map.Entry<String, ActiveJob> entry : activeJobs.entrySet()) {
|
||||
ActiveJob job = entry.getValue();
|
||||
long timeout = job.type == DataRequest.RequestType.REALTIME_TICKS
|
||||
? HEARTBEAT_TIMEOUT_MS : HISTORICAL_TIMEOUT_MS;
|
||||
if (now - job.lastHeartbeat > timeout) {
|
||||
timedOut.add(entry.getKey());
|
||||
}
|
||||
}
|
||||
|
||||
for (String jobId : timedOut) {
|
||||
ActiveJob job = activeJobs.remove(jobId);
|
||||
if (job == null) continue;
|
||||
LOG.warn("Job timed out (no heartbeat/completion): jobId={}, ticker={}, type={}, worker={}",
|
||||
jobId, job.ticker, job.type, job.workerIdentityKey);
|
||||
|
||||
// Re-queue with a new job_id
|
||||
DataRequest requeued = job.request.toBuilder()
|
||||
.setJobId(UUID.randomUUID().toString())
|
||||
.build();
|
||||
pendingJobs.add(requeued);
|
||||
dispatchPending();
|
||||
}
|
||||
}
|
||||
|
||||
// ── Helpers ──────────────────────────────────────────────────────────────
|
||||
|
||||
/** Extract exchange name from ticker, e.g. "BTC/USDT.BINANCE" → "BINANCE" */
|
||||
private static String exchangeOf(String ticker) {
|
||||
int dot = ticker.lastIndexOf('.');
|
||||
return dot >= 0 ? ticker.substring(dot + 1).toUpperCase() : "";
|
||||
}
|
||||
|
||||
/** Find and remove a free worker that supports the given exchange. */
|
||||
private WorkerInfo findFreeWorker(String exchange) {
|
||||
for (WorkerInfo w : freeWorkers) {
|
||||
if (exchange.isEmpty() || w.exchanges.contains(exchange)) {
|
||||
freeWorkers.remove(w);
|
||||
return w;
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
private static String bytesToHex(byte[] bytes) {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
for (byte b : bytes) sb.append(String.format("%02x", b));
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() {
|
||||
stop();
|
||||
}
|
||||
|
||||
// ── Inner types ──────────────────────────────────────────────────────────
|
||||
|
||||
private static class WorkerInfo {
|
||||
byte[] identity;
|
||||
final String identityKey;
|
||||
Set<String> exchanges;
|
||||
|
||||
WorkerInfo(byte[] identity, String identityKey, Set<String> exchanges) {
|
||||
this.identity = identity;
|
||||
this.identityKey = identityKey;
|
||||
this.exchanges = exchanges;
|
||||
}
|
||||
}
|
||||
|
||||
private static class ActiveJob {
|
||||
final byte[] workerIdentity;
|
||||
final String workerIdentityKey;
|
||||
final DataRequest request;
|
||||
final String ticker;
|
||||
final DataRequest.RequestType type;
|
||||
long lastHeartbeat;
|
||||
|
||||
ActiveJob(byte[] workerIdentity, String workerIdentityKey,
|
||||
DataRequest request, String ticker, DataRequest.RequestType type) {
|
||||
this.workerIdentity = workerIdentity;
|
||||
this.workerIdentityKey = workerIdentityKey;
|
||||
this.request = request;
|
||||
this.ticker = ticker;
|
||||
this.type = type;
|
||||
this.lastHeartbeat = System.currentTimeMillis();
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -119,7 +119,7 @@ public class IngestorWorkQueue {
|
||||
String exchangePrefix = extractExchangePrefix(ticker);
|
||||
|
||||
boolean sent = zmqManager.sendTopicMessage(
|
||||
ZmqChannelManager.Channel.INGESTOR_WORK_QUEUE,
|
||||
ZmqChannelManager.Channel.INGESTOR_BROKER,
|
||||
exchangePrefix,
|
||||
PROTOCOL_VERSION,
|
||||
MSG_TYPE_DATA_REQUEST,
|
||||
|
||||
@@ -0,0 +1,204 @@
|
||||
package com.dexorder.flink.ingestor;
|
||||
|
||||
import com.dexorder.flink.zmq.ZmqChannelManager;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.zeromq.ZMQ;

import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
|
||||
|
||||
/**
|
||||
* Monitors XPUB subscription events from the relay and manages realtime ingestor lifecycle.
|
||||
*
|
||||
* This class is the <em>sole owner</em> of the MARKET_DATA_PUB XPUB socket. All outbound
|
||||
* publishes from other threads (e.g., HistoryNotificationForwarder, RealtimeOHLCPublisher)
|
||||
* must go through {@link #enqueuePublish(byte[]...)} so they are sent from the single loop
|
||||
* thread — ZMQ sockets are not thread-safe.
|
||||
*
|
||||
* Topic format: {@code {ticker}|ohlc:{period_seconds}}
|
||||
* Example: {@code BTC/USDT.BINANCE|ohlc:60}
|
||||
*
|
||||
* Reference counting:
|
||||
* tickerRefs — across all periods for a ticker; 0→1 triggers ingestor activation
|
||||
* topicRefs — per (ticker, period); consulted by RealtimeOHLCPublisher to filter output
|
||||
*/
|
||||
public class RealtimeSubscriptionManager implements AutoCloseable {
|
||||
private static final Logger LOG = LoggerFactory.getLogger(RealtimeSubscriptionManager.class);
|
||||
|
||||
private static final Pattern TOPIC_PATTERN = Pattern.compile("^(.+)\\|ohlc:(\\d+)$");
|
||||
|
||||
private final ZmqChannelManager zmqManager;
|
||||
private final ZMQ.Socket xpubSocket;
|
||||
private final IngestorBroker broker;
|
||||
|
||||
/** Per-ticker reference count (across all subscribed periods for that ticker) */
|
||||
private final Map<String, Integer> tickerRefs = new HashMap<>();
|
||||
|
||||
/** Per-topic reference count (ticker|ohlc:period → subscriber count) */
|
||||
private final Map<String, Integer> topicRefs = new HashMap<>();
|
||||
|
||||
/**
|
||||
* Thread-safe outbound publish queue.
|
||||
* Each entry is one multi-frame message: {@code byte[][] frames}.
|
||||
*/
|
||||
private final ConcurrentLinkedQueue<byte[][]> publishQueue = new ConcurrentLinkedQueue<>();
|
||||
|
||||
private volatile boolean running;
|
||||
private Thread thread;
|
||||
|
||||
public RealtimeSubscriptionManager(ZmqChannelManager zmqManager, IngestorBroker broker) {
|
||||
this.zmqManager = zmqManager;
|
||||
this.xpubSocket = zmqManager.getSocket(ZmqChannelManager.Channel.MARKET_DATA_PUB);
|
||||
this.broker = broker;
|
||||
}
|
||||
|
||||
/**
|
||||
* Queue a multi-frame message for publication on MARKET_DATA_PUB.
|
||||
* Thread-safe — may be called from any thread (HistoryNotificationForwarder,
|
||||
* RealtimeOHLCPublisher, etc.).
|
||||
*/
|
||||
public void enqueuePublish(byte[]... frames) {
|
||||
publishQueue.add(frames);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the current subscriber count for a topic.
|
||||
* Thread-safe for reads (value is written only from the loop thread but read from others).
|
||||
*/
|
||||
public int getTopicRefCount(String topic) {
|
||||
return topicRefs.getOrDefault(topic, 0);
|
||||
}
|
||||
|
||||
public void start() {
|
||||
if (running) {
|
||||
LOG.warn("RealtimeSubscriptionManager already running");
|
||||
return;
|
||||
}
|
||||
running = true;
|
||||
thread = new Thread(this::subscriptionLoop, "RealtimeSubscriptionManager");
|
||||
thread.setDaemon(false);
|
||||
thread.start();
|
||||
LOG.info("RealtimeSubscriptionManager started");
|
||||
}
|
||||
|
||||
public void stop() {
|
||||
running = false;
|
||||
if (thread != null) {
|
||||
thread.interrupt();
|
||||
try {
|
||||
thread.join(5000);
|
||||
} catch (InterruptedException e) {
|
||||
Thread.currentThread().interrupt();
|
||||
}
|
||||
}
|
||||
LOG.info("RealtimeSubscriptionManager stopped");
|
||||
}
|
||||
|
||||
private void subscriptionLoop() {
|
||||
// Build a poller so we can block-wait rather than busy-spin
|
||||
ZMQ.Poller poller = zmqManager.createPoller(1);
|
||||
poller.register(xpubSocket, ZMQ.Poller.POLLIN);
|
||||
|
||||
LOG.info("RealtimeSubscriptionManager loop running");
|
||||
|
||||
while (running) {
|
||||
try {
|
||||
// 1. Flush any queued outbound messages before blocking
|
||||
byte[][] frames;
|
||||
while ((frames = publishQueue.poll()) != null) {
|
||||
sendFrames(frames);
|
||||
}
|
||||
|
||||
// 2. Wait up to 50ms for a subscription event
|
||||
poller.poll(50);
|
||||
|
||||
// 3. Drain all available subscription events
|
||||
if (poller.pollin(0)) {
|
||||
byte[] event;
|
||||
while ((event = xpubSocket.recv(ZMQ.DONTWAIT)) != null) {
|
||||
if (event.length > 0) {
|
||||
processSubscriptionEvent(event);
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
if (running) {
|
||||
LOG.error("Error in subscription loop", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
LOG.info("RealtimeSubscriptionManager loop exited");
|
||||
}
|
||||
|
||||
private void sendFrames(byte[][] frames) {
|
||||
for (int i = 0; i < frames.length; i++) {
|
||||
if (i < frames.length - 1) {
|
||||
xpubSocket.sendMore(frames[i]);
|
||||
} else {
|
||||
xpubSocket.send(frames[i], 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void processSubscriptionEvent(byte[] event) {
|
||||
// XPUB subscription frame: first byte is 0x01 (subscribe) or 0x00 (unsubscribe);
|
||||
// remaining bytes are the raw topic string.
|
||||
boolean isSubscribe = event[0] == 0x01;
|
||||
String topic = new String(event, 1, event.length - 1, ZMQ.CHARSET);
|
||||
|
||||
Matcher m = TOPIC_PATTERN.matcher(topic);
|
||||
if (!m.matches()) {
|
||||
// Not a realtime OHLC topic — e.g. RESPONSE: or HISTORY_READY: prefixes
|
||||
LOG.debug("Ignoring subscription event for non-realtime topic: action={}, topic={}",
|
||||
isSubscribe ? "subscribe" : "unsubscribe", topic);
|
||||
return;
|
||||
}
|
||||
|
||||
String ticker = m.group(1);
|
||||
LOG.info("Subscription event: action={}, topic={}", isSubscribe ? "subscribe" : "unsubscribe", topic);
|
||||
|
||||
if (isSubscribe) {
|
||||
handleSubscribe(ticker, topic);
|
||||
} else {
|
||||
handleUnsubscribe(ticker, topic);
|
||||
}
|
||||
}
|
||||
|
||||
private void handleSubscribe(String ticker, String topic) {
|
||||
int newTopicRef = topicRefs.merge(topic, 1, Integer::sum);
|
||||
LOG.debug("topicRefs[{}]={}", topic, newTopicRef);
|
||||
|
||||
int newTickerRef = tickerRefs.merge(ticker, 1, Integer::sum);
|
||||
if (newTickerRef == 1) {
|
||||
LOG.info("First subscriber for ticker={} — submitting realtime request", ticker);
|
||||
broker.submitRealtimeRequest(ticker);
|
||||
}
|
||||
LOG.debug("tickerRefs[{}]={}", ticker, newTickerRef);
|
||||
}
|
||||
|
||||
private void handleUnsubscribe(String ticker, String topic) {
|
||||
int newTopicRef = topicRefs.merge(topic, -1, Integer::sum);
|
||||
if (newTopicRef <= 0) {
|
||||
topicRefs.remove(topic);
|
||||
}
|
||||
LOG.debug("topicRefs[{}]={}", topic, newTopicRef);
|
||||
|
||||
int newTickerRef = tickerRefs.merge(ticker, -1, Integer::sum);
|
||||
if (newTickerRef <= 0) {
|
||||
tickerRefs.remove(ticker);
|
||||
LOG.info("Last subscriber for ticker={} left — stopping realtime jobs", ticker);
|
||||
broker.stopRealtimeJobsForTicker(ticker);
|
||||
}
|
||||
LOG.debug("tickerRefs[{}]={}", ticker, newTickerRef);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() {
|
||||
stop();
|
||||
}
|
||||
}
|
||||
@@ -6,14 +6,24 @@ import org.zeromq.SocketType;
|
||||
import org.zeromq.ZContext;
|
||||
import org.zeromq.ZMQ;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.function.Consumer;
|
||||
|
||||
/**
|
||||
* Runs in the job manager. Pulls notifications from task managers (via PUSH/PULL)
|
||||
* and republishes them on the MARKET_DATA_PUB socket that the relay subscribes to.
|
||||
* and enqueues them for publication on MARKET_DATA_PUB via the provided publish callback.
|
||||
*
|
||||
* The publish callback must be thread-safe (e.g., RealtimeSubscriptionManager.enqueuePublish).
|
||||
* Direct socket access is avoided here because the MARKET_DATA_PUB XPUB socket is owned
|
||||
* exclusively by RealtimeSubscriptionManager to satisfy ZMQ's single-thread-per-socket rule.
|
||||
*
|
||||
* Flow:
|
||||
* Task manager HistoryNotificationPublisher → PUSH
|
||||
* ↓
|
||||
* Job manager HistoryNotificationForwarder PULL → MARKET_DATA_PUB
|
||||
* Job manager HistoryNotificationForwarder PULL → publishCallback (queue)
|
||||
* ↓ (RealtimeSubscriptionManager loop)
|
||||
* MARKET_DATA_PUB
|
||||
* ↓
|
||||
* Relay (XSUB) → Relay (XPUB) → Clients
|
||||
*/
|
||||
@@ -21,17 +31,17 @@ public class HistoryNotificationForwarder implements AutoCloseable {
|
||||
private static final Logger LOG = LoggerFactory.getLogger(HistoryNotificationForwarder.class);
|
||||
|
||||
private final ZMQ.Socket pullSocket;
|
||||
private final ZMQ.Socket pubSocket;
|
||||
private final Consumer<byte[][]> publishCallback;
|
||||
private final ZContext context;
|
||||
private volatile boolean running = true;
|
||||
private Thread thread;
|
||||
|
||||
/**
|
||||
* @param pullPort Port to bind PULL socket on (task managers connect PUSH here)
|
||||
* @param pubSocket Existing MARKET_DATA_PUB socket from ZmqChannelManager
|
||||
* @param pullPort Port to bind PULL socket on (task managers connect PUSH here)
|
||||
* @param publishCallback Thread-safe callback to enqueue outbound multi-frame messages
|
||||
*/
|
||||
public HistoryNotificationForwarder(int pullPort, ZMQ.Socket pubSocket) {
|
||||
this.pubSocket = pubSocket;
|
||||
public HistoryNotificationForwarder(int pullPort, Consumer<byte[][]> publishCallback) {
|
||||
this.publishCallback = publishCallback;
|
||||
this.context = new ZContext();
|
||||
this.pullSocket = context.createSocket(SocketType.PULL);
|
||||
this.pullSocket.setRcvHWM(10000);
|
||||
@@ -53,32 +63,24 @@ public class HistoryNotificationForwarder implements AutoCloseable {
|
||||
pullSocket.setReceiveTimeOut(200); // ms, so we can check running flag
|
||||
|
||||
while (running) {
|
||||
// Receive all frames of a multi-part message and forward to PUB
|
||||
byte[] frame = pullSocket.recv(0);
|
||||
if (frame == null) {
|
||||
continue; // timeout, check running flag
|
||||
continue; // timeout — check running flag
|
||||
}
|
||||
|
||||
boolean more = pullSocket.hasReceiveMore();
|
||||
if (more) {
|
||||
pubSocket.sendMore(frame);
|
||||
} else {
|
||||
pubSocket.send(frame, 0);
|
||||
continue;
|
||||
}
|
||||
// Collect all frames of the multi-part message, then enqueue atomically
|
||||
List<byte[]> frames = new ArrayList<>();
|
||||
frames.add(frame);
|
||||
|
||||
// Receive remaining frames
|
||||
while (more) {
|
||||
frame = pullSocket.recv(0);
|
||||
more = pullSocket.hasReceiveMore();
|
||||
if (more) {
|
||||
pubSocket.sendMore(frame);
|
||||
} else {
|
||||
pubSocket.send(frame, 0);
|
||||
while (pullSocket.hasReceiveMore()) {
|
||||
byte[] next = pullSocket.recv(0);
|
||||
if (next != null) {
|
||||
frames.add(next);
|
||||
}
|
||||
}
|
||||
|
||||
LOG.debug("Forwarded notification to MARKET_DATA_PUB");
|
||||
publishCallback.accept(frames.toArray(new byte[0][]));
|
||||
LOG.debug("Enqueued notification ({} frames) for MARKET_DATA_PUB", frames.size());
|
||||
}
|
||||
|
||||
LOG.info("Notification forwarder loop stopped");
|
||||
|
||||
@@ -64,8 +64,13 @@ public class HistoryNotificationFunction extends ProcessFunction<OHLCBatchWrappe
|
||||
String status = batch.getStatus();
|
||||
int rowCount = batch.getRowCount();
|
||||
|
||||
LOG.info("Processing OHLCBatch: request_id={}, status={}, rows={}",
|
||||
requestId, status, rowCount);
|
||||
LOG.info("Processing OHLCBatch: request_id={}, status={}, rows={}, isLastPage={}",
|
||||
requestId, status, rowCount, batch.isLastPage());
|
||||
|
||||
// Intermediate pages: data is written to Iceberg but no notification yet
|
||||
if (!batch.isLastPage()) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Determine Iceberg table name based on period
|
||||
String tableName = getIcebergTableName(ticker, periodSeconds);
|
||||
|
||||
@@ -87,7 +87,8 @@ public class OHLCBatchDeserializer implements DeserializationSchema<OHLCBatchWra
|
||||
meta.getEndTime(),
|
||||
status,
|
||||
meta.hasErrorMessage() ? meta.getErrorMessage() : null,
|
||||
rows
|
||||
rows,
|
||||
meta.getIsLastPage()
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
@@ -19,6 +19,7 @@ public class OHLCBatchWrapper implements Serializable {
|
||||
private final String status; // OK, NOT_FOUND, ERROR
|
||||
private final String errorMessage;
|
||||
private final List<OHLCRow> rows;
|
||||
private final boolean isLastPage;
|
||||
|
||||
public OHLCBatchWrapper(
|
||||
String requestId,
|
||||
@@ -29,7 +30,8 @@ public class OHLCBatchWrapper implements Serializable {
|
||||
long endTime,
|
||||
String status,
|
||||
String errorMessage,
|
||||
List<OHLCRow> rows
|
||||
List<OHLCRow> rows,
|
||||
boolean isLastPage
|
||||
) {
|
||||
this.requestId = requestId;
|
||||
this.clientId = clientId;
|
||||
@@ -40,6 +42,7 @@ public class OHLCBatchWrapper implements Serializable {
|
||||
this.status = status;
|
||||
this.errorMessage = errorMessage;
|
||||
this.rows = rows;
|
||||
this.isLastPage = isLastPage;
|
||||
}
|
||||
|
||||
public String getRequestId() {
|
||||
@@ -94,6 +97,10 @@ public class OHLCBatchWrapper implements Serializable {
|
||||
return "OK".equals(status);
|
||||
}
|
||||
|
||||
public boolean isLastPage() {
|
||||
return isLastPage;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "OHLCBatchWrapper{" +
|
||||
@@ -103,6 +110,7 @@ public class OHLCBatchWrapper implements Serializable {
|
||||
", periodSeconds=" + periodSeconds +
|
||||
", status='" + status + '\'' +
|
||||
", rowCount=" + getRowCount() +
|
||||
", isLastPage=" + isLastPage +
|
||||
'}';
|
||||
}
|
||||
|
||||
|
||||
@@ -0,0 +1,73 @@
|
||||
package com.dexorder.flink.publisher;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
/**
|
||||
* A single completed OHLC bar for a given ticker and period.
|
||||
* Output type of RealtimeBarFunction, input type of RealtimeBarPublisher.
|
||||
*/
|
||||
public class RealtimeBar implements Serializable {
|
||||
private static final long serialVersionUID = 1L;
|
||||
|
||||
private String ticker;
|
||||
/** Period in seconds (e.g., 60, 300, 3600) */
|
||||
private int periodSeconds;
|
||||
/** Window start timestamp in milliseconds since epoch */
|
||||
private long windowStartMs;
|
||||
/** Scaled integer price values (same precision as source Tick) */
|
||||
private long open;
|
||||
private long high;
|
||||
private long low;
|
||||
private long close;
|
||||
/** Summed base amount across ticks in this window */
|
||||
private long volume;
|
||||
/** Number of ticks in this window */
|
||||
private int tickCount;
|
||||
|
||||
public RealtimeBar() {}
|
||||
|
||||
public RealtimeBar(String ticker, int periodSeconds, long windowStartMs,
|
||||
long open, long high, long low, long close, long volume, int tickCount) {
|
||||
this.ticker = ticker;
|
||||
this.periodSeconds = periodSeconds;
|
||||
this.windowStartMs = windowStartMs;
|
||||
this.open = open;
|
||||
this.high = high;
|
||||
this.low = low;
|
||||
this.close = close;
|
||||
this.volume = volume;
|
||||
this.tickCount = tickCount;
|
||||
}
|
||||
|
||||
public String getTicker() { return ticker; }
|
||||
public int getPeriodSeconds() { return periodSeconds; }
|
||||
public long getWindowStartMs() { return windowStartMs; }
|
||||
public long getOpen() { return open; }
|
||||
public long getHigh() { return high; }
|
||||
public long getLow() { return low; }
|
||||
public long getClose() { return close; }
|
||||
public long getVolume() { return volume; }
|
||||
public int getTickCount() { return tickCount; }
|
||||
|
||||
public void setTicker(String ticker) { this.ticker = ticker; }
|
||||
public void setPeriodSeconds(int periodSeconds) { this.periodSeconds = periodSeconds; }
|
||||
public void setWindowStartMs(long windowStartMs) { this.windowStartMs = windowStartMs; }
|
||||
public void setOpen(long open) { this.open = open; }
|
||||
public void setHigh(long high) { this.high = high; }
|
||||
public void setLow(long low) { this.low = low; }
|
||||
public void setClose(long close) { this.close = close; }
|
||||
public void setVolume(long volume) { this.volume = volume; }
|
||||
public void setTickCount(int tickCount) { this.tickCount = tickCount; }
|
||||
|
||||
/** ZMQ topic for this bar: e.g., "BTC/USDT.BINANCE|ohlc:60" */
|
||||
public String topic() {
|
||||
return ticker + "|ohlc:" + periodSeconds;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "RealtimeBar{ticker='" + ticker + "', period=" + periodSeconds +
|
||||
"s, windowStart=" + windowStartMs + ", O=" + open + " H=" + high +
|
||||
" L=" + low + " C=" + close + ", ticks=" + tickCount + '}';
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,116 @@
|
||||
package com.dexorder.flink.publisher;
|
||||
|
||||
import org.apache.flink.api.common.functions.RichFlatMapFunction;
|
||||
import org.apache.flink.api.common.state.MapState;
|
||||
import org.apache.flink.api.common.state.MapStateDescriptor;
|
||||
import org.apache.flink.api.common.typeinfo.BasicTypeInfo;
|
||||
import org.apache.flink.api.common.typeinfo.PrimitiveArrayTypeInfo;
|
||||
import org.apache.flink.configuration.Configuration;
|
||||
import org.apache.flink.util.Collector;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
/**
|
||||
* Accumulates ticks into OHLC bars for each configured period.
|
||||
*
|
||||
* Keyed by ticker. Maintains per-period accumulators in MapState.
|
||||
* Uses a "lazy boundary" approach: a new window is detected when a tick arrives after
|
||||
* the previous window's end time (based on processing clock). The completed bar is
|
||||
* emitted immediately when the boundary is crossed, so bars are delayed by at most
|
||||
* one tick interval (~10s for realtime polling).
|
||||
*
|
||||
* Periods are configurable at construction time. All configured periods are computed
|
||||
* for every ticker receiving ticks; the ZMQ publisher filters to active subscriptions.
|
||||
*
|
||||
* Accumulator layout (long[7]):
|
||||
* [0] open
|
||||
* [1] high
|
||||
* [2] low
|
||||
* [3] close
|
||||
* [4] volume (sum of base amount)
|
||||
* [5] windowStartMs (epoch ms)
|
||||
* [6] tickCount
|
||||
*/
|
||||
public class RealtimeBarFunction extends RichFlatMapFunction<TickWrapper, RealtimeBar> {
|
||||
private static final Logger LOG = LoggerFactory.getLogger(RealtimeBarFunction.class);
|
||||
private static final long serialVersionUID = 1L;
|
||||
|
||||
private final int[] periods;
|
||||
private transient MapState<Integer, long[]> accumState;
|
||||
|
||||
/**
|
||||
* @param periods Period lengths in seconds (e.g., 60, 300, 900, 3600)
|
||||
*/
|
||||
public RealtimeBarFunction(int[] periods) {
|
||||
this.periods = periods;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void open(Configuration parameters) {
|
||||
MapStateDescriptor<Integer, long[]> desc = new MapStateDescriptor<>(
|
||||
"ohlcAccum",
|
||||
BasicTypeInfo.INT_TYPE_INFO,
|
||||
PrimitiveArrayTypeInfo.LONG_PRIMITIVE_ARRAY_TYPE_INFO
|
||||
);
|
||||
accumState = getRuntimeContext().getMapState(desc);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void flatMap(TickWrapper tick, Collector<RealtimeBar> out) throws Exception {
|
||||
if (tick == null) return;
|
||||
|
||||
long nowMs = System.currentTimeMillis();
|
||||
|
||||
for (int period : periods) {
|
||||
long periodMs = period * 1000L;
|
||||
long windowStart = (nowMs / periodMs) * periodMs;
|
||||
|
||||
long[] accum = accumState.get(period);
|
||||
|
||||
if (accum == null) {
|
||||
// First tick for this period
|
||||
accumState.put(period, openWindow(tick, windowStart));
|
||||
|
||||
} else if (accum[5] != windowStart) {
|
||||
// Window boundary crossed — emit completed bar then start fresh
|
||||
if (accum[6] > 0) {
|
||||
out.collect(toBar(tick.getTicker(), period, accum));
|
||||
LOG.debug("Emitted bar: ticker={}, period={}s, windowStart={}, ticks={}",
|
||||
tick.getTicker(), period, accum[5], accum[6]);
|
||||
}
|
||||
accumState.put(period, openWindow(tick, windowStart));
|
||||
|
||||
} else {
|
||||
// Same window — update
|
||||
accum[1] = Math.max(accum[1], tick.getPrice()); // high
|
||||
accum[2] = Math.min(accum[2], tick.getPrice()); // low
|
||||
accum[3] = tick.getPrice(); // close
|
||||
accum[4] += tick.getAmount(); // volume
|
||||
accum[6]++; // tick count
|
||||
accumState.put(period, accum);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static long[] openWindow(TickWrapper tick, long windowStart) {
|
||||
return new long[]{
|
||||
tick.getPrice(), // open
|
||||
tick.getPrice(), // high
|
||||
tick.getPrice(), // low
|
||||
tick.getPrice(), // close
|
||||
tick.getAmount(), // volume
|
||||
windowStart,
|
||||
1L // tickCount
|
||||
};
|
||||
}
|
||||
|
||||
private static RealtimeBar toBar(String ticker, int period, long[] accum) {
|
||||
return new RealtimeBar(
|
||||
ticker, period,
|
||||
accum[5], // windowStartMs
|
||||
accum[0], accum[1], accum[2], accum[3], // O H L C
|
||||
accum[4], // volume
|
||||
(int) accum[6] // tickCount
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,91 @@
|
||||
package com.dexorder.flink.publisher;
|
||||
|
||||
import com.dexorder.proto.OHLC;
|
||||
import org.apache.flink.configuration.Configuration;
|
||||
import org.apache.flink.streaming.api.functions.sink.RichSinkFunction;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.zeromq.SocketType;
|
||||
import org.zeromq.ZContext;
|
||||
import org.zeromq.ZMQ;
|
||||
|
||||
/**
|
||||
* Flink sink that publishes completed realtime OHLC bars to clients.
|
||||
*
|
||||
* Connects a ZMQ PUSH socket to the job manager's notification PULL endpoint.
|
||||
* The HistoryNotificationForwarder (already running on the job manager) receives these
|
||||
* frames and enqueues them to RealtimeSubscriptionManager, which publishes them on
|
||||
* the MARKET_DATA_PUB XPUB socket. Clients subscribed to the matching topic receive the bar.
|
||||
*
|
||||
* Wire format (matches HistoryNotificationPublisher):
|
||||
* Frame 1: topic bytes (e.g., "BTC/USDT.BINANCE|ohlc:60")
|
||||
* Frame 2: [0x01] (protocol version)
|
||||
* Frame 3: [0x04][OHLC protobuf bytes] (type 0x04 = OHLC single bar)
|
||||
*
|
||||
* Parallelism MUST be 1 (same as the rest of the notification pipeline).
|
||||
*/
|
||||
public class RealtimeBarPublisher extends RichSinkFunction<RealtimeBar> {
|
||||
private static final Logger LOG = LoggerFactory.getLogger(RealtimeBarPublisher.class);
|
||||
private static final long serialVersionUID = 1L;
|
||||
|
||||
private static final byte PROTOCOL_VERSION = 0x01;
|
||||
private static final byte MSG_TYPE_OHLC = 0x04;
|
||||
|
||||
private final String jobManagerPullEndpoint;
|
||||
|
||||
private transient ZContext context;
|
||||
private transient ZMQ.Socket pushSocket;
|
||||
|
||||
public RealtimeBarPublisher(String jobManagerPullEndpoint) {
|
||||
this.jobManagerPullEndpoint = jobManagerPullEndpoint;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void open(Configuration parameters) {
|
||||
context = new ZContext();
|
||||
pushSocket = context.createSocket(SocketType.PUSH);
|
||||
pushSocket.setLinger(1000);
|
||||
pushSocket.setSndHWM(10000);
|
||||
pushSocket.connect(jobManagerPullEndpoint);
|
||||
LOG.info("RealtimeBarPublisher PUSH connected to {}", jobManagerPullEndpoint);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void invoke(RealtimeBar bar, Context context) {
|
||||
try {
|
||||
// Build OHLC proto — timestamp in nanoseconds (bar uses ms, convert)
|
||||
OHLC ohlc = OHLC.newBuilder()
|
||||
.setTimestamp(bar.getWindowStartMs() * 1_000_000L) // ms → ns
|
||||
.setTicker(bar.getTicker())
|
||||
.setOpen(bar.getOpen())
|
||||
.setHigh(bar.getHigh())
|
||||
.setLow(bar.getLow())
|
||||
.setClose(bar.getClose())
|
||||
.setVolume(bar.getVolume())
|
||||
.build();
|
||||
|
||||
byte[] protoBytes = ohlc.toByteArray();
|
||||
byte[] messageFrame = new byte[1 + protoBytes.length];
|
||||
messageFrame[0] = MSG_TYPE_OHLC;
|
||||
System.arraycopy(protoBytes, 0, messageFrame, 1, protoBytes.length);
|
||||
|
||||
String topic = bar.topic();
|
||||
pushSocket.sendMore(topic.getBytes(ZMQ.CHARSET));
|
||||
pushSocket.sendMore(new byte[]{PROTOCOL_VERSION});
|
||||
pushSocket.send(messageFrame, 0);
|
||||
|
||||
LOG.debug("Published realtime bar: topic={}, ticks={}", topic, bar.getTickCount());
|
||||
|
||||
} catch (Exception e) {
|
||||
LOG.error("Failed to publish realtime bar: ticker={}, period={}",
|
||||
bar.getTicker(), bar.getPeriodSeconds(), e);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() {
|
||||
if (pushSocket != null) pushSocket.close();
|
||||
if (context != null) context.close();
|
||||
LOG.info("RealtimeBarPublisher closed");
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,69 @@
|
||||
package com.dexorder.flink.publisher;
|
||||
|
||||
import com.dexorder.proto.Tick;
|
||||
import org.apache.flink.api.common.serialization.DeserializationSchema;
|
||||
import org.apache.flink.api.common.typeinfo.TypeInformation;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
/**
|
||||
* Kafka deserializer for Tick protobuf messages from the market-tick topic.
|
||||
*
|
||||
* Wire format: [0x01 version][0x03 TICK type][Tick protobuf bytes]
|
||||
*/
|
||||
public class TickDeserializer implements DeserializationSchema<TickWrapper> {
|
||||
private static final Logger LOG = LoggerFactory.getLogger(TickDeserializer.class);
|
||||
private static final long serialVersionUID = 1L;
|
||||
|
||||
private static final byte PROTOCOL_VERSION = 0x01;
|
||||
private static final byte MSG_TYPE_TICK = 0x03;
|
||||
|
||||
@Override
|
||||
public TickWrapper deserialize(byte[] message) throws IOException {
|
||||
try {
|
||||
if (message.length < 2) {
|
||||
throw new IOException("Message too short: " + message.length + " bytes");
|
||||
}
|
||||
|
||||
if (message[0] != PROTOCOL_VERSION) {
|
||||
throw new IOException("Unsupported protocol version: 0x" + Integer.toHexString(message[0] & 0xFF));
|
||||
}
|
||||
|
||||
if (message[1] != MSG_TYPE_TICK) {
|
||||
throw new IOException("Unexpected message type: 0x" + Integer.toHexString(message[1] & 0xFF));
|
||||
}
|
||||
|
||||
byte[] payload = new byte[message.length - 2];
|
||||
System.arraycopy(message, 2, payload, 0, payload.length);
|
||||
|
||||
Tick tick = Tick.parseFrom(payload);
|
||||
|
||||
return new TickWrapper(
|
||||
tick.getTicker(),
|
||||
tick.getTradeId(),
|
||||
tick.getTimestamp(),
|
||||
tick.getPrice(),
|
||||
tick.getAmount(),
|
||||
tick.getQuoteAmount(),
|
||||
tick.getTakerBuy()
|
||||
);
|
||||
|
||||
} catch (Exception e) {
|
||||
LOG.warn("Failed to deserialize Tick, skipping: {}", e.getMessage());
|
||||
// Return null; Flink's KafkaSource skips nulls via filter
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isEndOfStream(TickWrapper nextElement) {
|
||||
return false;
|
||||
}
|
||||
|
||||
@Override
|
||||
public TypeInformation<TickWrapper> getProducedType() {
|
||||
return TypeInformation.of(TickWrapper.class);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,58 @@
|
||||
package com.dexorder.flink.publisher;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
/**
|
||||
* Flink-serializable wrapper for a single Tick.
|
||||
* Fields mirror the Tick protobuf, using primitives to avoid proto-class serialization issues.
|
||||
*/
|
||||
public class TickWrapper implements Serializable {
|
||||
private static final long serialVersionUID = 1L;
|
||||
|
||||
private String ticker;
|
||||
private String tradeId;
|
||||
/** Timestamp in nanoseconds since epoch */
|
||||
private long timestamp;
|
||||
/** Price as scaled integer */
|
||||
private long price;
|
||||
/** Base amount as scaled integer */
|
||||
private long amount;
|
||||
/** Quote amount as scaled integer */
|
||||
private long quoteAmount;
|
||||
private boolean takerBuy;
|
||||
|
||||
public TickWrapper() {}
|
||||
|
||||
public TickWrapper(String ticker, String tradeId, long timestamp,
|
||||
long price, long amount, long quoteAmount, boolean takerBuy) {
|
||||
this.ticker = ticker;
|
||||
this.tradeId = tradeId;
|
||||
this.timestamp = timestamp;
|
||||
this.price = price;
|
||||
this.amount = amount;
|
||||
this.quoteAmount = quoteAmount;
|
||||
this.takerBuy = takerBuy;
|
||||
}
|
||||
|
||||
public String getTicker() { return ticker; }
|
||||
public String getTradeId() { return tradeId; }
|
||||
public long getTimestamp() { return timestamp; }
|
||||
public long getPrice() { return price; }
|
||||
public long getAmount() { return amount; }
|
||||
public long getQuoteAmount() { return quoteAmount; }
|
||||
public boolean isTakerBuy() { return takerBuy; }
|
||||
|
||||
public void setTicker(String ticker) { this.ticker = ticker; }
|
||||
public void setTradeId(String tradeId) { this.tradeId = tradeId; }
|
||||
public void setTimestamp(long timestamp) { this.timestamp = timestamp; }
|
||||
public void setPrice(long price) { this.price = price; }
|
||||
public void setAmount(long amount) { this.amount = amount; }
|
||||
public void setQuoteAmount(long quoteAmount) { this.quoteAmount = quoteAmount; }
|
||||
public void setTakerBuy(boolean takerBuy) { this.takerBuy = takerBuy; }
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "TickWrapper{ticker='" + ticker + "', tradeId='" + tradeId +
|
||||
"', timestamp=" + timestamp + ", price=" + price + '}';
|
||||
}
|
||||
}
|
||||
@@ -13,7 +13,14 @@ import java.util.Map;
|
||||
|
||||
/**
|
||||
* Manages all ZeroMQ channels for the Flink application.
|
||||
* Each channel is bound to a specific port and socket type.
|
||||
*
|
||||
* Port layout:
|
||||
* 5558 XPUB MARKET_DATA_PUB — market data + notifications to clients (via relay XSUB)
|
||||
* XPUB exposes subscription frames so Flink can detect
|
||||
* which realtime topics clients are interested in.
|
||||
* 5561 PULL (internal) — task manager → job manager notifications (unchanged)
|
||||
* 5566 PULL CLIENT_REQUEST — receives forwarded SubmitHistoricalRequest from relay PUSH
|
||||
* 5567 ROUTER INGESTOR_BROKER — exclusive work queue; ingestors connect with DEALER
|
||||
*/
|
||||
public class ZmqChannelManager implements Closeable {
|
||||
private static final Logger LOG = LoggerFactory.getLogger(ZmqChannelManager.class);
|
||||
@@ -23,8 +30,9 @@ public class ZmqChannelManager implements Closeable {
|
||||
private final AppConfig config;
|
||||
|
||||
public enum Channel {
|
||||
INGESTOR_WORK_QUEUE,
|
||||
MARKET_DATA_PUB,
|
||||
CLIENT_REQUEST,
|
||||
INGESTOR_BROKER,
|
||||
}
|
||||
|
||||
public ZmqChannelManager(AppConfig config) {
|
||||
@@ -41,20 +49,33 @@ public class ZmqChannelManager implements Closeable {
|
||||
|
||||
LOG.info("Initializing ZeroMQ channels on {}", bindAddress);
|
||||
|
||||
// 1. Ingestor Work Queue - PUB socket for topic-based work distribution (exchange prefix filtering)
|
||||
// 1. Market Data Publication — XPUB so subscription events are visible to Flink
|
||||
// Relay's XSUB connects here to proxy data to clients.
|
||||
// Subscription frames from relay (forwarded from clients) arrive as readable messages.
|
||||
ZMQ.Socket marketDataSocket = context.createSocket(SocketType.XPUB);
|
||||
marketDataSocket.setXpubVerbose(true); // emit every sub/unsub, not just first/last
|
||||
marketDataSocket.setLinger(1000);
|
||||
marketDataSocket.setSndHWM(10000);
|
||||
marketDataSocket.setRcvHWM(10000);
|
||||
String marketDataEndpoint = bindAddress + ":" + config.getMarketDataPubPort();
|
||||
marketDataSocket.bind(marketDataEndpoint);
|
||||
sockets.put(Channel.MARKET_DATA_PUB.name(), marketDataSocket);
|
||||
LOG.info("Bound Market Data Publication (XPUB) to {}", marketDataEndpoint);
|
||||
|
||||
// 2. Client Request Pull — receives SubmitHistoricalRequest forwarded by relay PUSH
|
||||
createAndBind(
|
||||
Channel.INGESTOR_WORK_QUEUE,
|
||||
SocketType.PUB,
|
||||
bindAddress + ":" + config.getIngestorWorkQueuePort(),
|
||||
"Ingestor Work Queue (PUB)"
|
||||
Channel.CLIENT_REQUEST,
|
||||
SocketType.PULL,
|
||||
bindAddress + ":" + config.getFlinkRequestPullPort(),
|
||||
"Client Request (PULL)"
|
||||
);
|
||||
|
||||
// 2. Market Data Publication - PUB socket for market data streaming and HistoryReadyNotification
|
||||
// 3. Ingestor Broker — ROUTER for exclusive work dispatch to ingestor DEALER workers
|
||||
createAndBind(
|
||||
Channel.MARKET_DATA_PUB,
|
||||
SocketType.PUB,
|
||||
bindAddress + ":" + config.getMarketDataPubPort(),
|
||||
"Market Data Publication (PUB)"
|
||||
Channel.INGESTOR_BROKER,
|
||||
SocketType.ROUTER,
|
||||
bindAddress + ":" + config.getIngestorBrokerPort(),
|
||||
"Ingestor Broker (ROUTER)"
|
||||
);
|
||||
|
||||
LOG.info("All ZeroMQ channels initialized successfully");
|
||||
@@ -63,15 +84,10 @@ public class ZmqChannelManager implements Closeable {
|
||||
private void createAndBind(Channel channel, SocketType socketType, String endpoint, String description) {
|
||||
try {
|
||||
ZMQ.Socket socket = context.createSocket(socketType);
|
||||
|
||||
// Set socket options
|
||||
socket.setLinger(1000); // 1 second linger on close
|
||||
socket.setSndHWM(10000); // High water mark for outbound messages
|
||||
socket.setRcvHWM(10000); // High water mark for inbound messages
|
||||
|
||||
// Bind the socket
|
||||
socket.setLinger(1000);
|
||||
socket.setSndHWM(10000);
|
||||
socket.setRcvHWM(10000);
|
||||
socket.bind(endpoint);
|
||||
|
||||
sockets.put(channel.name(), socket);
|
||||
LOG.info("Bound {} to {}", description, endpoint);
|
||||
} catch (Exception e) {
|
||||
@@ -80,6 +96,13 @@ public class ZmqChannelManager implements Closeable {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a ZMQ Poller backed by this manager's context.
|
||||
*/
|
||||
public ZMQ.Poller createPoller(int size) {
|
||||
return context.getContext().poller(size);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get a socket by channel type.
|
||||
*/
|
||||
@@ -92,18 +115,11 @@ public class ZmqChannelManager implements Closeable {
|
||||
}
|
||||
|
||||
/**
|
||||
* Send a message on the specified channel.
|
||||
*
|
||||
* @param channel The channel to send on
|
||||
* @param versionByte Protocol version byte
|
||||
* @param messageTypeByte Message type ID byte
|
||||
* @param protobufData Serialized protobuf message
|
||||
* @return true if sent successfully
|
||||
* Send a message on a channel (no topic prefix — for PULL/PUSH or direct sends).
|
||||
*/
|
||||
public boolean sendMessage(Channel channel, byte versionByte, byte messageTypeByte, byte[] protobufData) {
|
||||
ZMQ.Socket socket = getSocket(channel);
|
||||
|
||||
// Send as two frames: [version byte] [type byte + protobuf data]
|
||||
boolean sentFrame1 = socket.send(new byte[]{versionByte}, ZMQ.SNDMORE);
|
||||
if (!sentFrame1) {
|
||||
LOG.error("Failed to send version frame on channel {}", channel);
|
||||
@@ -124,27 +140,18 @@ public class ZmqChannelManager implements Closeable {
|
||||
}
|
||||
|
||||
/**
|
||||
* Send a message with a topic prefix (for PUB sockets).
|
||||
*
|
||||
* @param channel The channel to send on
|
||||
* @param topic Topic string for subscription filtering
|
||||
* @param versionByte Protocol version byte
|
||||
* @param messageTypeByte Message type ID byte
|
||||
* @param protobufData Serialized protobuf message
|
||||
* @return true if sent successfully
|
||||
* Send a topic-prefixed message (for XPUB market data publishing).
|
||||
* Frame layout: [topic][version][type+payload]
|
||||
*/
|
||||
public boolean sendTopicMessage(Channel channel, String topic, byte versionByte, byte messageTypeByte, byte[] protobufData) {
|
||||
ZMQ.Socket socket = getSocket(channel);
|
||||
|
||||
// Send as three frames: [topic] [version byte] [type byte + protobuf data]
|
||||
boolean sentTopic = socket.send(topic.getBytes(ZMQ.CHARSET), ZMQ.SNDMORE);
|
||||
if (!sentTopic) {
|
||||
if (!socket.send(topic.getBytes(ZMQ.CHARSET), ZMQ.SNDMORE)) {
|
||||
LOG.error("Failed to send topic frame on channel {}", channel);
|
||||
return false;
|
||||
}
|
||||
|
||||
boolean sentFrame1 = socket.send(new byte[]{versionByte}, ZMQ.SNDMORE);
|
||||
if (!sentFrame1) {
|
||||
if (!socket.send(new byte[]{versionByte}, ZMQ.SNDMORE)) {
|
||||
LOG.error("Failed to send version frame on channel {}", channel);
|
||||
return false;
|
||||
}
|
||||
@@ -153,8 +160,7 @@ public class ZmqChannelManager implements Closeable {
|
||||
frame2[0] = messageTypeByte;
|
||||
System.arraycopy(protobufData, 0, frame2, 1, protobufData.length);
|
||||
|
||||
boolean sentFrame2 = socket.send(frame2, 0);
|
||||
if (!sentFrame2) {
|
||||
if (!socket.send(frame2, 0)) {
|
||||
LOG.error("Failed to send message frame on channel {}", channel);
|
||||
return false;
|
||||
}
|
||||
@@ -162,6 +168,24 @@ public class ZmqChannelManager implements Closeable {
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Send a targeted message to a specific DEALER worker via ROUTER.
|
||||
* Frame layout: [identity][empty][version][type+payload]
|
||||
*/
|
||||
public boolean sendToWorker(byte[] identity, byte versionByte, byte messageTypeByte, byte[] protobufData) {
|
||||
ZMQ.Socket socket = getSocket(Channel.INGESTOR_BROKER);
|
||||
|
||||
if (!socket.send(identity, ZMQ.SNDMORE)) return false;
|
||||
if (!socket.send(new byte[0], ZMQ.SNDMORE)) return false;
|
||||
if (!socket.send(new byte[]{versionByte}, ZMQ.SNDMORE)) return false;
|
||||
|
||||
byte[] frame = new byte[1 + protobufData.length];
|
||||
frame[0] = messageTypeByte;
|
||||
System.arraycopy(protobufData, 0, frame, 1, protobufData.length);
|
||||
|
||||
return socket.send(frame, 0);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() {
|
||||
LOG.info("Closing ZeroMQ channels");
|
||||
|
||||
@@ -72,7 +72,7 @@ export class Authenticator {
|
||||
);
|
||||
}
|
||||
|
||||
const sessionId = `ws_${userId}_${Date.now()}`;
|
||||
const sessionId = `ws_${userId}`;
|
||||
|
||||
return {
|
||||
authContext: {
|
||||
|
||||
@@ -2,12 +2,14 @@ import type { FastifyInstance, FastifyRequest } from 'fastify';
|
||||
import type { WebSocket } from '@fastify/websocket';
|
||||
import type { Authenticator } from '../auth/authenticator.js';
|
||||
import type { AgentHarness, HarnessFactory } from '../harness/agent-harness.js';
|
||||
import type { HarnessEvent } from '../harness/harness-events.js';
|
||||
import type { InboundMessage } from '../types/messages.js';
|
||||
import { randomUUID } from 'crypto';
|
||||
import type { SessionRegistry, EventSubscriber, Session } from '../events/index.js';
|
||||
import type { OHLCService } from '../services/ohlc-service.js';
|
||||
import type { OHLCService, BarUpdateCallback } from '../services/ohlc-service.js';
|
||||
import type { SymbolIndexService } from '../services/symbol-index-service.js';
|
||||
import type { ContainerManager } from '../k8s/container-manager.js';
|
||||
import type { ConversationService } from '../services/conversation-service.js';
|
||||
import {
|
||||
WorkspaceManager,
|
||||
ContainerSync,
|
||||
@@ -42,6 +44,7 @@ export interface WebSocketHandlerConfig {
|
||||
createHarness: HarnessFactory;
|
||||
ohlcService?: OHLCService; // Optional for historical data support
|
||||
symbolIndexService?: SymbolIndexService; // Optional for symbol search
|
||||
conversationService?: ConversationService; // Optional for history replay on reconnect
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -50,10 +53,18 @@ export interface WebSocketHandlerConfig {
|
||||
* Handles WebSocket connections for chat and integrates with the event system
|
||||
* for container-to-client notifications.
|
||||
*/
|
||||
interface BarSubscription {
|
||||
ticker: string;
|
||||
periodSeconds: number;
|
||||
callback: BarUpdateCallback;
|
||||
}
|
||||
|
||||
export class WebSocketHandler {
|
||||
private config: WebSocketHandlerConfig;
|
||||
private harnesses = new Map<string, AgentHarness>();
|
||||
private workspaces = new Map<string, WorkspaceManager>();
|
||||
/** Per-session realtime bar subscriptions for cleanup on disconnect */
|
||||
private barSubscriptions = new Map<string, BarSubscription[]>();
|
||||
|
||||
constructor(config: WebSocketHandlerConfig) {
|
||||
this.config = config;
|
||||
@@ -106,17 +117,22 @@ export class WebSocketHandler {
|
||||
|
||||
// If container is spinning up, wait for it to be ready before continuing
|
||||
if (isSpinningUp) {
|
||||
sendStatus(socket, 'spinning_up', 'Your workspace is starting up, please wait...');
|
||||
sendStatus(socket, 'spinning_up', 'Your personal agent is starting up, please wait...');
|
||||
|
||||
const startupPingInterval = setInterval(() => {
|
||||
if (socket.readyState === 1) socket.ping();
|
||||
}, 10000);
|
||||
|
||||
const ready = await this.config.containerManager.waitForContainerReady(authContext.userId, 120000);
|
||||
clearInterval(startupPingInterval);
|
||||
if (!ready) {
|
||||
logger.warn({ userId: authContext.userId }, 'Container failed to become ready within timeout');
|
||||
socket.send(JSON.stringify({ type: 'error', message: 'Workspace failed to start. Please try again later.' }));
|
||||
logger.warn({ userId: authContext.userId }, 'Sandbox failed to become ready within timeout');
|
||||
socket.send(JSON.stringify({ type: 'error', message: 'Agent workspace failed to start. Please try again later.' }));
|
||||
socket.close(1011, 'Container startup timeout');
|
||||
return;
|
||||
}
|
||||
|
||||
logger.info({ userId: authContext.userId }, 'Container is ready, proceeding with session setup');
|
||||
logger.info({ userId: authContext.userId }, 'Sandbox is ready, proceeding with session setup');
|
||||
}
|
||||
|
||||
sendStatus(socket, 'initializing', 'Starting your workspace...');
|
||||
@@ -241,6 +257,17 @@ export class WebSocketHandler {
|
||||
})
|
||||
);
|
||||
|
||||
// Replay conversation history so the UI pre-populates on reconnect
|
||||
if (this.config.conversationService) {
|
||||
const history = await this.config.conversationService.getHistory(
|
||||
authContext.userId,
|
||||
authContext.sessionId
|
||||
);
|
||||
if (history.length > 0) {
|
||||
socket.send(JSON.stringify({ type: 'conversation_history', messages: history }));
|
||||
}
|
||||
}
|
||||
|
||||
// Handle messages
|
||||
socket.on('message', async (data: Buffer) => {
|
||||
try {
|
||||
@@ -266,15 +293,45 @@ export class WebSocketHandler {
|
||||
return;
|
||||
}
|
||||
|
||||
// Chunks are streamed via channelAdapter.sendChunk() during handleMessage
|
||||
try {
|
||||
// Acknowledge receipt immediately so the client can show the seen indicator
|
||||
socket.send(JSON.stringify({ type: 'agent_chunk', content: '', done: false }));
|
||||
|
||||
logger.info('Calling harness.handleMessage');
|
||||
await harness.handleMessage(inboundMessage);
|
||||
logger.info('Streaming harness response');
|
||||
let fatalError = false;
|
||||
for await (const event of harness.streamMessage(inboundMessage)) {
|
||||
const e = event as HarnessEvent;
|
||||
switch (e.type) {
|
||||
case 'chunk':
|
||||
socket.send(JSON.stringify({ type: 'agent_chunk', content: e.content, done: false }));
|
||||
break;
|
||||
case 'tool_call':
|
||||
socket.send(JSON.stringify({ type: 'agent_tool_call', toolName: e.toolName, label: e.label }));
|
||||
break;
|
||||
case 'subagent_tool_call':
|
||||
socket.send(JSON.stringify({ type: 'subagent_tool_call', agentName: e.agentName, toolName: e.toolName, label: e.label }));
|
||||
break;
|
||||
case 'subagent_chunk':
|
||||
socket.send(JSON.stringify({ type: 'subagent_chunk', agentName: e.agentName, content: e.content }));
|
||||
break;
|
||||
case 'image':
|
||||
socket.send(JSON.stringify({ type: 'image', data: e.data, mimeType: e.mimeType, caption: e.caption }));
|
||||
break;
|
||||
case 'error':
|
||||
socket.send(JSON.stringify({ type: 'text', text: `An unrecoverable error occurred in the ${e.source}.` }));
|
||||
if (e.fatal) fatalError = true;
|
||||
break;
|
||||
case 'done':
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Send done marker after all chunks have been streamed
|
||||
if (fatalError) {
|
||||
socket.close(1011, 'Fatal error');
|
||||
return;
|
||||
}
|
||||
|
||||
// Send done marker after all events have been streamed
|
||||
logger.debug('Sending done marker to client');
|
||||
socket.send(
|
||||
JSON.stringify({
|
||||
@@ -332,6 +389,17 @@ export class WebSocketHandler {
|
||||
await this.config.eventSubscriber.onSessionDisconnect(removedSession);
|
||||
}
|
||||
|
||||
// Cleanup realtime bar subscriptions
|
||||
const sessionId = authContext.sessionId;
|
||||
const subs = this.barSubscriptions.get(sessionId);
|
||||
if (subs && this.config.ohlcService) {
|
||||
for (const { ticker, periodSeconds, callback } of subs) {
|
||||
this.config.ohlcService.unsubscribeFromTicker(ticker, periodSeconds, callback);
|
||||
}
|
||||
this.barSubscriptions.delete(sessionId);
|
||||
logger.info({ sessionId, count: subs.length }, 'Cleaned up realtime bar subscriptions');
|
||||
}
|
||||
|
||||
// Cleanup workspace
|
||||
await workspace!.shutdown();
|
||||
this.workspaces.delete(authContext.sessionId);
|
||||
@@ -356,6 +424,7 @@ export class WebSocketHandler {
|
||||
}, 30000);
|
||||
} catch (error) {
|
||||
logger.error({ error }, 'Failed to initialize session');
|
||||
socket.send(JSON.stringify({ type: 'text', text: 'An unrecoverable error occurred in the agent harness.' }));
|
||||
socket.close(1011, 'Internal server error');
|
||||
if (workspace) {
|
||||
await workspace.shutdown();
|
||||
@@ -527,19 +596,92 @@ export class WebSocketHandler {
|
||||
break;
|
||||
}
|
||||
|
||||
case 'subscribe_bars':
|
||||
case 'unsubscribe_bars':
|
||||
// TODO: Implement real-time subscriptions
|
||||
socket.send(
|
||||
JSON.stringify({
|
||||
type: `${payload.type}_response`,
|
||||
case 'subscribe_bars': {
|
||||
if (!ohlcService || !authContext) {
|
||||
socket.send(JSON.stringify({
|
||||
type: 'subscribe_bars_response',
|
||||
request_id: requestId,
|
||||
subscription_id: payload.subscription_id,
|
||||
success: false,
|
||||
message: 'Real-time subscriptions not yet implemented',
|
||||
})
|
||||
);
|
||||
message: 'Realtime service not available',
|
||||
}));
|
||||
break;
|
||||
}
|
||||
|
||||
const subTicker: string = payload.symbol;
|
||||
const subPeriod: number = payload.period_seconds ?? payload.resolution ?? 60;
|
||||
const sessionId = authContext.sessionId;
|
||||
|
||||
// Create a per-subscription callback that forwards bars to this socket
|
||||
const barCallback: BarUpdateCallback = (bar) => {
|
||||
if (socket.readyState !== 1 /* OPEN */) return;
|
||||
socket.send(JSON.stringify({
|
||||
type: 'bar_update',
|
||||
subscription_id: payload.subscription_id,
|
||||
ticker: bar.ticker,
|
||||
period_seconds: bar.periodSeconds,
|
||||
bar: {
|
||||
// Convert nanoseconds → seconds for client compatibility
|
||||
time: Number(bar.timestamp / 1_000_000_000n),
|
||||
open: bar.open,
|
||||
high: bar.high,
|
||||
low: bar.low,
|
||||
close: bar.close,
|
||||
volume: bar.volume,
|
||||
},
|
||||
}));
|
||||
};
|
||||
|
||||
ohlcService.subscribeToTicker(subTicker, subPeriod, barCallback);
|
||||
|
||||
// Track for cleanup on disconnect
|
||||
if (!this.barSubscriptions.has(sessionId)) {
|
||||
this.barSubscriptions.set(sessionId, []);
|
||||
}
|
||||
this.barSubscriptions.get(sessionId)!.push({
|
||||
ticker: subTicker,
|
||||
periodSeconds: subPeriod,
|
||||
callback: barCallback,
|
||||
});
|
||||
|
||||
logger.info({ sessionId, ticker: subTicker, period: subPeriod }, 'Subscribed to realtime bars');
|
||||
|
||||
socket.send(JSON.stringify({
|
||||
type: 'subscribe_bars_response',
|
||||
request_id: requestId,
|
||||
subscription_id: payload.subscription_id,
|
||||
success: true,
|
||||
}));
|
||||
break;
|
||||
}
|
||||
|
||||
case 'unsubscribe_bars': {
|
||||
if (!ohlcService || !authContext) break;
|
||||
|
||||
const unsubTicker: string = payload.symbol;
|
||||
const unsubPeriod: number = payload.period_seconds ?? payload.resolution ?? 60;
|
||||
const sessionId = authContext.sessionId;
|
||||
|
||||
const subs = this.barSubscriptions.get(sessionId);
|
||||
if (subs) {
|
||||
const idx = subs.findIndex(
|
||||
s => s.ticker === unsubTicker && s.periodSeconds === unsubPeriod
|
||||
);
|
||||
if (idx >= 0) {
|
||||
const [removed] = subs.splice(idx, 1);
|
||||
ohlcService.unsubscribeFromTicker(unsubTicker, unsubPeriod, removed.callback);
|
||||
logger.info({ sessionId, ticker: unsubTicker, period: unsubPeriod }, 'Unsubscribed from realtime bars');
|
||||
}
|
||||
}
|
||||
|
||||
socket.send(JSON.stringify({
|
||||
type: 'unsubscribe_bars_response',
|
||||
request_id: requestId,
|
||||
subscription_id: payload.subscription_id,
|
||||
success: true,
|
||||
}));
|
||||
break;
|
||||
}
|
||||
|
||||
case 'evaluate_indicator': {
|
||||
// Direct MCP call — bypasses the agent/LLM for performance
|
||||
|
||||
@@ -632,6 +632,118 @@ export class DuckDBClient {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Append a batch of image/audio blobs as a Parquet file in S3.
|
||||
* Called once per assistant turn that produces binary output.
|
||||
*/
|
||||
async appendBlobs(
|
||||
userId: string,
|
||||
sessionId: string,
|
||||
messageId: string,
|
||||
blobs: Array<{
|
||||
id: string;
|
||||
user_id: string;
|
||||
session_id: string;
|
||||
message_id: string;
|
||||
blob_type: string;
|
||||
mime_type: string;
|
||||
data: string;
|
||||
caption: string | null;
|
||||
timestamp: number;
|
||||
}>
|
||||
): Promise<void> {
|
||||
await this.initialize();
|
||||
|
||||
if (!this.conversationsBucket || blobs.length === 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
const now = new Date();
|
||||
const year = now.getUTCFullYear();
|
||||
const month = String(now.getUTCMonth() + 1).padStart(2, '0');
|
||||
const s3Path = `s3://${this.conversationsBucket}/gateway/blobs/year=${year}/month=${month}/user_id=${userId}/${sessionId}_${messageId}.parquet`;
|
||||
const tempTable = `blob_flush_${Date.now()}`;
|
||||
|
||||
try {
|
||||
await this.query(`
|
||||
CREATE TEMP TABLE ${tempTable} (
|
||||
id VARCHAR,
|
||||
user_id VARCHAR,
|
||||
session_id VARCHAR,
|
||||
message_id VARCHAR,
|
||||
blob_type VARCHAR,
|
||||
mime_type VARCHAR,
|
||||
data VARCHAR,
|
||||
caption VARCHAR,
|
||||
timestamp BIGINT
|
||||
)
|
||||
`);
|
||||
|
||||
for (const blob of blobs) {
|
||||
await this.query(
|
||||
`INSERT INTO ${tempTable} VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`,
|
||||
[blob.id, blob.user_id, blob.session_id, blob.message_id, blob.blob_type, blob.mime_type, blob.data, blob.caption, blob.timestamp]
|
||||
);
|
||||
}
|
||||
|
||||
await this.query(`COPY ${tempTable} TO '${s3Path}' (FORMAT PARQUET)`);
|
||||
this.logger.info({ userId, sessionId, messageId, count: blobs.length, s3Path }, 'Blobs flushed to Parquet');
|
||||
} finally {
|
||||
await this.query(`DROP TABLE IF EXISTS ${tempTable}`).catch(() => {});
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Query blobs from S3 by userId/sessionId, optionally filtered to specific blob IDs.
|
||||
*/
|
||||
async queryBlobs(
|
||||
userId: string,
|
||||
sessionId: string,
|
||||
blobIds?: string[]
|
||||
): Promise<any[]> {
|
||||
await this.initialize();
|
||||
|
||||
try {
|
||||
const tablePath = await this.getTablePath(this.namespace, 'blobs', this.catalogUri);
|
||||
|
||||
if (!tablePath) {
|
||||
// Fallback: scan per-turn Parquet files written directly to S3
|
||||
if (this.conversationsBucket) {
|
||||
this.logger.debug({ userId, sessionId }, 'REST catalog miss, scanning blob Parquet files');
|
||||
const parquetPath = `s3://${this.conversationsBucket}/gateway/blobs/**/user_id=${userId}/${sessionId}_*.parquet`;
|
||||
const idClause = blobIds?.length
|
||||
? `WHERE id IN (${blobIds.map(id => `'${id.replace(/'/g, "''")}'`).join(', ')})`
|
||||
: '';
|
||||
try {
|
||||
return await this.query(`SELECT * FROM read_parquet('${parquetPath}') ${idClause} ORDER BY timestamp ASC`);
|
||||
} catch {
|
||||
// No blobs yet for this session
|
||||
}
|
||||
}
|
||||
return [];
|
||||
}
|
||||
|
||||
const idFilter = blobIds?.length
|
||||
? `AND id IN (${blobIds.map(() => '?').join(', ')})`
|
||||
: '';
|
||||
const params: any[] = [userId, sessionId, ...(blobIds ?? [])];
|
||||
|
||||
const sql = `
|
||||
SELECT id, user_id, session_id, message_id, blob_type, mime_type, data, caption, timestamp
|
||||
FROM iceberg_scan('${tablePath}')
|
||||
WHERE user_id = ? AND session_id = ? ${idFilter}
|
||||
ORDER BY timestamp ASC
|
||||
`;
|
||||
|
||||
const rows = await this.query(sql, params);
|
||||
this.logger.info({ userId, sessionId, count: rows.length }, 'Loaded blobs from Iceberg');
|
||||
return rows.map((row: any) => ({ ...row, timestamp: Number(row.timestamp) }));
|
||||
} catch (error: any) {
|
||||
this.logger.error({ error: error.message, userId, sessionId }, 'Failed to query blobs');
|
||||
return [];
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Close the DuckDB connection
|
||||
*/
|
||||
|
||||
@@ -45,6 +45,21 @@ export interface IcebergMessage {
|
||||
timestamp: number; // nanoseconds
|
||||
}
|
||||
|
||||
/**
 * Blob record for Iceberg storage (images, audio, etc.)
 *
 * One row per binary artifact produced in a conversation turn; the payload
 * itself is carried inline as base64 in `data`.
 */
export interface IcebergBlob {
  /** Unique blob id */
  id: string;
  user_id: string;
  session_id: string;
  /** Message (turn) this blob belongs to */
  message_id: string;
  // NOTE(review): exact blob_type values (e.g. 'image', 'audio') are not
  // visible here — confirm against the producers before relying on them.
  blob_type: string;
  mime_type: string;
  data: string; // base64
  caption: string | null;
  timestamp: number; // microseconds
}
|
||||
|
||||
/**
|
||||
* Checkpoint record for Iceberg storage
|
||||
*/
|
||||
@@ -153,6 +168,25 @@ export class IcebergClient {
|
||||
return this.duckdb.appendMessages(userId, sessionId, messages);
|
||||
}
|
||||
|
||||
/**
|
||||
* Append blobs for one assistant turn as a Parquet file in S3.
|
||||
*/
|
||||
async appendBlobs(
|
||||
userId: string,
|
||||
sessionId: string,
|
||||
messageId: string,
|
||||
blobs: IcebergBlob[]
|
||||
): Promise<void> {
|
||||
return this.duckdb.appendBlobs(userId, sessionId, messageId, blobs);
|
||||
}
|
||||
|
||||
/**
|
||||
* Query blobs from S3/Iceberg, optionally filtered to specific blob IDs.
|
||||
*/
|
||||
async queryBlobs(userId: string, sessionId: string, blobIds?: string[]): Promise<IcebergBlob[]> {
|
||||
return this.duckdb.queryBlobs(userId, sessionId, blobIds);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get table metadata
|
||||
*/
|
||||
|
||||
@@ -298,6 +298,13 @@ export class QdrantClient {
|
||||
pointsCount: info.points_count || 0,
|
||||
};
|
||||
} catch (error) {
|
||||
// If the collection was lost (e.g. Qdrant restarted without the gateway restarting),
|
||||
// recreate it and return zeroed stats rather than propagating the error.
|
||||
if ((error as any)?.status === 404) {
|
||||
this.logger.warn({ collection: this.collectionName }, 'Collection missing, recreating...');
|
||||
await this.initialize();
|
||||
return { vectorsCount: 0, indexedVectorsCount: 0, pointsCount: 0 };
|
||||
}
|
||||
this.logger.error({ error }, 'Failed to get collection info');
|
||||
throw error;
|
||||
}
|
||||
|
||||
@@ -20,6 +20,22 @@ import type {
|
||||
NotificationStatus,
|
||||
} from '../types/ohlc.js';
|
||||
|
||||
/** Matches realtime bar topics "{ticker}|ohlc:{periodSeconds}" — capture 1 = ticker, capture 2 = period in seconds. */
export const OHLC_BAR_TOPIC_PATTERN = /^(.+)\|ohlc:(\d+)$/;

/** Decoded realtime OHLC bar received from the XPUB market data stream */
export interface RealtimeBar {
  /** Raw ZMQ topic the bar arrived on */
  topic: string; // e.g., "BTC/USDT.BINANCE|ohlc:60"
  ticker: string; // e.g., "BTC/USDT.BINANCE"
  /** Bar aggregation window length, in seconds */
  periodSeconds: number;
  /** Window open time in nanoseconds since epoch */
  timestamp: bigint;
  open: number;
  high: number;
  low: number;
  close: number;
  volume: number;
}
|
||||
|
||||
const __filename = fileURLToPath(import.meta.url);
|
||||
const __dirname = dirname(__filename);
|
||||
|
||||
@@ -39,14 +55,17 @@ export enum MessageType {
|
||||
const protoDir = join(__dirname, '../..', 'protobuf');
|
||||
const root = new protobuf.Root();
|
||||
|
||||
// Load proto file and parse it
|
||||
// Load proto files
|
||||
const ingestorProto = readFileSync(join(protoDir, 'ingestor.proto'), 'utf8');
|
||||
const ohlcProto = readFileSync(join(protoDir, 'ohlc.proto'), 'utf8');
|
||||
protobuf.parse(ingestorProto, root);
|
||||
protobuf.parse(ohlcProto, root);
|
||||
|
||||
// Export message types
|
||||
const SubmitHistoricalRequestType = root.lookupType('SubmitHistoricalRequest');
|
||||
const SubmitResponseType = root.lookupType('SubmitResponse');
|
||||
const HistoryReadyNotificationType = root.lookupType('HistoryReadyNotification');
|
||||
const OHLCType = root.lookupType('OHLC');
|
||||
|
||||
/**
|
||||
* Encode SubmitHistoricalRequest to ZMQ frames
|
||||
@@ -178,3 +197,39 @@ export function decodeHistoryReadyNotification(frames: Buffer[]): HistoryReadyNo
|
||||
completed_at: BigInt(payload.completedAt),
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Decode a realtime OHLC bar from ZMQ SUB frames.
|
||||
* Frame layout: [topic][version][0x04 OHLC type + OHLC protobuf bytes]
|
||||
*
|
||||
* Returns null if the topic doesn't match the realtime bar pattern or decoding fails.
|
||||
*/
|
||||
export function decodeRealtimeBar(frames: Buffer[]): RealtimeBar | null {
|
||||
if (frames.length < 3) return null;
|
||||
|
||||
const topic = frames[0].toString();
|
||||
const match = OHLC_BAR_TOPIC_PATTERN.exec(topic);
|
||||
if (!match) return null;
|
||||
|
||||
const ticker = match[1];
|
||||
const periodSeconds = parseInt(match[2], 10);
|
||||
|
||||
const messageFrame = frames[2];
|
||||
if (messageFrame[0] !== 0x04) return null; // Must be OHLC type
|
||||
|
||||
const payloadBuffer = messageFrame.slice(1);
|
||||
const decoded = OHLCType.decode(payloadBuffer);
|
||||
const ohlc = OHLCType.toObject(decoded, { longs: String, defaults: true });
|
||||
|
||||
return {
|
||||
topic,
|
||||
ticker,
|
||||
periodSeconds,
|
||||
timestamp: BigInt(ohlc.timestamp ?? '0'),
|
||||
open: Number(ohlc.open ?? 0),
|
||||
high: Number(ohlc.high ?? 0),
|
||||
low: Number(ohlc.low ?? 0),
|
||||
close: Number(ohlc.close ?? 0),
|
||||
volume: Number(ohlc.volume ?? 0),
|
||||
};
|
||||
}
|
||||
|
||||
@@ -17,6 +17,9 @@ import {
|
||||
encodeSubmitHistoricalRequest,
|
||||
decodeSubmitResponse,
|
||||
decodeHistoryReadyNotification,
|
||||
decodeRealtimeBar,
|
||||
OHLC_BAR_TOPIC_PATTERN,
|
||||
type RealtimeBar,
|
||||
} from './zmq-protocol.js';
|
||||
import type {
|
||||
SubmitHistoricalRequest,
|
||||
@@ -27,6 +30,9 @@ import {
|
||||
NotificationStatus,
|
||||
} from '../types/ohlc.js';
|
||||
|
||||
export type BarUpdateCallback = (bar: RealtimeBar) => void;
|
||||
export type { RealtimeBar };
|
||||
|
||||
export interface ZMQRelayConfig {
|
||||
relayRequestEndpoint: string; // e.g., "tcp://relay:5559"
|
||||
relayNotificationEndpoint: string; // e.g., "tcp://relay:5558"
|
||||
@@ -57,6 +63,12 @@ export class ZMQRelayClient {
|
||||
private notificationTopic: string;
|
||||
private pendingRequests: Map<string, PendingRequest> = new Map();
|
||||
|
||||
/** Ref count per ZMQ topic (gateway-level dedup before ZMQ subscribe/unsubscribe) */
|
||||
private topicRefs: Map<string, number> = new Map();
|
||||
|
||||
/** Callbacks registered by WebSocket sessions for realtime bar updates */
|
||||
private barCallbacks: Map<string, Set<BarUpdateCallback>> = new Map();
|
||||
|
||||
private connected = false;
|
||||
private notificationListenerRunning = false;
|
||||
|
||||
@@ -253,8 +265,6 @@ export class ZMQRelayClient {
|
||||
// Handle metadata update notifications
|
||||
if (topic === 'METADATA_UPDATE') {
|
||||
this.logger.info('Received METADATA_UPDATE notification');
|
||||
|
||||
// Call the onMetadataUpdate callback if configured
|
||||
if (this.config.onMetadataUpdate) {
|
||||
try {
|
||||
await this.config.onMetadataUpdate();
|
||||
@@ -265,6 +275,20 @@ export class ZMQRelayClient {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Handle realtime OHLC bar updates (topic pattern: "{ticker}|ohlc:{period}")
|
||||
if (OHLC_BAR_TOPIC_PATTERN.test(topic)) {
|
||||
const bar = decodeRealtimeBar(Array.from(frames));
|
||||
if (bar) {
|
||||
const callbacks = this.barCallbacks.get(topic);
|
||||
if (callbacks) {
|
||||
for (const cb of callbacks) {
|
||||
try { cb(bar); } catch (e) { /* ignore callback errors */ }
|
||||
}
|
||||
}
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
// Handle history ready notifications
|
||||
const notification = decodeHistoryReadyNotification(Array.from(frames));
|
||||
|
||||
@@ -308,6 +332,69 @@ export class ZMQRelayClient {
|
||||
this.logger.debug('Notification listener started');
|
||||
}
|
||||
|
||||
/**
|
||||
* Subscribe to realtime OHLC bars for a ticker+period.
|
||||
*
|
||||
* ZMQ subscribe is only called on the 0→1 transition (first subscriber).
|
||||
* This triggers the relay XPUB → Flink subscription detection → ingestor activation.
|
||||
*
|
||||
* @param callback Called whenever a new bar arrives for this topic
|
||||
*/
|
||||
subscribeToTicker(ticker: string, periodSeconds: number, callback: BarUpdateCallback): void {
|
||||
const topic = `${ticker}|ohlc:${periodSeconds}`;
|
||||
|
||||
// Register callback
|
||||
if (!this.barCallbacks.has(topic)) {
|
||||
this.barCallbacks.set(topic, new Set());
|
||||
}
|
||||
this.barCallbacks.get(topic)!.add(callback);
|
||||
|
||||
// ZMQ subscribe on first ref
|
||||
const prev = this.topicRefs.get(topic) ?? 0;
|
||||
this.topicRefs.set(topic, prev + 1);
|
||||
if (prev === 0 && this.subSocket) {
|
||||
this.subSocket.subscribe(topic);
|
||||
this.logger.info({ topic }, 'ZMQ subscribed to realtime topic');
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Unsubscribe a callback from realtime OHLC bars.
|
||||
* ZMQ unsubscribe is only called on the 1→0 transition (last subscriber).
|
||||
*/
|
||||
unsubscribeFromTicker(ticker: string, periodSeconds: number, callback: BarUpdateCallback): void {
|
||||
const topic = `${ticker}|ohlc:${periodSeconds}`;
|
||||
|
||||
const callbacks = this.barCallbacks.get(topic);
|
||||
if (callbacks) {
|
||||
callbacks.delete(callback);
|
||||
if (callbacks.size === 0) {
|
||||
this.barCallbacks.delete(topic);
|
||||
}
|
||||
}
|
||||
|
||||
const prev = this.topicRefs.get(topic) ?? 0;
|
||||
if (prev <= 1) {
|
||||
this.topicRefs.delete(topic);
|
||||
if (this.subSocket) {
|
||||
this.subSocket.unsubscribe(topic);
|
||||
this.logger.info({ topic }, 'ZMQ unsubscribed from realtime topic');
|
||||
}
|
||||
} else {
|
||||
this.topicRefs.set(topic, prev - 1);
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Remove all subscriptions for a set of (topic, callback) pairs.
 * Convenience method for WebSocket disconnect cleanup.
 */
cleanupSubscriptions(subscriptions: Array<{ ticker: string; periodSeconds: number; callback: BarUpdateCallback }>): void {
  subscriptions.forEach(sub =>
    this.unsubscribeFromTicker(sub.ticker, sub.periodSeconds, sub.callback)
  );
}
|
||||
|
||||
/**
|
||||
* Close the client and cleanup resources
|
||||
*/
|
||||
|
||||
@@ -4,6 +4,7 @@ import type { FastifyBaseLogger } from 'fastify';
|
||||
import type { License } from '../types/user.js';
|
||||
import { ChannelType } from '../types/user.js';
|
||||
import type { ConversationStore } from './memory/conversation-store.js';
|
||||
import type { BlobStore } from './memory/blob-store.js';
|
||||
import type { InboundMessage, OutboundMessage } from '../types/messages.js';
|
||||
import { MCPClientConnector } from './mcp-client.js';
|
||||
import { LLMProviderFactory, type ProviderConfig } from '../llm/provider.js';
|
||||
@@ -14,13 +15,16 @@ import type { ChannelAdapter, PathTriggerContext } from '../workspace/index.js';
|
||||
import type { ResearchSubagent } from './subagents/research/index.js';
|
||||
import type { IndicatorSubagent } from './subagents/indicator/index.js';
|
||||
import type { WebExploreSubagent } from './subagents/web-explore/index.js';
|
||||
import type { StrategySubagent } from './subagents/strategy/index.js';
|
||||
import type { DynamicStructuredTool } from '@langchain/core/tools';
|
||||
import { getToolRegistry } from '../tools/tool-registry.js';
|
||||
import type { MCPToolInfo } from '../tools/mcp/mcp-tool-wrapper.js';
|
||||
import { createResearchAgentTool } from '../tools/platform/research-agent.tool.js';
|
||||
import { createIndicatorAgentTool } from '../tools/platform/indicator-agent.tool.js';
|
||||
import { createWebExploreAgentTool } from '../tools/platform/web-explore-agent.tool.js';
|
||||
import { createStrategyAgentTool } from '../tools/platform/strategy-agent.tool.js';
|
||||
import { createUserContext } from './memory/session-context.js';
|
||||
import type { HarnessEvent } from './harness-events.js';
|
||||
import { readFile } from 'fs/promises';
|
||||
import { join, dirname } from 'path';
|
||||
import { fileURLToPath } from 'url';
|
||||
@@ -54,10 +58,12 @@ export type HarnessFactory = (sessionConfig: HarnessSessionConfig) => AgentHarne
|
||||
export interface AgentHarnessConfig extends HarnessSessionConfig {
|
||||
providerConfig: ProviderConfig;
|
||||
conversationStore?: ConversationStore;
|
||||
blobStore?: BlobStore;
|
||||
historyLimit: number;
|
||||
researchSubagent?: ResearchSubagent;
|
||||
indicatorSubagent?: IndicatorSubagent;
|
||||
webExploreSubagent?: WebExploreSubagent;
|
||||
strategySubagent?: StrategySubagent;
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -87,6 +93,8 @@ export class AgentHarness {
|
||||
private conversationStore?: ConversationStore;
|
||||
private indicatorSubagent?: IndicatorSubagent;
|
||||
private webExploreSubagent?: WebExploreSubagent;
|
||||
private strategySubagent?: StrategySubagent;
|
||||
private blobStore?: BlobStore;
|
||||
private abortController: AbortController | null = null;
|
||||
|
||||
constructor(config: AgentHarnessConfig) {
|
||||
@@ -96,10 +104,12 @@ export class AgentHarness {
|
||||
this.researchSubagent = config.researchSubagent;
|
||||
this.indicatorSubagent = config.indicatorSubagent;
|
||||
this.webExploreSubagent = config.webExploreSubagent;
|
||||
this.strategySubagent = config.strategySubagent;
|
||||
|
||||
this.modelFactory = new LLMProviderFactory(config.providerConfig, config.logger);
|
||||
this.modelRouter = new ModelRouter(this.modelFactory, config.logger);
|
||||
this.conversationStore = config.conversationStore;
|
||||
this.blobStore = config.blobStore;
|
||||
|
||||
this.mcpClient = new MCPClientConnector({
|
||||
userId: config.userId,
|
||||
@@ -419,17 +429,75 @@ export class AgentHarness {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Initialize strategy subagent
|
||||
*/
|
||||
private async initializeStrategySubagent(): Promise<void> {
|
||||
if (this.strategySubagent) {
|
||||
this.config.logger.debug('Strategy subagent already provided');
|
||||
return;
|
||||
}
|
||||
|
||||
this.config.logger.debug('Creating strategy subagent for session');
|
||||
|
||||
try {
|
||||
const { createStrategySubagent } = await import('./subagents/strategy/index.js');
|
||||
|
||||
const { model } = await this.modelRouter.route(
|
||||
'trading strategy writing and backtesting',
|
||||
this.config.license,
|
||||
RoutingStrategy.COMPLEXITY,
|
||||
this.config.userId
|
||||
);
|
||||
|
||||
const toolRegistry = getToolRegistry();
|
||||
const strategyTools = await toolRegistry.getToolsForAgent(
|
||||
'strategy',
|
||||
this.mcpClient,
|
||||
this.availableMCPTools,
|
||||
this.workspaceManager,
|
||||
undefined,
|
||||
undefined
|
||||
);
|
||||
|
||||
const strategySubagentPath = join(__dirname, 'subagents', 'strategy');
|
||||
this.config.logger.debug({ strategySubagentPath }, 'Using strategy subagent path');
|
||||
|
||||
this.strategySubagent = await createStrategySubagent(
|
||||
model,
|
||||
this.config.logger,
|
||||
strategySubagentPath,
|
||||
this.mcpClient,
|
||||
strategyTools
|
||||
);
|
||||
|
||||
this.config.logger.info(
|
||||
{
|
||||
toolCount: strategyTools.length,
|
||||
toolNames: strategyTools.map(t => t.name),
|
||||
},
|
||||
'Strategy subagent created successfully'
|
||||
);
|
||||
} catch (error) {
|
||||
this.config.logger.error(
|
||||
{ error, errorMessage: (error as Error).message, stack: (error as Error).stack },
|
||||
'Failed to create strategy subagent'
|
||||
);
|
||||
// Don't throw — strategy subagent is optional
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Execute model with tool calling loop
|
||||
* Handles multi-turn tool calls until the model produces a final text response
|
||||
*/
|
||||
private async executeWithToolCalling(
|
||||
private async *executeWithToolCalling(
|
||||
model: any,
|
||||
messages: BaseMessage[],
|
||||
tools: DynamicStructuredTool[],
|
||||
maxIterations: number = 2,
|
||||
signal?: AbortSignal
|
||||
): Promise<string> {
|
||||
): AsyncGenerator<HarnessEvent> {
|
||||
this.config.logger.info(
|
||||
{ toolCount: tools.length, maxIterations },
|
||||
'Starting tool calling loop'
|
||||
@@ -437,6 +505,8 @@ export class AgentHarness {
|
||||
|
||||
const messagesCopy = [...messages];
|
||||
let iterations = 0;
|
||||
// Track last char of last yielded text chunk to detect missing spaces between tokens
|
||||
let lastChunkTail = '';
|
||||
|
||||
while (iterations < maxIterations) {
|
||||
if (signal?.aborted) break;
|
||||
@@ -455,15 +525,24 @@ export class AgentHarness {
|
||||
try {
|
||||
const stream = await model.stream(messagesCopy, { signal });
|
||||
for await (const chunk of stream) {
|
||||
const contents: string[] = [];
|
||||
if (typeof chunk.content === 'string' && chunk.content.length > 0) {
|
||||
this.channelAdapter?.sendChunk(chunk.content);
|
||||
contents.push(chunk.content);
|
||||
} else if (Array.isArray(chunk.content)) {
|
||||
for (const block of chunk.content) {
|
||||
if (block.type === 'text' && block.text) {
|
||||
this.channelAdapter?.sendChunk(block.text);
|
||||
}
|
||||
if (block.type === 'text' && block.text) contents.push(block.text);
|
||||
}
|
||||
}
|
||||
for (const content of contents) {
|
||||
// DeepInfra/GLM streams tokens without leading spaces; inject one when
|
||||
// both the tail of the previous chunk and the head of this chunk are
|
||||
// word characters (\w), which would otherwise merge two words.
|
||||
if (lastChunkTail && /\w/.test(lastChunkTail) && /\w/.test(content[0])) {
|
||||
yield { type: 'chunk', content: ' ' };
|
||||
}
|
||||
lastChunkTail = content[content.length - 1];
|
||||
yield { type: 'chunk', content };
|
||||
}
|
||||
response = response ? response.concat(chunk) : chunk;
|
||||
}
|
||||
} catch (invokeError: any) {
|
||||
@@ -486,6 +565,8 @@ export class AgentHarness {
|
||||
contentLength: typeof response.content === 'string' ? response.content.length : 0,
|
||||
hasToolCalls: !!response.tool_calls,
|
||||
toolCallCount: response.tool_calls?.length || 0,
|
||||
usageMetadata: (response as any).usage_metadata,
|
||||
finishReason: (response as any).response_metadata?.finish_reason,
|
||||
},
|
||||
'Model response received'
|
||||
);
|
||||
@@ -508,7 +589,8 @@ export class AgentHarness {
|
||||
{ finalContentLength: finalContent.length, iterations },
|
||||
'Tool calling loop complete - no more tool calls'
|
||||
);
|
||||
return finalContent;
|
||||
yield { type: 'done', content: finalContent };
|
||||
return;
|
||||
}
|
||||
|
||||
this.config.logger.info(
|
||||
@@ -540,11 +622,32 @@ export class AgentHarness {
|
||||
}
|
||||
|
||||
try {
|
||||
this.channelAdapter?.sendToolCall?.(toolCall.name, this.getToolLabel(toolCall.name));
|
||||
const result = await tool.func(toolCall.args);
|
||||
yield { type: 'tool_call', toolName: toolCall.name, label: this.getToolLabel(toolCall.name) };
|
||||
|
||||
// Process result to extract images and send them via channel adapter
|
||||
const processedResult = this.processToolResult(result, toolCall.name);
|
||||
// Use streamFunc when available (subagent tools) to forward intermediate events inline
|
||||
let result: string;
|
||||
const streamFunc = (tool as any).streamFunc as ((args: any, signal?: AbortSignal) => AsyncGenerator<import('./harness-events.js').HarnessEvent, string>) | undefined;
|
||||
if (streamFunc) {
|
||||
const gen = streamFunc(toolCall.args, signal);
|
||||
let next = await gen.next();
|
||||
while (!next.done) {
|
||||
if (signal?.aborted) {
|
||||
gen.return?.('');
|
||||
break;
|
||||
}
|
||||
yield next.value;
|
||||
next = await gen.next();
|
||||
}
|
||||
result = next.done ? next.value : '';
|
||||
} else {
|
||||
result = await tool.func(toolCall.args);
|
||||
}
|
||||
|
||||
// Extract images from result and yield them; get text-only version for LLM
|
||||
const { cleanedResult: processedResult, images } = this.extractImagesFromToolResult(result, toolCall.name);
|
||||
for (const img of images) {
|
||||
yield { type: 'image', data: img.data, mimeType: img.mimeType, caption: img.caption };
|
||||
}
|
||||
|
||||
this.config.logger.debug(
|
||||
{
|
||||
@@ -567,6 +670,12 @@ export class AgentHarness {
|
||||
'Tool execution completed'
|
||||
);
|
||||
} catch (error) {
|
||||
// Clean stop — abort signal fired during tool execution; exit without error message
|
||||
if (signal?.aborted || (error as Error)?.name === 'AbortError') {
|
||||
this.config.logger.info({ tool: toolCall.name }, 'Tool execution aborted by stop signal');
|
||||
return;
|
||||
}
|
||||
|
||||
this.config.logger.error(
|
||||
{
|
||||
error,
|
||||
@@ -578,6 +687,8 @@ export class AgentHarness {
|
||||
'Tool execution failed'
|
||||
);
|
||||
|
||||
yield { type: 'error' as const, source: toolCall.name, fatal: false };
|
||||
|
||||
messagesCopy.push(
|
||||
new ToolMessage({
|
||||
content: `Error: ${error}`,
|
||||
@@ -586,11 +697,15 @@ export class AgentHarness {
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// After all tool calls complete, emit a space separator before the next LLM streaming pass
|
||||
yield { type: 'chunk', content: ' ' };
|
||||
lastChunkTail = ' ';
|
||||
}
|
||||
|
||||
// Max iterations reached - return what we have
|
||||
// Max iterations reached - yield done with apology
|
||||
this.config.logger.warn('Max tool calling iterations reached');
|
||||
return 'I apologize, but I encountered an issue processing your request. Please try rephrasing your question.';
|
||||
yield { type: 'done', content: 'I apologize, but I encountered an issue processing your request. Please try rephrasing your question.' };
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -617,162 +732,222 @@ export class AgentHarness {
|
||||
}
|
||||
|
||||
/**
|
||||
* Handle incoming message from user
|
||||
* Stream events for an incoming user message.
|
||||
* Yields typed HarnessEvents (chunk, tool_call, image, done) and saves the
|
||||
* conversation to the store once the done event has been emitted.
|
||||
*/
|
||||
async handleMessage(message: InboundMessage): Promise<OutboundMessage> {
|
||||
async *streamMessage(message: InboundMessage): AsyncGenerator<HarnessEvent> {
|
||||
this.config.logger.info(
|
||||
{ messageId: message.messageId, userId: message.userId, content: message.content.substring(0, 100) },
|
||||
'Processing user message'
|
||||
);
|
||||
|
||||
try {
|
||||
// 1. Build system prompt from template
|
||||
this.config.logger.debug('Building system prompt');
|
||||
const systemPrompt = await this.buildSystemPrompt();
|
||||
this.config.logger.debug({ systemPromptLength: systemPrompt.length }, 'System prompt built');
|
||||
// 1. Build system prompt from template
|
||||
this.config.logger.debug('Building system prompt');
|
||||
const systemPrompt = await this.buildSystemPrompt();
|
||||
this.config.logger.debug({ systemPromptLength: systemPrompt.length }, 'System prompt built');
|
||||
|
||||
// 2. Load recent conversation history
|
||||
const channelKey = this.config.channelType ?? ChannelType.WEBSOCKET;
|
||||
let storedMessages = this.conversationStore
|
||||
? await this.conversationStore.getRecentMessages(
|
||||
this.config.userId, this.config.sessionId, this.config.historyLimit, channelKey
|
||||
)
|
||||
: [];
|
||||
|
||||
// First turn: seed conversation history with current workspace state
|
||||
if (storedMessages.length === 0 && this.workspaceManager && this.conversationStore) {
|
||||
const workspaceJSON = this.workspaceManager.serializeState();
|
||||
const content = `[Workspace State]\n\`\`\`json\n${workspaceJSON}\n\`\`\``;
|
||||
await this.conversationStore.saveMessage(
|
||||
this.config.userId, this.config.sessionId,
|
||||
'workspace', content, { isWorkspaceContext: true }, channelKey
|
||||
);
|
||||
storedMessages = await this.conversationStore.getRecentMessages(
|
||||
// 2. Load recent conversation history
|
||||
const channelKey = this.config.channelType ?? ChannelType.WEBSOCKET;
|
||||
let storedMessages = this.conversationStore
|
||||
? await this.conversationStore.getRecentMessages(
|
||||
this.config.userId, this.config.sessionId, this.config.historyLimit, channelKey
|
||||
);
|
||||
}
|
||||
)
|
||||
: [];
|
||||
|
||||
const history = this.conversationStore
|
||||
? this.conversationStore.toLangChainMessages(storedMessages)
|
||||
: [];
|
||||
this.config.logger.debug({ historyLength: history.length }, 'Conversation history loaded');
|
||||
|
||||
// 4. Get the configured model
|
||||
this.config.logger.debug('Routing to model');
|
||||
const { model, middleware } = await this.modelRouter.route(
|
||||
message.content,
|
||||
this.config.license,
|
||||
RoutingStrategy.COMPLEXITY,
|
||||
this.config.userId
|
||||
// First turn: seed conversation history with current workspace state
|
||||
if (storedMessages.length === 0 && this.workspaceManager && this.conversationStore) {
|
||||
const workspaceJSON = this.workspaceManager.serializeState();
|
||||
const content = `[Workspace State]\n\`\`\`json\n${workspaceJSON}\n\`\`\``;
|
||||
await this.conversationStore.saveMessage(
|
||||
this.config.userId, this.config.sessionId,
|
||||
'workspace', content, { isWorkspaceContext: true }, channelKey
|
||||
);
|
||||
this.middleware = middleware;
|
||||
this.config.logger.info({ modelName: model.constructor.name }, 'Model selected');
|
||||
|
||||
// 5. Build LangChain messages
|
||||
const langchainMessages = this.buildLangChainMessages(systemPrompt, history, message.content);
|
||||
this.config.logger.debug({ messageCount: langchainMessages.length }, 'LangChain messages built');
|
||||
|
||||
// 6. Get tools for main agent from registry
|
||||
const toolRegistry = getToolRegistry();
|
||||
const tools = await toolRegistry.getToolsForAgent(
|
||||
'main',
|
||||
this.mcpClient,
|
||||
this.availableMCPTools,
|
||||
this.workspaceManager // Pass session workspace manager
|
||||
storedMessages = await this.conversationStore.getRecentMessages(
|
||||
this.config.userId, this.config.sessionId, this.config.historyLimit, channelKey
|
||||
);
|
||||
}
|
||||
|
||||
// Build shared subagent context
|
||||
const subagentContext = {
|
||||
userContext: createUserContext({
|
||||
userId: this.config.userId,
|
||||
sessionId: this.config.sessionId,
|
||||
license: this.config.license,
|
||||
channelType: this.config.channelType ?? ChannelType.WEBSOCKET,
|
||||
channelUserId: this.config.channelUserId ?? this.config.userId,
|
||||
}),
|
||||
};
|
||||
const history = this.conversationStore
|
||||
? this.conversationStore.toLangChainMessages(storedMessages)
|
||||
: [];
|
||||
this.config.logger.debug({ historyLength: history.length }, 'Conversation history loaded');
|
||||
|
||||
// Add research subagent as a tool if available
|
||||
if (this.researchSubagent) {
|
||||
tools.push(createResearchAgentTool({
|
||||
researchSubagent: this.researchSubagent,
|
||||
context: subagentContext,
|
||||
logger: this.config.logger,
|
||||
}));
|
||||
}
|
||||
// 4. Get the configured model
|
||||
this.config.logger.debug('Routing to model');
|
||||
const { model, middleware } = await this.modelRouter.route(
|
||||
message.content,
|
||||
this.config.license,
|
||||
RoutingStrategy.COMPLEXITY,
|
||||
this.config.userId
|
||||
);
|
||||
this.middleware = middleware;
|
||||
this.config.logger.info({ modelName: model.constructor.name }, 'Model selected');
|
||||
|
||||
// Add indicator subagent as a tool if available
|
||||
if (this.indicatorSubagent) {
|
||||
tools.push(createIndicatorAgentTool({
|
||||
indicatorSubagent: this.indicatorSubagent,
|
||||
context: subagentContext,
|
||||
logger: this.config.logger,
|
||||
}));
|
||||
}
|
||||
// 5. Build LangChain messages
|
||||
const langchainMessages = this.buildLangChainMessages(systemPrompt, history, message.content);
|
||||
this.config.logger.debug({ messageCount: langchainMessages.length }, 'LangChain messages built');
|
||||
|
||||
// Add web explore subagent as a tool if available
|
||||
if (this.webExploreSubagent) {
|
||||
tools.push(createWebExploreAgentTool({
|
||||
webExploreSubagent: this.webExploreSubagent,
|
||||
context: subagentContext,
|
||||
logger: this.config.logger,
|
||||
}));
|
||||
}
|
||||
// 6. Get tools for main agent from registry
|
||||
const toolRegistry = getToolRegistry();
|
||||
const tools = await toolRegistry.getToolsForAgent(
|
||||
'main',
|
||||
this.mcpClient,
|
||||
this.availableMCPTools,
|
||||
this.workspaceManager
|
||||
);
|
||||
|
||||
// Build shared subagent context
|
||||
const subagentContext = {
|
||||
userContext: createUserContext({
|
||||
userId: this.config.userId,
|
||||
sessionId: this.config.sessionId,
|
||||
license: this.config.license,
|
||||
channelType: this.config.channelType ?? ChannelType.WEBSOCKET,
|
||||
channelUserId: this.config.channelUserId ?? this.config.userId,
|
||||
}),
|
||||
};
|
||||
|
||||
if (this.researchSubagent) {
|
||||
tools.push(createResearchAgentTool({
|
||||
researchSubagent: this.researchSubagent,
|
||||
context: subagentContext,
|
||||
logger: this.config.logger,
|
||||
}));
|
||||
}
|
||||
|
||||
if (this.indicatorSubagent) {
|
||||
tools.push(createIndicatorAgentTool({
|
||||
indicatorSubagent: this.indicatorSubagent,
|
||||
context: subagentContext,
|
||||
logger: this.config.logger,
|
||||
}));
|
||||
}
|
||||
|
||||
if (this.webExploreSubagent) {
|
||||
tools.push(createWebExploreAgentTool({
|
||||
webExploreSubagent: this.webExploreSubagent,
|
||||
context: subagentContext,
|
||||
logger: this.config.logger,
|
||||
}));
|
||||
}
|
||||
|
||||
if (!this.strategySubagent) {
|
||||
await this.initializeStrategySubagent();
|
||||
}
|
||||
if (this.strategySubagent) {
|
||||
tools.push(createStrategyAgentTool({
|
||||
strategySubagent: this.strategySubagent,
|
||||
context: subagentContext,
|
||||
logger: this.config.logger,
|
||||
}));
|
||||
}
|
||||
|
||||
this.config.logger.info(
|
||||
{ toolCount: tools.length, toolNames: tools.map(t => t.name) },
|
||||
'Tools loaded for main agent'
|
||||
);
|
||||
|
||||
// Apply middleware (e.g. Anthropic prompt caching)
|
||||
const processedMessages = this.middleware
|
||||
? this.middleware.processMessages(langchainMessages, tools)
|
||||
: langchainMessages;
|
||||
|
||||
// 7. Bind tools to model
|
||||
const modelWithTools = tools.length > 0 && model.bindTools ? model.bindTools(tools) : model;
|
||||
|
||||
if (tools.length > 0) {
|
||||
this.config.logger.info(
|
||||
{
|
||||
toolCount: tools.length,
|
||||
toolNames: tools.map(t => t.name),
|
||||
},
|
||||
'Tools loaded for main agent'
|
||||
{ modelType: modelWithTools.constructor.name, toolsBound: tools.length > 0 && !!model.bindTools },
|
||||
'Model bound with tools'
|
||||
);
|
||||
}
|
||||
|
||||
// Apply middleware (e.g. Anthropic prompt caching)
|
||||
const processedMessages = this.middleware
|
||||
? this.middleware.processMessages(langchainMessages, tools)
|
||||
: langchainMessages;
|
||||
|
||||
// 7. Bind tools to model
|
||||
const modelWithTools = tools.length > 0 && model.bindTools ? model.bindTools(tools) : model;
|
||||
|
||||
if (tools.length > 0) {
|
||||
this.config.logger.info(
|
||||
{ modelType: modelWithTools.constructor.name, toolsBound: tools.length > 0 && !!model.bindTools },
|
||||
'Model bound with tools'
|
||||
);
|
||||
// 8. Stream tool calling loop and save conversation on completion
|
||||
this.config.logger.info('Invoking LLM with tool support');
|
||||
this.abortController = new AbortController();
|
||||
let finalContent = '';
|
||||
const collectedImages: Array<{ data: string; mimeType: string; caption?: string }> = [];
|
||||
try {
|
||||
for await (const event of this.executeWithToolCalling(modelWithTools, processedMessages, tools, 10, this.abortController.signal)) {
|
||||
if (event.type === 'done') {
|
||||
finalContent = event.content;
|
||||
this.config.logger.info({ responseLength: finalContent.length }, 'LLM response received');
|
||||
} else if (event.type === 'image') {
|
||||
collectedImages.push({ data: event.data, mimeType: event.mimeType, caption: event.caption });
|
||||
}
|
||||
yield event;
|
||||
}
|
||||
|
||||
// 8. Call LLM with tool calling loop
|
||||
this.config.logger.info('Invoking LLM with tool support');
|
||||
this.abortController = new AbortController();
|
||||
const assistantMessage = await this.executeWithToolCalling(modelWithTools, processedMessages, tools, 10, this.abortController.signal);
|
||||
} catch (error) {
|
||||
if ((error as Error)?.name === 'AbortError') {
|
||||
this.config.logger.info('Agent harness interrupted by stop signal');
|
||||
} else {
|
||||
this.config.logger.error({ error }, 'Fatal error in agent harness');
|
||||
yield { type: 'error' as const, source: 'agent harness', fatal: true };
|
||||
}
|
||||
} finally {
|
||||
this.abortController = null;
|
||||
if (finalContent && this.conversationStore) {
|
||||
// Write blobs to S3 and capture their IDs for message metadata
|
||||
let blobRefs: Array<{ id: string; mimeType: string; caption?: string }> = [];
|
||||
if (collectedImages.length > 0 && this.blobStore) {
|
||||
const assistantMsgId = `${this.config.userId}:${this.config.sessionId}:${Date.now()}`;
|
||||
const blobIds = await this.blobStore.writeBlobs(
|
||||
this.config.userId, this.config.sessionId, assistantMsgId,
|
||||
collectedImages.map(img => ({ blobType: 'image' as const, mimeType: img.mimeType, data: img.data, caption: img.caption }))
|
||||
);
|
||||
blobRefs = blobIds.map((id, i) => ({ id, mimeType: collectedImages[i].mimeType, caption: collectedImages[i].caption }));
|
||||
}
|
||||
|
||||
this.config.logger.info(
|
||||
{ responseLength: assistantMessage.length },
|
||||
'LLM response received'
|
||||
);
|
||||
|
||||
// Save user message and assistant response to conversation store
|
||||
if (this.conversationStore) {
|
||||
await this.conversationStore.saveMessage(
|
||||
this.config.userId, this.config.sessionId, 'user', message.content, undefined, channelKey
|
||||
);
|
||||
await this.conversationStore.saveMessage(
|
||||
this.config.userId, this.config.sessionId, 'assistant', assistantMessage, undefined, channelKey
|
||||
this.config.userId, this.config.sessionId, 'assistant', finalContent,
|
||||
blobRefs.length > 0 ? { blobs: blobRefs } : undefined,
|
||||
channelKey
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return {
|
||||
messageId: `msg_${Date.now()}`,
|
||||
sessionId: message.sessionId,
|
||||
content: assistantMessage,
|
||||
timestamp: new Date(),
|
||||
};
|
||||
/**
|
||||
* Handle incoming message from user.
|
||||
* Consumes streamMessage and dispatches events to the channel adapter for
|
||||
* backward compatibility with Telegram and other non-streaming callers.
|
||||
*/
|
||||
async handleMessage(message: InboundMessage): Promise<OutboundMessage> {
|
||||
let finalContent = '';
|
||||
try {
|
||||
for await (const event of this.streamMessage(message)) {
|
||||
switch (event.type) {
|
||||
case 'chunk':
|
||||
this.channelAdapter?.sendChunk(event.content);
|
||||
break;
|
||||
case 'tool_call':
|
||||
this.channelAdapter?.sendToolCall?.(event.toolName, event.label);
|
||||
break;
|
||||
case 'image':
|
||||
this.channelAdapter?.sendImage({ data: event.data, mimeType: event.mimeType, caption: event.caption });
|
||||
break;
|
||||
case 'error':
|
||||
this.channelAdapter?.sendText?.({ text: `An unrecoverable error occurred in the ${event.source}.` });
|
||||
break;
|
||||
case 'done':
|
||||
finalContent = event.content;
|
||||
break;
|
||||
}
|
||||
}
|
||||
} catch (error) {
|
||||
this.config.logger.error({ error }, 'Error processing message');
|
||||
throw error;
|
||||
}
|
||||
return {
|
||||
messageId: `msg_${Date.now()}`,
|
||||
sessionId: message.sessionId,
|
||||
content: finalContent,
|
||||
timestamp: new Date(),
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -817,21 +992,27 @@ export class AgentHarness {
|
||||
python_write: 'Coding...',
|
||||
python_read: 'Inspecting...',
|
||||
execute_research: 'Running script...',
|
||||
backtest_strategy: 'Running backtest...',
|
||||
backtest_strategy: 'Backtesting...',
|
||||
list_active_strategies: 'Checking active strategies...',
|
||||
web_explore: 'Searching the web...',
|
||||
strategy: 'Coding a strategy...',
|
||||
};
|
||||
return labels[toolName] ?? `Running ${toolName}...`;
|
||||
return labels[toolName] ?? `Running ${toolName} tool...`;
|
||||
}
|
||||
|
||||
/**
|
||||
* Process tool result to extract images and send via channel adapter.
|
||||
* Returns text-only version for LLM context (no base64 image data).
|
||||
*/
|
||||
private processToolResult(result: string, toolName: string): string {
|
||||
private extractImagesFromToolResult(
|
||||
result: string,
|
||||
toolName: string
|
||||
): { cleanedResult: string; images: Array<{ data: string; mimeType: string; caption?: string }> } {
|
||||
const noImages = { cleanedResult: String(result || ''), images: [] };
|
||||
|
||||
// Most tools return plain strings - only process JSON results
|
||||
if (!result || typeof result !== 'string') {
|
||||
return String(result || '');
|
||||
return noImages;
|
||||
}
|
||||
|
||||
// Try to parse as JSON
|
||||
@@ -840,7 +1021,7 @@ export class AgentHarness {
|
||||
parsedResult = JSON.parse(result);
|
||||
} catch {
|
||||
// Not JSON, return as-is
|
||||
return result;
|
||||
return noImages;
|
||||
}
|
||||
|
||||
// Check if result has images array (from ResearchSubagent)
|
||||
@@ -850,19 +1031,11 @@ export class AgentHarness {
|
||||
'Extracting images from tool result'
|
||||
);
|
||||
|
||||
// Send each image via channel adapter
|
||||
const images: Array<{ data: string; mimeType: string; caption?: string }> = [];
|
||||
for (const image of parsedResult.images) {
|
||||
if (image.data && image.mimeType) {
|
||||
if (this.channelAdapter) {
|
||||
this.config.logger.debug({ mimeType: image.mimeType }, 'Sending image to channel');
|
||||
this.channelAdapter.sendImage({
|
||||
data: image.data,
|
||||
mimeType: image.mimeType,
|
||||
caption: undefined,
|
||||
});
|
||||
} else {
|
||||
this.config.logger.warn('No channel adapter set, cannot send image');
|
||||
}
|
||||
this.config.logger.debug({ mimeType: image.mimeType }, 'Extracted image from tool result');
|
||||
images.push({ data: image.data, mimeType: image.mimeType, caption: undefined });
|
||||
}
|
||||
}
|
||||
|
||||
@@ -872,15 +1045,13 @@ export class AgentHarness {
|
||||
images: undefined,
|
||||
imageCount: parsedResult.images.length,
|
||||
};
|
||||
|
||||
// Clean up undefined values
|
||||
Object.keys(textOnlyResult).forEach(key => {
|
||||
if (textOnlyResult[key] === undefined) {
|
||||
delete textOnlyResult[key];
|
||||
}
|
||||
});
|
||||
|
||||
return JSON.stringify(textOnlyResult);
|
||||
return { cleanedResult: JSON.stringify(textOnlyResult), images };
|
||||
}
|
||||
|
||||
// Check for nested chart_images object
|
||||
@@ -890,20 +1061,12 @@ export class AgentHarness {
|
||||
'Extracting chart images from tool result'
|
||||
);
|
||||
|
||||
// Send each chart image via channel adapter
|
||||
const images: Array<{ data: string; mimeType: string; caption?: string }> = [];
|
||||
for (const [chartId, chartData] of Object.entries(parsedResult.chart_images)) {
|
||||
const chart = chartData as any;
|
||||
if (chart.type === 'image' && chart.data) {
|
||||
if (this.channelAdapter) {
|
||||
this.config.logger.debug({ chartId }, 'Sending chart image to channel');
|
||||
this.channelAdapter.sendImage({
|
||||
data: chart.data,
|
||||
mimeType: 'image/png',
|
||||
caption: undefined,
|
||||
});
|
||||
} else {
|
||||
this.config.logger.warn('No channel adapter set, cannot send chart image');
|
||||
}
|
||||
this.config.logger.debug({ chartId }, 'Extracted chart image from tool result');
|
||||
images.push({ data: chart.data, mimeType: 'image/png', caption: undefined });
|
||||
}
|
||||
}
|
||||
|
||||
@@ -913,19 +1076,17 @@ export class AgentHarness {
|
||||
chart_images: undefined,
|
||||
chartCount: Object.keys(parsedResult.chart_images).length,
|
||||
};
|
||||
|
||||
// Clean up undefined values
|
||||
Object.keys(textOnlyResult).forEach(key => {
|
||||
if (textOnlyResult[key] === undefined) {
|
||||
delete textOnlyResult[key];
|
||||
}
|
||||
});
|
||||
|
||||
return JSON.stringify(textOnlyResult);
|
||||
return { cleanedResult: JSON.stringify(textOnlyResult), images };
|
||||
}
|
||||
|
||||
// No images found, return stringified result
|
||||
return result;
|
||||
// No images found, return as-is
|
||||
return { cleanedResult: result, images: [] };
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
51
gateway/src/harness/harness-events.ts
Normal file
51
gateway/src/harness/harness-events.ts
Normal file
@@ -0,0 +1,51 @@
|
||||
export interface ChunkEvent {
|
||||
type: 'chunk';
|
||||
content: string;
|
||||
}
|
||||
|
||||
export interface ToolCallEvent {
|
||||
type: 'tool_call';
|
||||
toolName: string;
|
||||
label: string;
|
||||
}
|
||||
|
||||
export interface ImageEvent {
|
||||
type: 'image';
|
||||
data: string;
|
||||
mimeType: string;
|
||||
caption?: string;
|
||||
}
|
||||
|
||||
export interface DoneEvent {
|
||||
type: 'done';
|
||||
content: string;
|
||||
}
|
||||
|
||||
export interface SubagentChunkEvent {
|
||||
type: 'subagent_chunk';
|
||||
agentName: string;
|
||||
content: string;
|
||||
}
|
||||
|
||||
export interface SubagentThinkingEvent {
|
||||
type: 'subagent_thinking';
|
||||
agentName: string;
|
||||
content: string;
|
||||
}
|
||||
|
||||
export interface SubagentToolCallEvent {
|
||||
type: 'subagent_tool_call';
|
||||
agentName: string;
|
||||
toolName: string;
|
||||
label: string;
|
||||
}
|
||||
|
||||
export interface ErrorEvent {
|
||||
type: 'error';
|
||||
/** Name of the agent or tool where the error occurred */
|
||||
source: string;
|
||||
/** True if the error is unrecoverable and the chat session should end */
|
||||
fatal: boolean;
|
||||
}
|
||||
|
||||
export type HarnessEvent = ChunkEvent | ToolCallEvent | ImageEvent | DoneEvent | SubagentChunkEvent | SubagentThinkingEvent | SubagentToolCallEvent | ErrorEvent;
|
||||
@@ -57,57 +57,74 @@ export class MCPClientConnector {
|
||||
this.client = null;
|
||||
}
|
||||
|
||||
try {
|
||||
this.config.logger.info(
|
||||
{ userId: this.config.userId, url: this.config.mcpServerUrl },
|
||||
'Connecting to user MCP server'
|
||||
);
|
||||
const maxAttempts = 5;
|
||||
const retryDelayMs = 1500;
|
||||
|
||||
this.client = new Client(
|
||||
{
|
||||
name: 'dexorder-gateway',
|
||||
version: '0.1.0',
|
||||
},
|
||||
{
|
||||
capabilities: {
|
||||
sampling: {},
|
||||
this.config.logger.info(
|
||||
{ userId: this.config.userId, url: this.config.mcpServerUrl },
|
||||
'Connecting to user MCP server'
|
||||
);
|
||||
|
||||
let lastError: unknown;
|
||||
for (let attempt = 1; attempt <= maxAttempts; attempt++) {
|
||||
try {
|
||||
this.client = new Client(
|
||||
{
|
||||
name: 'dexorder-gateway',
|
||||
version: '0.1.0',
|
||||
},
|
||||
}
|
||||
);
|
||||
{
|
||||
capabilities: {
|
||||
sampling: {},
|
||||
},
|
||||
}
|
||||
);
|
||||
|
||||
// Streamable HTTP: single /mcp endpoint, session tracked via mcp-session-id header
|
||||
const transport = new StreamableHTTPClientTransport(
|
||||
new URL(`${this.config.mcpServerUrl}/mcp`)
|
||||
);
|
||||
// Streamable HTTP: single /mcp endpoint, session tracked via mcp-session-id header
|
||||
const transport = new StreamableHTTPClientTransport(
|
||||
new URL(`${this.config.mcpServerUrl}/mcp`)
|
||||
);
|
||||
|
||||
await this.client.connect(transport);
|
||||
await this.client.connect(transport);
|
||||
|
||||
// Hook client.onerror to detect transport failures (e.g. sandbox restart returning
|
||||
// 404 "session not found"). When fired, mark disconnected so the next callTool /
|
||||
// listTools call triggers a full reconnect + initialize handshake.
|
||||
const connectedClient = this.client;
|
||||
const origOnError = this.client.onerror;
|
||||
this.client.onerror = (error) => {
|
||||
origOnError?.(error);
|
||||
// Only act on the currently-active client (ignore stale closures after reconnect)
|
||||
if (this.client === connectedClient && this.connected) {
|
||||
// Hook client.onerror to detect transport failures (e.g. sandbox restart returning
|
||||
// 404 "session not found"). When fired, mark disconnected so the next callTool /
|
||||
// listTools call triggers a full reconnect + initialize handshake.
|
||||
const connectedClient = this.client;
|
||||
const origOnError = this.client.onerror;
|
||||
this.client.onerror = (error) => {
|
||||
origOnError?.(error);
|
||||
// Only act on the currently-active client (ignore stale closures after reconnect)
|
||||
if (this.client === connectedClient && this.connected) {
|
||||
this.config.logger.warn(
|
||||
{ error },
|
||||
'MCP transport error — marking disconnected for lazy reconnect'
|
||||
);
|
||||
this.connected = false;
|
||||
}
|
||||
};
|
||||
|
||||
this.connected = true;
|
||||
this.config.logger.info('Connected to user MCP server');
|
||||
return;
|
||||
} catch (error) {
|
||||
lastError = error;
|
||||
this.client = null;
|
||||
if (attempt < maxAttempts) {
|
||||
this.config.logger.warn(
|
||||
{ error },
|
||||
'MCP transport error — marking disconnected for lazy reconnect'
|
||||
{ error, userId: this.config.userId, attempt, maxAttempts },
|
||||
'MCP connect attempt failed, retrying...'
|
||||
);
|
||||
this.connected = false;
|
||||
await new Promise(resolve => setTimeout(resolve, retryDelayMs));
|
||||
}
|
||||
};
|
||||
|
||||
this.connected = true;
|
||||
this.config.logger.info('Connected to user MCP server');
|
||||
} catch (error) {
|
||||
this.config.logger.error(
|
||||
{ error, userId: this.config.userId },
|
||||
'Failed to connect to user MCP server'
|
||||
);
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
|
||||
this.config.logger.error(
|
||||
{ error: lastError, userId: this.config.userId },
|
||||
'Failed to connect to user MCP server'
|
||||
);
|
||||
throw lastError;
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -134,7 +151,9 @@ export class MCPClientConnector {
|
||||
try {
|
||||
this.config.logger.debug({ tool: name, args }, 'Calling MCP tool');
|
||||
|
||||
const result = await this.client!.callTool({ name, arguments: args });
|
||||
// Use a generous timeout: execute_research runs a subprocess with a 300s limit,
|
||||
// so the default 60s MCP SDK timeout would fire before the script completes.
|
||||
const result = await this.client!.callTool({ name, arguments: args }, undefined, { timeout: 330000 });
|
||||
return result;
|
||||
} catch (error) {
|
||||
this.config.logger.error({ error, tool: name }, 'MCP tool call failed');
|
||||
|
||||
93
gateway/src/harness/memory/blob-store.ts
Normal file
93
gateway/src/harness/memory/blob-store.ts
Normal file
@@ -0,0 +1,93 @@
|
||||
import type { FastifyBaseLogger } from 'fastify';
|
||||
import type { IcebergClient } from '../../clients/iceberg-client.js';
|
||||
|
||||
export interface StoredBlob {
|
||||
id: string;
|
||||
userId: string;
|
||||
sessionId: string;
|
||||
messageId: string;
|
||||
blobType: 'image' | 'audio';
|
||||
mimeType: string;
|
||||
data: string; // base64
|
||||
caption?: string;
|
||||
timestamp: number; // microseconds
|
||||
}
|
||||
|
||||
/**
|
||||
* Blob store for binary attachments (images, audio) referenced by conversation messages.
|
||||
*
|
||||
* Unlike text messages (Redis hot + Iceberg cold), blobs write directly to S3 Parquet
|
||||
* on each turn — they're infrequent enough that per-turn files don't cause fragmentation.
|
||||
* Blob IDs are stored in the parent message's metadata field for later retrieval.
|
||||
*/
|
||||
export class BlobStore {
|
||||
constructor(
|
||||
private icebergClient: IcebergClient | undefined,
|
||||
private logger: FastifyBaseLogger
|
||||
) {}
|
||||
|
||||
/**
|
||||
* Write all blobs for one assistant turn to a single S3 Parquet file.
|
||||
* Returns the blob IDs assigned. Failures are logged but do not throw.
|
||||
*/
|
||||
async writeBlobs(
|
||||
userId: string,
|
||||
sessionId: string,
|
||||
messageId: string,
|
||||
blobs: Array<{ blobType: 'image' | 'audio'; mimeType: string; data: string; caption?: string }>
|
||||
): Promise<string[]> {
|
||||
if (!this.icebergClient || blobs.length === 0) {
|
||||
return [];
|
||||
}
|
||||
|
||||
const now = Date.now();
|
||||
const stored = blobs.map((b, i) => ({
|
||||
id: `blob_${userId}_${now}_${i}`,
|
||||
user_id: userId,
|
||||
session_id: sessionId,
|
||||
message_id: messageId,
|
||||
blob_type: b.blobType,
|
||||
mime_type: b.mimeType,
|
||||
data: b.data,
|
||||
caption: b.caption ?? null,
|
||||
timestamp: now * 1000, // microseconds
|
||||
}));
|
||||
|
||||
try {
|
||||
await this.icebergClient.appendBlobs(userId, sessionId, messageId, stored);
|
||||
this.logger.info({ userId, sessionId, count: stored.length }, 'Blobs written to S3');
|
||||
} catch (error) {
|
||||
this.logger.error({ error }, 'Failed to write blobs to S3');
|
||||
// Don't throw — blob failure should not break the conversation turn
|
||||
}
|
||||
|
||||
return stored.map(b => b.id);
|
||||
}
|
||||
|
||||
/**
|
||||
* Retrieve blobs by their IDs from S3/Iceberg cold storage.
|
||||
*/
|
||||
async getBlobsByIds(userId: string, sessionId: string, blobIds: string[]): Promise<StoredBlob[]> {
|
||||
if (!this.icebergClient || blobIds.length === 0) {
|
||||
return [];
|
||||
}
|
||||
|
||||
try {
|
||||
const rows = await this.icebergClient.queryBlobs(userId, sessionId, blobIds);
|
||||
return rows.map(r => ({
|
||||
id: r.id,
|
||||
userId: r.user_id,
|
||||
sessionId: r.session_id,
|
||||
messageId: r.message_id,
|
||||
blobType: r.blob_type as 'image' | 'audio',
|
||||
mimeType: r.mime_type,
|
||||
data: r.data,
|
||||
caption: r.caption ?? undefined,
|
||||
timestamp: r.timestamp,
|
||||
}));
|
||||
} catch (error) {
|
||||
this.logger.error({ error, blobIds }, 'Failed to retrieve blobs');
|
||||
return [];
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -39,9 +39,9 @@ If the user asks for a capability not provided by Dexorder, decline and explain
|
||||
## Task Delegation
|
||||
- For ANY research questions, deep analysis, statistical analysis, charting requests, or market data queries that require computation, you MUST use the 'research' tool
|
||||
- For ANYTHING related to indicators on the chart — reading, adding, removing, modifying, or creating custom indicators — you MUST use the 'indicator' tool
|
||||
- For ANY backtesting request — running a strategy against historical data — you MUST use the 'backtest_strategy' tool directly; NEVER use the research tool for backtesting
|
||||
- For ANY request about trading strategies — writing, editing, backtesting, interpreting results, activating, deactivating, or monitoring — you MUST use the 'strategy' tool; NEVER write strategy Python code yourself
|
||||
- NEVER write Python code directly in your responses to the user
|
||||
- NEVER show code to the user — delegate to the research or indicator tool instead
|
||||
- NEVER show code to the user — delegate to the research, indicator, or strategy tool instead
|
||||
- NEVER attempt to do analysis yourself — let the subagents handle it
|
||||
|
||||
## Available Tools
|
||||
@@ -110,46 +110,54 @@ Parameters:
|
||||
- instruction: Natural language description of the analysis to perform (be specific!)
|
||||
- name: A unique name for the research script (e.g., "BTC Weekly Analysis")
|
||||
|
||||
**Do NOT include any time range, history length, bar count, period size, resolution, or timestamp guidance in the instruction** — not as numbers, not as natural language ("3-6 months", "1 year", "sufficient data"), not at all. The research subagent has its own rules for selecting resolution and history window. If you add time guidance, the subagent will follow yours instead of its own (which uses much more data). Only pass time constraints if the user explicitly asked for a specific period (e.g. "last week", "show me 2023").
|
||||
|
||||
Example usage:
|
||||
- User: "Does Friday price action correlate with Monday?"
|
||||
- You: Call research tool with instruction="Analyze correlation between Friday and Monday price action during NY trading hours (9:30-4:00 ET)", name="Friday-Monday Correlation"
|
||||
- WRONG: "...use hourly data and at least 3-6 months..." ← never add this
|
||||
|
||||
### strategy
|
||||
**Use this tool for ALL trading strategy requests without exception.**
|
||||
|
||||
The strategy subagent handles the complete strategy lifecycle: writing PandasStrategy classes, running backtests, interpreting results, and activating/deactivating paper trading.
|
||||
|
||||
**ALWAYS use strategy for:**
|
||||
- "Create a strategy that buys when RSI < 30" → write a new strategy
|
||||
- "Edit my momentum strategy to use a tighter stop" → modify existing strategy
|
||||
- "Backtest my RSI strategy over the last year" → run backtest
|
||||
- "How did this strategy perform on BTC?" → interpret results
|
||||
- "Activate my strategy for paper trading" → start paper trading
|
||||
- "What strategies are running?" → list active strategies
|
||||
- "Stop my momentum strategy" → deactivate a strategy
|
||||
- Any question about a strategy's PnL, trades, or performance
|
||||
|
||||
**NEVER call `backtest_strategy`, `activate_strategy`, `deactivate_strategy`, or `list_active_strategies` directly** — always go through the strategy tool.
|
||||
|
||||
**Custom indicators in strategies:**
|
||||
When writing a new strategy, the strategy subagent will first check for existing custom indicators via `python_list(category="indicator")`. Prefer using custom indicators (via `ta.custom_*`) over computing signals inline — this promotes reuse and gives users better visibility into strategy components. If a needed indicator doesn't exist yet, the strategy subagent will create it first via the indicator workflow.
|
||||
|
||||
### backtest_strategy
|
||||
**ALWAYS use this tool — and ONLY this tool — for any backtesting request.**
|
||||
|
||||
*(Called internally by the strategy tool — do not call this directly.)*
|
||||
Runs a saved trading strategy against historical OHLC data using the Nautilus Trader backtesting engine.
|
||||
Returns structured performance metrics and an equity curve. Any charts generated are automatically sent to the user.
|
||||
|
||||
**ALWAYS use backtest_strategy for:**
|
||||
- "Backtest my RSI strategy over the last year"
|
||||
- "How did this strategy perform on BTC?"
|
||||
- "Run a backtest from January to June"
|
||||
- Any request to test or evaluate a strategy on historical data
|
||||
|
||||
**NEVER use research for backtesting** — the research tool cannot run strategies through the backtesting engine.
|
||||
|
||||
After the tool returns, summarize the results clearly: total return, Sharpe ratio, max drawdown, win rate, and trade count. Present the equity curve description in plain language.
|
||||
|
||||
Parameters:
|
||||
- strategy_name: Display name of the saved strategy (use python_list with category="strategy" to check existing strategies)
|
||||
- feeds: Array of `{symbol, period_seconds}` feed objects (e.g. `[{"symbol": "BTC/USDT.BINANCE", "period_seconds": 3600}]`)
|
||||
- from_time / to_time: Date strings ("2024-01-01", "90 days ago", "now") or Unix timestamps
|
||||
- initial_capital: Starting balance in quote currency (default 10,000)
|
||||
Returns structured performance metrics including trade list, Sortino/Calmar ratios, and equity curve.
|
||||
|
||||
### list_active_strategies
|
||||
*(Called internally by the strategy tool — do not call this directly.)*
|
||||
Lists all currently active (live or paper) strategies and their status.
|
||||
Use this when the user asks what strategies are running.
|
||||
|
||||
### python_list
|
||||
List existing scripts in a category ("strategy", "indicator", or "research").
|
||||
Use this before calling the research tool to check whether a relevant script already exists.
|
||||
If one does, pass its exact name to the research tool so the subagent updates it rather than creating a new one.
|
||||
Also use before calling backtest_strategy to confirm the strategy name.
|
||||
The strategy tool uses this internally to check strategy names before backtesting.
|
||||
|
||||
### symbol-lookup
|
||||
Look up trading symbols and get metadata.
|
||||
Use this when users mention tickers or need symbol information.
|
||||
|
||||
**Always use symbol_lookup to resolve a proper ticker before passing it to the research or get-chart-data tools.** Symbols must be in `SYMBOL.EXCHANGE` format (e.g., `BTC/USDT.BINANCE`). If the user says "ETHUSDT", "ETH", or any ambiguous ticker, resolve it first with symbol_lookup so the correct formatted ticker is passed downstream.
|
||||
|
||||
### get-chart-data
|
||||
**IMPORTANT: This is for QUICK, CASUAL information ONLY. This tool just returns raw data - it does NOT create charts or plots.**
|
||||
|
||||
|
||||
@@ -7,6 +7,7 @@ import type { MCPClientConnector } from '../mcp-client.js';
|
||||
import type { DynamicStructuredTool } from '@langchain/core/tools';
|
||||
import { readFile } from 'fs/promises';
|
||||
import { join } from 'path';
|
||||
import type { HarnessEvent, SubagentChunkEvent, SubagentThinkingEvent } from '../harness-events.js';
|
||||
|
||||
/**
|
||||
* Subagent configuration (loaded from config.yaml)
|
||||
@@ -122,6 +123,65 @@ export abstract class BaseSubagent {
|
||||
yield result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract subagent_chunk / subagent_thinking events from a LangGraph `messages` stream datum.
|
||||
*
|
||||
* LangGraph emits `[message_chunk, metadata]` tuples in `messages` mode. The message content
|
||||
* can be a plain string (normal text token) or an array of content blocks (extended thinking
|
||||
* responses with `{type:"thinking", thinking:"..."}` and `{type:"text", text:"..."}`).
|
||||
*/
|
||||
static extractStreamChunks(
|
||||
data: unknown,
|
||||
agentName: string,
|
||||
): Array<SubagentChunkEvent | SubagentThinkingEvent> {
|
||||
const msg = Array.isArray(data) ? (data as unknown[])[0] : data;
|
||||
const content = (msg as any)?.content;
|
||||
if (typeof content === 'string') {
|
||||
return content ? [{ type: 'subagent_chunk', agentName, content }] : [];
|
||||
}
|
||||
if (Array.isArray(content)) {
|
||||
const chunks: Array<SubagentChunkEvent | SubagentThinkingEvent> = [];
|
||||
for (const block of content as any[]) {
|
||||
if (block?.type === 'thinking' && typeof block.thinking === 'string' && block.thinking) {
|
||||
chunks.push({ type: 'subagent_thinking', agentName, content: block.thinking });
|
||||
} else if (block?.type === 'text' && typeof block.text === 'string' && block.text) {
|
||||
chunks.push({ type: 'subagent_chunk', agentName, content: block.text });
|
||||
}
|
||||
}
|
||||
return chunks;
|
||||
}
|
||||
return [];
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract the final text from an `updates`-mode agent message.
|
||||
* Handles both plain string content and array content blocks (extended thinking).
|
||||
*/
|
||||
static extractFinalText(msg: any): string {
|
||||
if (typeof msg?.content === 'string') return msg.content;
|
||||
if (Array.isArray(msg?.content)) {
|
||||
return (msg.content as any[])
|
||||
.filter((b: any) => b?.type === 'text' && typeof b.text === 'string')
|
||||
.map((b: any) => b.text as string)
|
||||
.join('');
|
||||
}
|
||||
return '';
|
||||
}
|
||||
|
||||
/**
|
||||
* Stream typed HarnessEvents during execution.
|
||||
* Subclasses override this to emit subagent_chunk / subagent_tool_call events
|
||||
* using agent.stream() from LangGraph. Default falls back to execute().
|
||||
*/
|
||||
async *streamEvents(
|
||||
context: SubagentContext,
|
||||
input: string,
|
||||
_signal?: AbortSignal,
|
||||
): AsyncGenerator<HarnessEvent, string> {
|
||||
const result = await this.execute(context, input);
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Build messages with system prompt and memory context
|
||||
*/
|
||||
|
||||
@@ -11,3 +11,8 @@ export {
|
||||
createResearchSubagent,
|
||||
type ResearchResult,
|
||||
} from './research/index.js';
|
||||
|
||||
export {
|
||||
StrategySubagent,
|
||||
createStrategySubagent,
|
||||
} from './strategy/index.js';
|
||||
|
||||
@@ -4,6 +4,7 @@ import { SystemMessage } from '@langchain/core/messages';
|
||||
import { createReactAgent } from '@langchain/langgraph/prebuilt';
|
||||
import type { FastifyBaseLogger } from 'fastify';
|
||||
import type { MCPClientConnector } from '../../mcp-client.js';
|
||||
import type { HarnessEvent } from '../../harness-events.js';
|
||||
|
||||
/**
|
||||
* Indicator Subagent
|
||||
@@ -84,6 +85,56 @@ export class IndicatorSubagent extends BaseSubagent {
|
||||
|
||||
return finalText;
|
||||
}
|
||||
|
||||
async *streamEvents(context: SubagentContext, instruction: string, signal?: AbortSignal): AsyncGenerator<HarnessEvent, string> {
|
||||
this.logger.info({ subagent: this.getName() }, 'streamEvents starting');
|
||||
|
||||
if (!this.hasMCPClient()) {
|
||||
throw new Error('MCP client not available for indicator subagent');
|
||||
}
|
||||
|
||||
const initialMessages = this.buildMessages(context, instruction);
|
||||
const systemMessage = initialMessages[0];
|
||||
const humanMessage = initialMessages[initialMessages.length - 1];
|
||||
|
||||
const agent = createReactAgent({
|
||||
llm: this.model,
|
||||
tools: this.tools,
|
||||
prompt: systemMessage as SystemMessage,
|
||||
});
|
||||
|
||||
const stream = agent.stream(
|
||||
{ messages: [humanMessage] },
|
||||
{ streamMode: ['messages', 'updates'], recursionLimit: 25, signal }
|
||||
);
|
||||
|
||||
let finalText = '';
|
||||
|
||||
for await (const [mode, data] of await stream) {
|
||||
if (signal?.aborted) break;
|
||||
if (mode === 'messages') {
|
||||
for (const chunk of IndicatorSubagent.extractStreamChunks(data, this.config.name)) {
|
||||
yield chunk;
|
||||
}
|
||||
} else if (mode === 'updates') {
|
||||
if ((data as any).agent?.messages) {
|
||||
for (const msg of (data as any).agent.messages as any[]) {
|
||||
if (msg.tool_calls?.length) {
|
||||
for (const tc of msg.tool_calls) {
|
||||
yield { type: 'subagent_tool_call', agentName: this.config.name, toolName: tc.name, label: tc.name };
|
||||
}
|
||||
} else {
|
||||
const content = IndicatorSubagent.extractFinalText(msg);
|
||||
if (content) finalText = content;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
this.logger.info({ textLength: finalText.length }, 'streamEvents finished');
|
||||
return finalText;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
@@ -4,6 +4,7 @@ import { SystemMessage } from '@langchain/core/messages';
|
||||
import { createReactAgent } from '@langchain/langgraph/prebuilt';
|
||||
import type { FastifyBaseLogger } from 'fastify';
|
||||
import type { MCPClientConnector } from '../../mcp-client.js';
|
||||
import type { HarnessEvent } from '../../harness-events.js';
|
||||
|
||||
/**
|
||||
* Result from research subagent execution
|
||||
@@ -50,6 +51,58 @@ export class ResearchSubagent extends BaseSubagent {
|
||||
this.imageCapture = capture;
|
||||
}
|
||||
|
||||
/**
|
||||
* Fetch custom indicators from the sandbox and return a formatted system prompt section.
|
||||
* Returns empty string if there are no custom indicators or the call fails.
|
||||
*/
|
||||
private async fetchCustomIndicatorsSection(): Promise<string> {
|
||||
try {
|
||||
const raw = await this.callMCPTool('python_list', { category: 'indicator' });
|
||||
const r = raw as any;
|
||||
const text = r?.content?.[0]?.text ?? r?.[0]?.text;
|
||||
const parsed = typeof text === 'string' ? JSON.parse(text) : raw;
|
||||
const items: any[] = parsed?.items ?? [];
|
||||
if (items.length === 0) return '';
|
||||
|
||||
const lines: string[] = ['\n\n## Custom Indicators\n'];
|
||||
lines.push('The user has defined the following custom indicators. Use `ta.custom_<name>` where `<name>` is the lowercase sanitized function name shown below.\n');
|
||||
|
||||
for (const item of items) {
|
||||
const displayName: string = item.name ?? 'unknown';
|
||||
const description: string = item.description ?? '';
|
||||
const meta: any = item.metadata ?? {};
|
||||
// Derive the ta attribute name: sanitize display name to lowercase + underscores
|
||||
const taAttr = `custom_${displayName.toLowerCase().replace(/[^\w]/g, '_').replace(/_+/g, '_').replace(/^_+|_+$/g, '')}`;
|
||||
const inputSeries: string[] = meta.input_series ?? ['close'];
|
||||
const params: Record<string, any> = meta.parameters ?? {};
|
||||
const pane: string = meta.pane ?? 'separate';
|
||||
|
||||
const inputStr = inputSeries.map((s: string) => `df['${s}']`).join(', ');
|
||||
const paramStr = Object.entries(params)
|
||||
.map(([k, v]: [string, any]) => `${k}=${JSON.stringify(v?.default ?? null)}`)
|
||||
.join(', ');
|
||||
const callExample = paramStr
|
||||
? `ta.${taAttr}(${inputStr}, ${paramStr})`
|
||||
: `ta.${taAttr}(${inputStr})`;
|
||||
|
||||
const outputNames = (meta.output_columns ?? [{ name: 'value' }])
|
||||
.map((c: any) => c.name)
|
||||
.join(', ');
|
||||
|
||||
lines.push(`### ${displayName}`);
|
||||
if (description) lines.push(description);
|
||||
lines.push(`- **Call**: \`${callExample}\``);
|
||||
lines.push(`- **Outputs**: ${outputNames} | **Pane**: ${pane}`);
|
||||
lines.push('');
|
||||
}
|
||||
|
||||
return lines.join('\n');
|
||||
} catch (err) {
|
||||
this.logger.warn({ err }, 'Failed to fetch custom indicators for prompt injection');
|
||||
return '';
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Execute research request using LangGraph's createReactAgent.
|
||||
* This is the standard LangChain pattern for agents with tool access —
|
||||
@@ -79,11 +132,17 @@ export class ResearchSubagent extends BaseSubagent {
|
||||
this.imageCapture.length = 0;
|
||||
this.lastImages = [];
|
||||
|
||||
const customIndicatorsSection = await this.fetchCustomIndicatorsSection();
|
||||
|
||||
// Build system prompt (with memory context appended)
|
||||
const initialMessages = this.buildMessages(context, instruction);
|
||||
// buildMessages returns [SystemMessage, ...history, HumanMessage]
|
||||
// Extract system content for createReactAgent's prompt parameter
|
||||
const systemMessage = initialMessages[0];
|
||||
let systemMessage = initialMessages[0] as SystemMessage;
|
||||
if (customIndicatorsSection) {
|
||||
const base = typeof systemMessage.content === 'string' ? systemMessage.content : JSON.stringify(systemMessage.content);
|
||||
systemMessage = new SystemMessage(base + customIndicatorsSection);
|
||||
}
|
||||
const humanMessage = initialMessages[initialMessages.length - 1];
|
||||
|
||||
// createReactAgent is the standard LangChain/LangGraph pattern for tool-using agents.
|
||||
@@ -91,12 +150,12 @@ export class ResearchSubagent extends BaseSubagent {
|
||||
const agent = createReactAgent({
|
||||
llm: this.model,
|
||||
tools: this.tools,
|
||||
prompt: systemMessage as SystemMessage,
|
||||
prompt: systemMessage,
|
||||
});
|
||||
|
||||
const result = await agent.invoke(
|
||||
{ messages: [humanMessage] },
|
||||
{ recursionLimit: 20 }
|
||||
{ recursionLimit: 40 }
|
||||
);
|
||||
|
||||
// The final message in the graph output is the agent's last AIMessage
|
||||
@@ -146,6 +205,109 @@ export class ResearchSubagent extends BaseSubagent {
|
||||
return this.lastImages;
|
||||
}
|
||||
|
||||
/**
|
||||
* Stream typed HarnessEvents using LangGraph's agent.stream().
|
||||
* Emits subagent_tool_call when tools fire, subagent_chunk for the final AI response.
|
||||
* Returns the final text string as the generator return value.
|
||||
*/
|
||||
async *streamEvents(context: SubagentContext, instruction: string, signal?: AbortSignal): AsyncGenerator<HarnessEvent, string> {
|
||||
this.logger.info({ subagent: this.getName() }, 'streamEvents starting');
|
||||
|
||||
if (!this.hasMCPClient()) {
|
||||
throw new Error('MCP client not available for research subagent');
|
||||
}
|
||||
|
||||
this.imageCapture.length = 0;
|
||||
this.lastImages = [];
|
||||
|
||||
// Emit immediately so the UI shows the subagent has started — LLM generation
|
||||
// can take minutes with non-streaming models and nothing else reaches the UI until
|
||||
// the first `updates` event fires (after the LLM finishes its first response).
|
||||
yield { type: 'subagent_tool_call', agentName: this.config.name, toolName: 'Thinking...', label: 'Thinking...' };
|
||||
|
||||
const customIndicatorsSection = await this.fetchCustomIndicatorsSection();
|
||||
|
||||
const initialMessages = this.buildMessages(context, instruction);
|
||||
let systemMessage = initialMessages[0] as SystemMessage;
|
||||
if (customIndicatorsSection) {
|
||||
const base = typeof systemMessage.content === 'string' ? systemMessage.content : JSON.stringify(systemMessage.content);
|
||||
systemMessage = new SystemMessage(base + customIndicatorsSection);
|
||||
}
|
||||
const humanMessage = initialMessages[initialMessages.length - 1];
|
||||
|
||||
const agent = createReactAgent({
|
||||
llm: this.model,
|
||||
tools: this.tools,
|
||||
prompt: systemMessage,
|
||||
});
|
||||
|
||||
this.logger.debug(
|
||||
{ toolCount: this.tools.length, toolNames: this.tools.map(t => t.name) },
|
||||
'Research subagent: starting stream with tools'
|
||||
);
|
||||
|
||||
const systemChars = typeof systemMessage.content === 'string'
|
||||
? systemMessage.content.length
|
||||
: JSON.stringify(systemMessage.content).length;
|
||||
const humanChars = typeof humanMessage.content === 'string'
|
||||
? humanMessage.content.length
|
||||
: JSON.stringify(humanMessage.content).length;
|
||||
this.logger.info(
|
||||
{ systemChars, humanChars, approxInputKB: Math.round((systemChars + humanChars) / 1024) },
|
||||
'Research subagent: input context size'
|
||||
);
|
||||
|
||||
const stream = agent.stream(
|
||||
{ messages: [humanMessage] },
|
||||
{ streamMode: ['messages', 'updates'], recursionLimit: 40, signal }
|
||||
);
|
||||
|
||||
let finalText = '';
|
||||
let updateCount = 0;
|
||||
|
||||
for await (const [mode, data] of await stream) {
|
||||
if (signal?.aborted) break;
|
||||
if (mode === 'messages') {
|
||||
// Real-time token streaming from the LLM — data is [BaseMessage, metadata]
|
||||
for (const chunk of ResearchSubagent.extractStreamChunks(data, this.config.name)) {
|
||||
yield chunk;
|
||||
}
|
||||
} else if (mode === 'updates') {
|
||||
updateCount++;
|
||||
const updateKeys = Object.keys(data as any);
|
||||
this.logger.debug({ updateCount, updateKeys }, 'Research subagent: graph update');
|
||||
// Agent node fired — yield tool call decisions before tools run
|
||||
if ((data as any).agent?.messages) {
|
||||
for (const msg of (data as any).agent.messages as any[]) {
|
||||
if (msg.tool_calls?.length) {
|
||||
for (const tc of msg.tool_calls) {
|
||||
yield { type: 'subagent_tool_call', agentName: this.config.name, toolName: tc.name, label: tc.name };
|
||||
}
|
||||
} else {
|
||||
// Capture final text for return value (already streamed via messages above)
|
||||
const content = ResearchSubagent.extractFinalText(msg);
|
||||
if (content) finalText = content;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
this.lastImages = [...this.imageCapture];
|
||||
if (!finalText) {
|
||||
this.logger.warn(
|
||||
{ imageCount: this.lastImages.length },
|
||||
'Research subagent: model returned empty output'
|
||||
);
|
||||
} else {
|
||||
this.logger.info(
|
||||
{ textLength: finalText.length, imageCount: this.lastImages.length },
|
||||
'streamEvents finished'
|
||||
);
|
||||
}
|
||||
return finalText;
|
||||
}
|
||||
|
||||
/**
|
||||
* Stream research execution
|
||||
*/
|
||||
|
||||
@@ -421,6 +421,7 @@ For research scripts, import and use get_api() to access the API:
|
||||
"""
|
||||
|
||||
import logging
|
||||
import threading
|
||||
from typing import Optional
|
||||
|
||||
from dexorder.api.api import API
|
||||
@@ -432,10 +433,13 @@ log = logging.getLogger(__name__)
|
||||
# Global API instance - managed by main.py
|
||||
_global_api: Optional[API] = None
|
||||
|
||||
# Thread-local API — used by harness threads so they don't overwrite the global
|
||||
_thread_local = threading.local()
|
||||
|
||||
|
||||
def get_api() -> API:
|
||||
"""
|
||||
Get the global API instance for accessing market data and charts.
|
||||
Get the API instance for accessing market data and charts.
|
||||
|
||||
Use this in research scripts to access the data and charting APIs.
|
||||
|
||||
@@ -462,15 +466,27 @@ def get_api() -> API:
|
||||
# Create chart
|
||||
fig, ax = api.charting.plot_ohlc(df, title="BTC/USDT")
|
||||
"""
|
||||
# Thread-local takes priority (set by harness threads)
|
||||
api = getattr(_thread_local, 'api', None)
|
||||
if api is not None:
|
||||
return api
|
||||
if _global_api is None:
|
||||
raise RuntimeError("API not initialized")
|
||||
return _global_api
|
||||
|
||||
|
||||
def set_api(api: API) -> None:
|
||||
"""Set the global API instance. Internal use only."""
|
||||
global _global_api
|
||||
_global_api = api
|
||||
"""Set the API instance.
|
||||
|
||||
When called from the main thread, sets the global API used by all threads.
|
||||
When called from a non-main thread (e.g. harness threads), sets a thread-local
|
||||
API so the global is not overwritten.
|
||||
"""
|
||||
if threading.current_thread() is threading.main_thread():
|
||||
global _global_api
|
||||
_global_api = api
|
||||
else:
|
||||
_thread_local.api = api
|
||||
|
||||
|
||||
__all__ = ['API', 'ChartingAPI', 'DataAPI', 'get_api', 'set_api']
|
||||
|
||||
@@ -28,11 +28,12 @@ from datetime import datetime
|
||||
api = get_api()
|
||||
|
||||
# Method 1: Using Unix timestamps (seconds)
|
||||
# 1609459200 = 2021-01-01, 1735689600 = 2025-01-01
|
||||
df = asyncio.run(api.data.historical_ohlc(
|
||||
ticker="BTC/USDT.BINANCE",
|
||||
period_seconds=3600, # 1 hour candles
|
||||
start_time=1640000000, # Unix timestamp in seconds
|
||||
end_time=1640086400,
|
||||
start_time=1609459200, # 2021-01-01
|
||||
end_time=1735689600, # 2025-01-01 (~4 years, ~35,000 bars)
|
||||
extra_columns=["volume"]
|
||||
))
|
||||
|
||||
@@ -40,8 +41,8 @@ df = asyncio.run(api.data.historical_ohlc(
|
||||
df = asyncio.run(api.data.historical_ohlc(
|
||||
ticker="BTC/USDT.BINANCE",
|
||||
period_seconds=3600,
|
||||
start_time="2021-12-20", # Simple date string
|
||||
end_time="2021-12-21",
|
||||
start_time="2021-01-01",
|
||||
end_time="2025-01-01", # ~4 years of 1h bars ≈ 35,000 bars
|
||||
extra_columns=["volume"]
|
||||
))
|
||||
|
||||
@@ -49,21 +50,24 @@ df = asyncio.run(api.data.historical_ohlc(
|
||||
df = asyncio.run(api.data.historical_ohlc(
|
||||
ticker="BTC/USDT.BINANCE",
|
||||
period_seconds=3600,
|
||||
start_time="2021-12-20 00:00:00",
|
||||
end_time="2021-12-20 23:59:59",
|
||||
start_time="2021-01-01 00:00:00",
|
||||
end_time="2025-01-01 00:00:00",
|
||||
extra_columns=["volume"]
|
||||
))
|
||||
|
||||
# Method 4: Using datetime objects
|
||||
from datetime import datetime, timedelta
|
||||
end_time = datetime.now()
|
||||
start_time = end_time - timedelta(days=4*365) # 4 years back
|
||||
df = asyncio.run(api.data.historical_ohlc(
|
||||
ticker="BTC/USDT.BINANCE",
|
||||
period_seconds=3600,
|
||||
start_time=datetime(2021, 12, 20),
|
||||
end_time=datetime(2021, 12, 21),
|
||||
start_time=start_time,
|
||||
end_time=end_time,
|
||||
extra_columns=["volume"]
|
||||
))
|
||||
|
||||
print(f"Loaded {len(df)} candles")
|
||||
print(f"Loaded {len(df)} candles from {df.index[0]} to {df.index[-1]}")
|
||||
print(df.head())
|
||||
```
|
||||
|
||||
@@ -94,8 +98,8 @@ api = get_api()
|
||||
df = asyncio.run(api.data.historical_ohlc(
|
||||
ticker="BTC/USDT.BINANCE",
|
||||
period_seconds=3600,
|
||||
start_time="2021-12-20",
|
||||
end_time="2021-12-21",
|
||||
start_time="2021-01-01",
|
||||
end_time="2025-01-01", # ~4 years of 1h bars
|
||||
extra_columns=["volume"]
|
||||
))
|
||||
|
||||
@@ -125,8 +129,8 @@ api = get_api()
|
||||
df = asyncio.run(api.data.historical_ohlc(
|
||||
ticker="BTC/USDT.BINANCE",
|
||||
period_seconds=3600,
|
||||
start_time="2021-12-20",
|
||||
end_time="2021-12-21"
|
||||
start_time="2021-01-01",
|
||||
end_time="2025-01-01"
|
||||
))
|
||||
|
||||
# Calculate indicators using pandas-ta
|
||||
@@ -190,14 +194,19 @@ import pandas_ta as ta
|
||||
# Get API instance
|
||||
api = get_api()
|
||||
|
||||
# Fetch historical data using date strings (easiest for research)
|
||||
# Fetch historical data — use max history for research (target 100k-200k bars)
|
||||
from datetime import datetime, timedelta
|
||||
end_time = datetime.now()
|
||||
start_time = end_time - timedelta(days=3*365) # 3 years of 1h bars ≈ 26,000 bars
|
||||
|
||||
df = asyncio.run(api.data.historical_ohlc(
|
||||
ticker="BTC/USDT.BINANCE",
|
||||
period_seconds=3600, # 1 hour
|
||||
start_time="2021-12-20",
|
||||
end_time="2021-12-21",
|
||||
start_time=start_time,
|
||||
end_time=end_time,
|
||||
extra_columns=["volume"]
|
||||
))
|
||||
print(f"[Data] {len(df)} bars | {df.index[0]} → {df.index[-1]} | period=3600s")
|
||||
|
||||
# Add moving averages using pandas-ta
|
||||
df['sma_20'] = ta.sma(df['close'], length=20)
|
||||
@@ -218,7 +227,7 @@ ax.plot(range(len(df)), df['ema_50'], label="EMA 50", color="red", linewidth=1.5
|
||||
ax.legend()
|
||||
|
||||
# Print summary statistics
|
||||
print(f"Period: {len(df)} candles")
|
||||
print(f"[Data] {len(df)} bars | {df.index[0]} → {df.index[-1]} | period=3600s")
|
||||
print(f"High: {df['high'].max()}")
|
||||
print(f"Low: {df['low'].min()}")
|
||||
print(f"Mean Volume: {df['volume'].mean():.2f}")
|
||||
|
||||
@@ -10,6 +10,33 @@ Create Python scripts that:
|
||||
- Generate professional charts using matplotlib via the ChartingAPI
|
||||
- All matplotlib figures are automatically captured and sent to the user as images
|
||||
|
||||
## Data Selection: Resolution and Time Window
|
||||
|
||||
> **Rule**: Every research script must fetch the maximum useful history — target 100,000–200,000 bars, hard cap at 5 years. **Never** use short windows like "last 7 days" or "last 60 days" unless the user explicitly requests a specific recent period.
|
||||
|
||||
Choose the **coarsest** resolution that still captures the effect being studied:
|
||||
|
||||
| Phenomenon | Appropriate resolution |
|
||||
|---|---|
|
||||
| Intraday session opens/overlaps, hourly patterns | 15m (900s) |
|
||||
| Short-term momentum, 5–30 min microstructure | 5m (300s) |
|
||||
| Daily-level patterns (day-of-week, open/close effects) | 1h (3600s) |
|
||||
| Multi-day / weekly effects | 4h (14400s) |
|
||||
| Monthly / macro effects | 1d (86400s) |
|
||||
|
||||
Finer resolution than necessary adds noise and reduces statistical power. A session-open effect that plays out over 30–60 minutes is fully visible on 15m bars.
|
||||
|
||||
Quick reference — approximate bars per resolution at various windows:
|
||||
|
||||
| Resolution | 1 year | 2 years | 5 years (max) |
|
||||
|---|---|---|---|
|
||||
| 5m | ~105,000 ✓ | ~210,000 → cap at ~1yr | ~525,000 → cap at ~1yr |
|
||||
| 15m | ~35,000 | ~70,000 | ~175,000 ✓ |
|
||||
| 1h | ~8,760 | ~17,520 | ~43,800 |
|
||||
| 4h | ~2,190 | ~4,380 | ~10,950 |
|
||||
|
||||
**When to shorten the window**: only if 5 years at the chosen resolution would far exceed 200,000 bars (e.g., 5m over 5 years ≈ 525k → shorten to ~2 years). Otherwise always use the full 5 years.
|
||||
|
||||
## Available Tools
|
||||
|
||||
You have direct access to these MCP tools:
|
||||
@@ -17,13 +44,15 @@ You have direct access to these MCP tools:
|
||||
- **python_write**: Create a new script (research, strategy, or indicator category)
|
||||
- Required: category, name, description, code
|
||||
- Optional: metadata (category-specific fields — see below)
|
||||
- For research: automatically executes the script after writing
|
||||
- Returns validation results and execution output (text + images)
|
||||
- **For research**: fully executes the script and returns all output (stdout, stderr) and captured chart images. The response IS the execution result — **do not call `execute_research` afterward**.
|
||||
- **For indicator/strategy**: runs against synthetic test data to catch compile/runtime errors; no chart images are generated.
|
||||
- Returns validation results and execution output (text + images for research)
|
||||
|
||||
- **python_edit**: Update an existing script
|
||||
- Required: category, name
|
||||
- Optional: code, description, metadata
|
||||
- For research: automatically re-executes if code is updated
|
||||
- **For research**: re-executes the script when code is changed and returns all output and images. **Do not call `execute_research` afterward**.
|
||||
- **For indicator/strategy**: re-runs the validation test only.
|
||||
- Returns validation results and execution output
|
||||
|
||||
- **python_read**: Read an existing research script
|
||||
@@ -32,8 +61,9 @@ You have direct access to these MCP tools:
|
||||
- **python_list**: List all research scripts
|
||||
- Returns: array of {name, description, metadata}
|
||||
|
||||
- **execute_research**: Manually run a research script
|
||||
- Note: Usually not needed since write/edit auto-execute
|
||||
- **execute_research**: Run a research script that already exists on disk
|
||||
- Use this **only** when the user explicitly asks to re-run a script, or to run a script that was written in a previous session and already exists
|
||||
- **Do not call this after `python_write` or `python_edit`** — those tools already executed the script and returned its output
|
||||
- Returns: text output and images
|
||||
|
||||
## Research Script API
|
||||
@@ -55,180 +85,8 @@ See your knowledge base for complete API documentation, examples, and the full p
|
||||
|
||||
## Technical Indicators — pandas-ta
|
||||
|
||||
The sandbox environment uses **pandas-ta** as the standard indicator library. Always use it for technical indicator calculations; do not write manual rolling/ewm implementations.
|
||||
Use `import pandas_ta as ta` for all indicator calculations. Never write manual rolling/ewm implementations. The full indicator catalog, calling conventions, column naming patterns, and default parameters are in `pandas-ta-reference.md` in your knowledge base.
|
||||
|
||||
```python
|
||||
import pandas_ta as ta
|
||||
```
|
||||
|
||||
### Calling Convention
|
||||
|
||||
pandas-ta functions accept a Series (or OHLCV columns) plus keyword parameters that match pandas-ta's documented argument names:
|
||||
|
||||
```python
|
||||
# Single-series indicator
|
||||
rsi = ta.rsi(df['close'], length=14) # returns Series
|
||||
|
||||
# OHLCV indicator
|
||||
atr = ta.atr(df['high'], df['low'], df['close'], length=14)
|
||||
|
||||
# Multi-output indicator (returns DataFrame)
|
||||
macd_df = ta.macd(df['close'], fast=12, slow=26, signal=9)
|
||||
# columns: MACD_12_26_9, MACDh_12_26_9, MACDs_12_26_9
|
||||
|
||||
bbands_df = ta.bbands(df['close'], length=20, std=2.0)
|
||||
# columns: BBL_20_2.0, BBM_20_2.0, BBU_20_2.0, BBB_20_2.0, BBP_20_2.0
|
||||
```
|
||||
|
||||
### Available Indicators (canonical list)
|
||||
|
||||
These match the indicators supported by the TradingView web client. Use the pandas-ta function name shown here (lowercase):
|
||||
|
||||
**Overlap / Moving Averages** — plotted on the price pane
|
||||
|
||||
| Function | Description |
|
||||
|----------|-------------|
|
||||
| `sma` | Simple Moving Average — plain arithmetic mean over `length` periods |
|
||||
| `ema` | Exponential Moving Average — more weight on recent prices |
|
||||
| `wma` | Weighted Moving Average — linearly increasing weights |
|
||||
| `dema` | Double EMA — two layers of EMA to reduce lag |
|
||||
| `tema` | Triple EMA — three layers of EMA, even less lag than DEMA |
|
||||
| `trima` | Triangular MA — double-smoothed SMA, very smooth |
|
||||
| `kama` | Kaufman Adaptive MA — adapts speed to market noise/trending conditions |
|
||||
| `t3` | T3 Moving Average — Tillson's smooth, low-lag MA using six EMAs |
|
||||
| `hma` | Hull MA — very low-lag MA using WMAs |
|
||||
| `alma` | Arnaud Legoux MA — Gaussian-weighted MA with reduced lag and noise |
|
||||
| `midpoint` | Midpoint of close over `length` periods: (highest + lowest) / 2 |
|
||||
| `midprice` | Midpoint of high/low over `length` periods |
|
||||
| `supertrend` | Trend-following band (ATR-based) that flips above/below price |
|
||||
| `ichimoku` | Ichimoku Cloud — multi-line Japanese trend/support/resistance system |
|
||||
| `vwap` | Volume-Weighted Average Price — average price weighted by volume, resets on `anchor` |
|
||||
| `vwma` | Volume-Weighted MA — like SMA but candles weighted by volume |
|
||||
| `bbands` | Bollinger Bands — SMA ± N standard deviations; returns upper, mid, lower bands |
|
||||
|
||||
**Momentum** — typically plotted in a separate pane
|
||||
|
||||
| Function | Description |
|
||||
|----------|-------------|
|
||||
| `rsi` | Relative Strength Index — 0–100 oscillator measuring speed of price changes |
|
||||
| `macd` | MACD — difference of two EMAs plus signal line and histogram |
|
||||
| `stoch` | Stochastic Oscillator — %K/%D, measures close vs recent high/low range |
|
||||
| `stochrsi` | Stochastic RSI — applies stochastic formula to RSI values |
|
||||
| `cci` | Commodity Channel Index — deviation of price from its statistical mean |
|
||||
| `willr` | Williams %R — inverse stochastic, −100 to 0 oscillator |
|
||||
| `mom` | Momentum — raw price change over `length` periods |
|
||||
| `roc` | Rate of Change — percentage price change over `length` periods |
|
||||
| `trix` | TRIX — 1-period % change of a triple-smoothed EMA |
|
||||
| `cmo` | Chande Momentum Oscillator — ratio of up/down momentum, −100 to 100 |
|
||||
| `adx` | Average Directional Index — strength of trend (0–100, direction-agnostic) |
|
||||
| `aroon` | Aroon — measures how recently the highest/lowest price occurred; returns Up, Down, Oscillator |
|
||||
| `ao` | Awesome Oscillator — difference of 5- and 34-period simple MAs of midprice |
|
||||
| `bop` | Balance of Power — measures buying vs selling pressure: (close−open)/(high−low) |
|
||||
| `uo` | Ultimate Oscillator — weighted combo of three period (fast/medium/slow) buying pressure ratios |
|
||||
| `apo` | Absolute Price Oscillator — difference between two EMAs (like MACD without signal line) |
|
||||
| `mfi` | Money Flow Index — RSI-like oscillator using price × volume |
|
||||
| `coppock` | Coppock Curve — long-term momentum oscillator based on rate-of-change |
|
||||
| `dpo` | Detrended Price Oscillator — removes trend to show cycle oscillations |
|
||||
| `fisher` | Fisher Transform — converts price into a Gaussian normal distribution |
|
||||
| `rvgi` | Relative Vigor Index — compares close−open to high−low to measure trend vigor |
|
||||
| `kst` | Know Sure Thing — momentum oscillator from four ROC periods, smoothed |
|
||||
|
||||
**Volatility** — plotted on price pane or separate
|
||||
|
||||
| Function | Description |
|
||||
|----------|-------------|
|
||||
| `atr` | Average True Range — average of true range (greatest of H−L, H−prevC, L−prevC) |
|
||||
| `kc` | Keltner Channels — EMA ± N × ATR bands around price |
|
||||
| `donchian` | Donchian Channels — highest high / lowest low over `length` periods |
|
||||
|
||||
**Volume** — plotted in separate pane
|
||||
|
||||
| Function | Description |
|
||||
|----------|-------------|
|
||||
| `obv` | On Balance Volume — cumulative volume, added on up days, subtracted on down days |
|
||||
| `ad` | Accumulation/Distribution — running total of the money flow multiplier × volume |
|
||||
| `adosc` | Chaikin Oscillator — EMA difference of the A/D line |
|
||||
| `cmf` | Chaikin Money Flow — sum of (money flow volume) / sum of volume over `length` |
|
||||
| `eom` | Ease of Movement — relates price change to volume; high = price moves easily |
|
||||
| `efi` | Elder's Force Index — combines price change direction with volume magnitude |
|
||||
| `kvo` | Klinger Volume Oscillator — EMA difference of volume force |
|
||||
| `pvt` | Price Volume Trend — cumulative: volume × percentage price change |
|
||||
|
||||
**Statistics / Price Transforms**
|
||||
|
||||
| Function | Description |
|
||||
|----------|-------------|
|
||||
| `stdev` | Standard Deviation of close over `length` periods |
|
||||
| `linreg` | Linear Regression Curve — least-squares line endpoint value over `length` periods |
|
||||
| `slope` | Linear Regression Slope — gradient of the regression line |
|
||||
| `hl2` | Median Price — (high + low) / 2 |
|
||||
| `hlc3` | Typical Price — (high + low + close) / 3 |
|
||||
| `ohlc4` | Average Price — (open + high + low + close) / 4 |
|
||||
|
||||
**Trend**
|
||||
|
||||
| Function | Description |
|
||||
|----------|-------------|
|
||||
| `psar` | Parabolic SAR — trailing stop-and-reverse dots that follow price |
|
||||
| `vortex` | Vortex Indicator — VI+ / VI− lines measuring upward vs downward trend movement |
|
||||
| `chop` | Choppiness Index — 0–100, high = choppy/sideways, low = strong trend |
|
||||
|
||||
### Default Parameters
|
||||
|
||||
Key defaults to keep in mind:
|
||||
- Most period/length indicators: `length=14` (use `length=` not `timeperiod=`)
|
||||
- `bbands`: `length=20, std=2.0` (note: single `std`, not separate upper/lower)
|
||||
- `macd`: `fast=12, slow=26, signal=9`
|
||||
- `stoch`: `k=14, d=3, smooth_k=3`
|
||||
- `psar`: `af0=0.02, af=0.02, max_af=0.2`
|
||||
- `vwap`: `anchor='D'` (requires DatetimeIndex)
|
||||
- `ichimoku`: `tenkan=9, kijun=26, senkou=52`
|
||||
|
||||
For multi-output indicator column extraction patterns and complete charting examples, fetch `pandas-ta-reference.md` from your knowledge base.
|
||||
|
||||
## Strategy Metadata Format
|
||||
|
||||
When writing or editing a strategy (`category="strategy"`), always include a `metadata` object with:
|
||||
|
||||
- **`data_feeds`** — list of feed descriptors the strategy requires:
|
||||
```json
|
||||
[
|
||||
{"symbol": "BTC/USDT.BINANCE", "period_seconds": 3600, "description": "Primary BTC/USDT hourly feed"},
|
||||
{"symbol": "ETH/USDT.BINANCE", "period_seconds": 3600, "description": "ETH/USDT hourly for correlation"}
|
||||
]
|
||||
```
|
||||
`period_seconds` must match what the strategy code expects. Use the same values when calling `backtest_strategy`.
|
||||
|
||||
- **`parameters`** — object documenting every configurable parameter in the strategy:
|
||||
```json
|
||||
{
|
||||
"rsi_length": {"default": 14, "description": "RSI lookback period in bars"},
|
||||
"overbought": {"default": 70, "description": "RSI level above which position is closed"},
|
||||
"oversold": {"default": 30, "description": "RSI level below which long entry is triggered"},
|
||||
"stop_pct": {"default": 0.02, "description": "Stop-loss as a fraction of entry price (e.g. 0.02 = 2%)"}
|
||||
}
|
||||
```
|
||||
Include every parameter that appears as a constant in the strategy's `__init__` or class body — use the actual default values from the code.
|
||||
|
||||
Example `python_write` call for a strategy:
|
||||
```json
|
||||
{
|
||||
"category": "strategy",
|
||||
"name": "RSI Mean Reversion",
|
||||
"description": "Long when RSI crosses above oversold; exit when overbought or stop hit",
|
||||
"code": "...",
|
||||
"metadata": {
|
||||
"data_feeds": [
|
||||
{"symbol": "BTC/USDT.BINANCE", "period_seconds": 3600, "description": "BTC/USDT hourly OHLCV + order flow"}
|
||||
],
|
||||
"parameters": {
|
||||
"rsi_length": {"default": 14, "description": "RSI lookback period"},
|
||||
"overbought": {"default": 70, "description": "Exit long above this RSI level"},
|
||||
"oversold": {"default": 30, "description": "Enter long below this RSI level"}
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Coding Loop Pattern
|
||||
|
||||
@@ -244,11 +102,11 @@ When a user requests analysis:
|
||||
- Use appropriate ticker symbols, time ranges, and periods
|
||||
- The script will auto-execute after writing
|
||||
|
||||
4. **Check execution results**: The tool returns:
|
||||
- `validation.success`: Whether script ran without errors
|
||||
- `validation.output`: Any stdout/stderr text output
|
||||
- `execution.content`: Array of text and image results
|
||||
- Note: Images are NOT included in your context - only text output is visible to you
|
||||
4. **Check execution results**: The tool returns the execution result directly — this is the script's actual output:
|
||||
- `success`: Whether the script ran without errors
|
||||
- Text output from stdout/stderr is visible to you
|
||||
- Chart images are captured and sent to the user (you cannot see them)
|
||||
- **Do NOT call `execute_research` after this step** — the script has already run and the results are in the response above
|
||||
|
||||
5. **Iterate if needed**: If there are errors:
|
||||
- Read the error message from the output returned by the tool (the execution result described in step 4)
|
||||
@@ -259,8 +117,28 @@ When a user requests analysis:
|
||||
- The user will receive both your text response AND the chart images
|
||||
- Don't try to describe the images in detail - the user can see them
|
||||
|
||||
## Ticker Format
|
||||
|
||||
All tickers passed to `api.data.historical_ohlc()` and other data methods **must** use the `SYMBOL.EXCHANGE` format, e.g.:
|
||||
|
||||
- `BTC/USDT.BINANCE`
|
||||
- `ETH/USDT.BINANCE`
|
||||
- `SOL/USDT.BINANCE`
|
||||
|
||||
**Never** use bare exchange-style tickers like `BTCUSDT`, `ETHUSDT`, or `BTCUSD` — these will fail with a format error.
|
||||
|
||||
If the instruction you receive includes a ticker in an incorrect format (e.g., `ETHUSDT`), convert it to the proper format (`ETH/USDT.BINANCE`) before writing the script. When in doubt about which exchange to use, default to `BINANCE`.
|
||||
|
||||
If you're unsure whether a given symbol exists or what its correct name is, print a clear error message from the script and ask the user to use the `symbol_lookup` tool at the top-level to find the correct ticker.
|
||||
|
||||
## Important Guidelines
|
||||
|
||||
- **Always print data stats after fetching**: Immediately after every `historical_ohlc` call, print the bar count and date range so it appears in the output:
|
||||
```python
|
||||
print(f"[Data] {len(df)} bars | {df.index[0]} → {df.index[-1]} | period={period_seconds}s")
|
||||
```
|
||||
This confirms the data window to both you and the user.
|
||||
|
||||
- **Images are pass-through only**: Chart images go directly to the user. You only see text output (print statements, errors). Don't try to analyze or describe images you can't see.
|
||||
|
||||
- **Async data fetching**: All `api.data` methods are async. Always use `asyncio.run()`:
|
||||
@@ -268,15 +146,6 @@ When a user requests analysis:
|
||||
df = asyncio.run(api.data.historical_ohlc(...))
|
||||
```
|
||||
|
||||
- **Charting is sync**: All `api.charting` methods are synchronous:
|
||||
```python
|
||||
fig, ax = api.charting.plot_ohlc(df, title="BTC/USDT")
|
||||
```
|
||||
|
||||
- **Automatic figure capture**: All matplotlib figures are automatically captured. Don't save manually.
|
||||
|
||||
- **Print for debugging**: Use `print()` statements for debugging - you'll see this output.
|
||||
|
||||
- **Package management**: If script needs packages beyond base environment (pandas, numpy, matplotlib):
|
||||
- Add `conda_packages: ["package-name"]` to metadata
|
||||
- Packages are auto-installed during validation
|
||||
@@ -287,16 +156,18 @@ When a user requests analysis:
|
||||
|
||||
## Example Workflow
|
||||
|
||||
User: "Show me BTC price action for the last 7 days with volume"
|
||||
User: "Show me BTC/ETH price correlation over time"
|
||||
|
||||
You:
|
||||
1. Call `python_write` with:
|
||||
- name: "BTC 7-Day Price Action"
|
||||
- description: "BTC/USDT price and volume analysis for the last 7 days"
|
||||
- code: (Python script that fetches data and creates chart)
|
||||
2. Check execution results
|
||||
3. If successful, respond: "I've created a 7-day BTC price chart with volume analysis. The chart shows [brief summary of what the script does]."
|
||||
4. User receives: Your text response + the actual chart image
|
||||
1. Identify timescale: daily return correlation → 1h bars are sufficient
|
||||
2. Compute window: 1h bars × 5 years ≈ 43,800 bars (under 100k, but 5yr is the hard max — use it)
|
||||
3. Call `python_write` with:
|
||||
- name: "BTC ETH Price Correlation"
|
||||
- description: "Rolling correlation of BTC/USDT and ETH/USDT daily returns using 5 years of 1h data"
|
||||
- code: (Python script fetching 5yr of 1h OHLC for both tickers and plotting rolling correlation)
|
||||
4. Check execution results
|
||||
5. If successful, respond with a brief summary of what the script does
|
||||
6. User receives: Your text response + the chart image
|
||||
|
||||
## Response Format
|
||||
|
||||
|
||||
37
gateway/src/harness/subagents/strategy/config.yaml
Normal file
37
gateway/src/harness/subagents/strategy/config.yaml
Normal file
@@ -0,0 +1,37 @@
|
||||
name: strategy
|
||||
description: Writes and manages PandasStrategy classes, runs backtests, and manages strategy activation
|
||||
|
||||
# Model configuration
|
||||
model: claude-sonnet-4-6
|
||||
temperature: 0.3
|
||||
maxTokens: 16384
|
||||
|
||||
# Memory files loaded from memory/ directory
|
||||
memoryFiles: []
|
||||
|
||||
# System prompt
|
||||
systemPromptFile: system-prompt.md
|
||||
|
||||
# Capabilities
|
||||
capabilities:
|
||||
- strategy_writing
|
||||
- backtesting
|
||||
- strategy_lifecycle
|
||||
|
||||
# Tools available to this subagent
|
||||
tools:
|
||||
platform: []
|
||||
mcp:
|
||||
- python_write
|
||||
- python_edit
|
||||
- python_read
|
||||
- python_list
|
||||
- python_log
|
||||
- python_revert
|
||||
- backtest_strategy
|
||||
- activate_strategy
|
||||
- deactivate_strategy
|
||||
- list_active_strategies
|
||||
- get_backtest_results
|
||||
- get_strategy_trades
|
||||
- get_strategy_events
|
||||
159
gateway/src/harness/subagents/strategy/index.ts
Normal file
159
gateway/src/harness/subagents/strategy/index.ts
Normal file
@@ -0,0 +1,159 @@
|
||||
import { BaseSubagent, type SubagentConfig, type SubagentContext } from '../base-subagent.js';
|
||||
import type { BaseChatModel } from '@langchain/core/language_models/chat_models';
|
||||
import { SystemMessage } from '@langchain/core/messages';
|
||||
import { createReactAgent } from '@langchain/langgraph/prebuilt';
|
||||
import type { FastifyBaseLogger } from 'fastify';
|
||||
import type { MCPClientConnector } from '../../mcp-client.js';
|
||||
import type { HarnessEvent } from '../../harness-events.js';
|
||||
|
||||
/**
|
||||
* Strategy Subagent
|
||||
*
|
||||
* Specialized agent for writing PandasStrategy classes, running backtests,
|
||||
* and managing strategy activation/deactivation.
|
||||
*
|
||||
* Mirrors the pattern of IndicatorSubagent in indicator/index.ts.
|
||||
*/
|
||||
export class StrategySubagent extends BaseSubagent {
|
||||
constructor(
|
||||
config: SubagentConfig,
|
||||
model: BaseChatModel,
|
||||
logger: FastifyBaseLogger,
|
||||
mcpClient?: MCPClientConnector,
|
||||
tools?: any[]
|
||||
) {
|
||||
super(config, model, logger, mcpClient, tools);
|
||||
}
|
||||
|
||||
/**
|
||||
* Execute a strategy request using LangGraph's createReactAgent.
|
||||
*/
|
||||
async execute(context: SubagentContext, instruction: string): Promise<string> {
|
||||
this.logger.info(
|
||||
{
|
||||
subagent: this.getName(),
|
||||
userId: context.userContext.userId,
|
||||
instruction: instruction.substring(0, 200),
|
||||
toolCount: this.tools.length,
|
||||
toolNames: this.tools.map(t => t.name),
|
||||
},
|
||||
'Strategy subagent starting'
|
||||
);
|
||||
|
||||
if (!this.hasMCPClient()) {
|
||||
throw new Error('MCP client not available for strategy subagent');
|
||||
}
|
||||
|
||||
if (this.tools.length === 0) {
|
||||
this.logger.warn('Strategy subagent has no tools');
|
||||
}
|
||||
|
||||
const initialMessages = this.buildMessages(context, instruction);
|
||||
const systemMessage = initialMessages[0];
|
||||
const humanMessage = initialMessages[initialMessages.length - 1];
|
||||
|
||||
const agent = createReactAgent({
|
||||
llm: this.model,
|
||||
tools: this.tools,
|
||||
prompt: systemMessage as SystemMessage,
|
||||
});
|
||||
|
||||
const result = await agent.invoke(
|
||||
{ messages: [humanMessage] },
|
||||
{ recursionLimit: 30 }
|
||||
);
|
||||
|
||||
const allMessages: any[] = result.messages ?? [];
|
||||
|
||||
this.logger.info(
|
||||
{ messageCount: allMessages.length },
|
||||
'Strategy subagent graph completed'
|
||||
);
|
||||
|
||||
const lastAI = [...allMessages].reverse().find(
|
||||
(m: any) => m.constructor?.name === 'AIMessage' || m._getType?.() === 'ai'
|
||||
);
|
||||
|
||||
const finalText = lastAI
|
||||
? (typeof lastAI.content === 'string' ? lastAI.content : JSON.stringify(lastAI.content))
|
||||
: 'Strategy task completed.';
|
||||
|
||||
this.logger.info({ textLength: finalText.length }, 'Strategy subagent finished');
|
||||
|
||||
return finalText;
|
||||
}
|
||||
|
||||
async *streamEvents(context: SubagentContext, instruction: string, signal?: AbortSignal): AsyncGenerator<HarnessEvent, string> {
|
||||
this.logger.info({ subagent: this.getName() }, 'streamEvents starting');
|
||||
|
||||
if (!this.hasMCPClient()) {
|
||||
throw new Error('MCP client not available for strategy subagent');
|
||||
}
|
||||
|
||||
const initialMessages = this.buildMessages(context, instruction);
|
||||
const systemMessage = initialMessages[0];
|
||||
const humanMessage = initialMessages[initialMessages.length - 1];
|
||||
|
||||
const agent = createReactAgent({
|
||||
llm: this.model,
|
||||
tools: this.tools,
|
||||
prompt: systemMessage as SystemMessage,
|
||||
});
|
||||
|
||||
const stream = agent.stream(
|
||||
{ messages: [humanMessage] },
|
||||
{ streamMode: ['messages', 'updates'], recursionLimit: 30, signal }
|
||||
);
|
||||
|
||||
let finalText = '';
|
||||
|
||||
for await (const [mode, data] of await stream) {
|
||||
if (signal?.aborted) break;
|
||||
if (mode === 'messages') {
|
||||
for (const chunk of StrategySubagent.extractStreamChunks(data, this.config.name)) {
|
||||
yield chunk;
|
||||
}
|
||||
} else if (mode === 'updates') {
|
||||
if ((data as any).agent?.messages) {
|
||||
for (const msg of (data as any).agent.messages as any[]) {
|
||||
if (msg.tool_calls?.length) {
|
||||
for (const tc of msg.tool_calls) {
|
||||
yield { type: 'subagent_tool_call', agentName: this.config.name, toolName: tc.name, label: tc.name };
|
||||
}
|
||||
} else {
|
||||
const content = StrategySubagent.extractFinalText(msg);
|
||||
if (content) finalText = content;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
this.logger.info({ textLength: finalText.length }, 'streamEvents finished');
|
||||
return finalText;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Factory function to create and initialize StrategySubagent
|
||||
*/
|
||||
export async function createStrategySubagent(
|
||||
model: BaseChatModel,
|
||||
logger: FastifyBaseLogger,
|
||||
basePath: string,
|
||||
mcpClient?: MCPClientConnector,
|
||||
tools?: any[]
|
||||
): Promise<StrategySubagent> {
|
||||
const { readFile } = await import('fs/promises');
|
||||
const { join } = await import('path');
|
||||
const yaml = await import('js-yaml');
|
||||
|
||||
const configPath = join(basePath, 'config.yaml');
|
||||
const configContent = await readFile(configPath, 'utf-8');
|
||||
const config = yaml.load(configContent) as SubagentConfig;
|
||||
|
||||
const subagent = new StrategySubagent(config, model, logger, mcpClient, tools);
|
||||
await subagent.initialize(basePath);
|
||||
|
||||
return subagent;
|
||||
}
|
||||
357
gateway/src/harness/subagents/strategy/system-prompt.md
Normal file
357
gateway/src/harness/subagents/strategy/system-prompt.md
Normal file
@@ -0,0 +1,357 @@
|
||||
# Strategy Subagent
|
||||
|
||||
You are a specialized assistant for writing, testing, and managing trading strategies on the Dexorder platform. You write `PandasStrategy` subclasses, run backtests, and manage strategy activation.
|
||||
|
||||
---
|
||||
|
||||
## Section A — PandasStrategy API
|
||||
|
||||
All strategies inherit from `PandasStrategy`. Users implement a single method, `evaluate(dfs)`, which is called on every new bar.
|
||||
|
||||
### Class structure
|
||||
|
||||
```python
|
||||
from dexorder.nautilus.pandas_strategy import PandasStrategy, PandasStrategyConfig
|
||||
|
||||
class MyStrategy(PandasStrategy):
|
||||
|
||||
def evaluate(self, dfs: dict[str, pd.DataFrame]) -> None:
|
||||
"""
|
||||
Called after every new bar across all feeds.
|
||||
|
||||
Args:
|
||||
dfs: dict mapping feed_key → pd.DataFrame with columns:
|
||||
timestamp (nanoseconds), open, high, low, close, volume,
|
||||
buy_vol, sell_vol, open_interest
|
||||
Rows accumulate over time — the last row is always the latest bar.
|
||||
"""
|
||||
df = dfs.get("BTC/USDT.BINANCE:300")
|
||||
if df is None or len(df) < 20:
|
||||
return # Not enough data yet
|
||||
|
||||
close = df["close"]
|
||||
# ... compute signals ...
|
||||
|
||||
if buy_signal:
|
||||
self.buy(quantity=0.1)
|
||||
elif sell_signal:
|
||||
self.sell(quantity=0.1)
|
||||
```
|
||||
|
||||
### Feed key format
|
||||
|
||||
Feed keys combine the ticker and period: `"{ticker}:{period_seconds}"`
|
||||
|
||||
Examples:
|
||||
- `"BTC/USDT.BINANCE:300"` — BTC/USDT on Binance, 5-minute bars
|
||||
- `"BTC/USDT.BINANCE:900"` — BTC/USDT on Binance, 15-minute bars
|
||||
- `"BTC/USDT.BINANCE:3600"` — BTC/USDT on Binance, 1-hour bars
|
||||
- `"ETH/USDT.BINANCE:900"` — ETH/USDT on Binance, 15-minute bars
|
||||
|
||||
Access the feed key from metadata: `self.config.feed_keys` is a tuple of all feed keys.
|
||||
|
||||
### Order API
|
||||
|
||||
```python
|
||||
self.buy(quantity: float, feed_key: str = None)
|
||||
self.sell(quantity: float, feed_key: str = None)
|
||||
self.flatten(feed_key: str = None) # Close all open positions
|
||||
```
|
||||
|
||||
If `feed_key` is None, the first feed in `feed_keys` is used.
|
||||
|
||||
`quantity` is in base currency units (e.g. 0.1 BTC). Use `self.config.initial_capital` to size appropriately.
|
||||
|
||||
### Configuration available inside evaluate()
|
||||
|
||||
```python
|
||||
self.config.feed_keys # tuple of feed key strings
|
||||
self.config.initial_capital # starting capital in quote currency
|
||||
```
|
||||
|
||||
### DataFrame columns
|
||||
|
||||
| Column | Type | Description |
|
||||
|--------|------|-------------|
|
||||
| `timestamp` | int64 (ns) | Bar open time in nanoseconds |
|
||||
| `open` | float | Open price |
|
||||
| `high` | float | High price |
|
||||
| `low` | float | Low price |
|
||||
| `close` | float | Close price |
|
||||
| `volume` | float | Total volume |
|
||||
| `buy_vol` | float | Buy-side volume (taker buys) |
|
||||
| `sell_vol` | float | Sell-side volume (taker sells) |
|
||||
| `open_interest` | float | Open interest (futures only; NaN for spot) |
|
||||
|
||||
---
|
||||
|
||||
## Section B — Strategy Metadata
|
||||
|
||||
When writing a strategy with `python_write(category="strategy", ...)`, always provide complete metadata:
|
||||
|
||||
```python
|
||||
python_write(
|
||||
category="strategy",
|
||||
name="RSI Mean Reversion",
|
||||
description="Buy oversold, sell overbought based on RSI(14) on BTC/USDT 1h bars.",
|
||||
code="""...""",
|
||||
metadata={
|
||||
"data_feeds": [
|
||||
{"symbol": "BTC/USDT.BINANCE", "period_seconds": 300, "description": "Primary BTC/USDT 5m feed"}
|
||||
],
|
||||
"parameters": {
|
||||
"rsi_length": {"default": 14, "description": "RSI lookback period"},
|
||||
"oversold": {"default": 30, "description": "RSI oversold threshold"},
|
||||
"overbought": {"default": 70, "description": "RSI overbought threshold"},
|
||||
"trade_qty": {"default": 0.01, "description": "Trade quantity in BTC"}
|
||||
},
|
||||
"conda_packages": []
|
||||
}
|
||||
)
|
||||
```
|
||||
|
||||
### Metadata fields
|
||||
|
||||
| Field | Required | Description |
|
||||
|-------|----------|-------------|
|
||||
| `data_feeds` | yes | List of `{symbol, period_seconds, description}` — one per feed the strategy needs |
|
||||
| `parameters` | yes | Dict of `{param_name: {default, description}}` for user-configurable values |
|
||||
| `conda_packages` | no | Extra Python packages to install |
|
||||
|
||||
---
|
||||
|
||||
## Section C — Custom Indicators in Strategies
|
||||
|
||||
**Prefer using custom indicators defined in the `indicator` category rather than computing signals inline.**
|
||||
|
||||
Benefits:
|
||||
- The indicator appears on the user's chart, making the signal transparent
|
||||
- It can be reused across strategies without copy-pasting
|
||||
- It is tested independently via the indicator harness
|
||||
|
||||
Before writing indicator logic, check if an indicator already exists:
|
||||
```
|
||||
python_list(category="indicator")
|
||||
```
|
||||
|
||||
To use a custom indicator in a strategy:
|
||||
```python
|
||||
import pandas_ta as ta
|
||||
|
||||
def evaluate(self, dfs):
|
||||
df = dfs.get("BTC/USDT.BINANCE:3600")
|
||||
if df is None or len(df) < 20:
|
||||
return
|
||||
|
||||
# Use a custom indicator registered as ta.custom_vw_rsi
|
||||
vw_rsi = ta.custom_vw_rsi(df["close"], df["volume"], length=14)
|
||||
|
||||
if vw_rsi.iloc[-1] < 30:
|
||||
self.buy(0.01)
|
||||
elif vw_rsi.iloc[-1] > 70:
|
||||
self.sell(0.01)
|
||||
```
|
||||
|
||||
Custom indicator names follow the pattern `ta.custom_{sanitized_name}` where the sanitized name is the indicator's name lowercased with spaces replaced by underscores.
|
||||
|
||||
**When a user asks for a strategy that needs a novel signal, first create the indicator, then reference it in the strategy.**
|
||||
|
||||
---
|
||||
|
||||
## Section D — Complete Strategy Examples
|
||||
|
||||
### Example 1: RSI Mean Reversion (simple, single feed)
|
||||
|
||||
```python
|
||||
import pandas as pd
|
||||
import pandas_ta as ta
|
||||
|
||||
class RSIMeanReversion(PandasStrategy):
|
||||
|
||||
def evaluate(self, dfs: dict[str, pd.DataFrame]) -> None:
|
||||
df = dfs.get("BTC/USDT.BINANCE:300")
|
||||
if df is None or len(df) < 30:
|
||||
return
|
||||
|
||||
rsi = ta.rsi(df["close"], length=14)
|
||||
if rsi is None or rsi.isna().all():
|
||||
return
|
||||
|
||||
last_rsi = rsi.iloc[-1]
|
||||
trade_qty = 0.001 * self.config.initial_capital / df["close"].iloc[-1]
|
||||
|
||||
if last_rsi < 30:
|
||||
self.buy(trade_qty)
|
||||
elif last_rsi > 70:
|
||||
self.sell(trade_qty)
|
||||
```
|
||||
|
||||
Metadata:
|
||||
```python
|
||||
{
|
||||
"data_feeds": [{"symbol": "BTC/USDT.BINANCE", "period_seconds": 300, "description": "BTC/USDT 5m"}],
|
||||
"parameters": {
|
||||
"rsi_length": {"default": 14, "description": "RSI period"},
|
||||
"oversold": {"default": 30, "description": "Buy threshold"},
|
||||
"overbought": {"default": 70, "description": "Sell threshold"}
|
||||
},
|
||||
"conda_packages": []
|
||||
}
|
||||
```
|
||||
|
||||
### Example 2: MACD Momentum (multi-feed dual timeframe)
|
||||
|
||||
```python
|
||||
import pandas as pd
|
||||
import pandas_ta as ta
|
||||
|
||||
class MACDMomentum(PandasStrategy):
|
||||
|
||||
def evaluate(self, dfs: dict[str, pd.DataFrame]) -> None:
|
||||
df_15m = dfs.get("BTC/USDT.BINANCE:900")
|
||||
df_4h = dfs.get("BTC/USDT.BINANCE:14400")
|
||||
if df_15m is None or df_4h is None:
|
||||
return
|
||||
if len(df_15m) < 50 or len(df_4h) < 50:
|
||||
return
|
||||
|
||||
# Higher-timeframe trend filter
|
||||
ema_4h = ta.ema(df_4h["close"], length=20)
|
||||
bullish_trend = df_4h["close"].iloc[-1] > ema_4h.iloc[-1]
|
||||
|
||||
# Entry signal on 15m
|
||||
macd_df = ta.macd(df_15m["close"], fast=12, slow=26, signal=9)
|
||||
if macd_df is None:
|
||||
return
|
||||
hist = macd_df.iloc[:, 2] # histogram
|
||||
|
||||
trade_qty = 0.002 * self.config.initial_capital / df_15m["close"].iloc[-1]
|
||||
|
||||
if bullish_trend and hist.iloc[-1] > 0 and hist.iloc[-2] <= 0:
|
||||
self.buy(trade_qty, feed_key="BTC/USDT.BINANCE:900")
|
||||
elif hist.iloc[-1] < 0 and hist.iloc[-2] >= 0:
|
||||
self.flatten()
|
||||
```
|
||||
|
||||
Metadata:
|
||||
```python
|
||||
{
|
||||
"data_feeds": [
|
||||
{"symbol": "BTC/USDT.BINANCE", "period_seconds": 900, "description": "BTC/USDT 15m entry"},
|
||||
{"symbol": "BTC/USDT.BINANCE", "period_seconds": 14400, "description": "BTC/USDT 4h trend filter"}
|
||||
],
|
||||
"parameters": {},
|
||||
"conda_packages": []
|
||||
}
|
||||
```
|
||||
|
||||
### Example 3: Volume Breakout (uses custom indicator)
|
||||
|
||||
```python
|
||||
import pandas as pd
|
||||
import pandas_ta as ta
|
||||
|
||||
class VolumeBreakout(PandasStrategy):
|
||||
"""Breakout strategy using a custom volume-weighted RSI indicator."""
|
||||
|
||||
def evaluate(self, dfs: dict[str, pd.DataFrame]) -> None:
|
||||
df = dfs.get("ETH/USDT.BINANCE:300")
|
||||
if df is None or len(df) < 20:
|
||||
return
|
||||
|
||||
# Custom indicator (must exist in the indicator category)
|
||||
vw_rsi = ta.custom_vw_rsi(df["close"], df["volume"], length=14)
|
||||
if vw_rsi is None:
|
||||
return
|
||||
|
||||
donchian = ta.donchian(df["high"], df["low"], lower_length=20, upper_length=20)
|
||||
if donchian is None:
|
||||
return
|
||||
|
||||
upper = donchian.iloc[:, 2]  # donchian columns are ordered lower, mid, upper (DCL, DCM, DCU)
|
||||
close = df["close"]
|
||||
qty = 0.01 * self.config.initial_capital / close.iloc[-1]
|
||||
|
||||
if close.iloc[-1] > upper.iloc[-2] and vw_rsi.iloc[-1] > 60:
|
||||
self.buy(qty)
|
||||
elif close.iloc[-1] < donchian.iloc[:, 0].iloc[-1]:  # exit below lower band (column 0)
|
||||
self.flatten()
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Section E — Workflow
|
||||
|
||||
### Writing and validating a strategy
|
||||
|
||||
1. **Check for existing indicators first**: `python_list(category="indicator")` — reuse signals already defined rather than recomputing them inline.
|
||||
|
||||
2. **Write the strategy**:
|
||||
```
|
||||
python_write(category="strategy", name="...", description="...", code="...", metadata={...})
|
||||
```
|
||||
After writing, the system automatically runs the strategy against synthetic data. If validation fails, fix the reported error before proceeding.
|
||||
|
||||
3. **Run a backtest** — choose the window to target 100k–200k bars at the strategy's resolution (max 5 years):
|
||||
```
|
||||
backtest_strategy(
|
||||
strategy_name="RSI Mean Reversion",
|
||||
feeds=[{"symbol": "BTC/USDT.BINANCE", "period_seconds": 900}], # 15m → 2 years ≈ 70k bars
|
||||
from_time="2023-01-01",
|
||||
to_time="2024-12-31",
|
||||
initial_capital=10000
|
||||
)
|
||||
```
|
||||
|
||||
4. **Interpret results**:
|
||||
- `summary.total_return` — total fractional return (0.15 = +15%)
|
||||
- `summary.sharpe_ratio` — annualized Sharpe (>1.0 good, >2.0 excellent)
|
||||
- `summary.max_drawdown` — maximum peak-to-trough loss (0.20 = 20%)
|
||||
- `summary.win_rate` — fraction of trades profitable
|
||||
- `statistics.profit_factor` — gross profit / gross loss (>1.5 good)
|
||||
- `statistics.sortino_ratio` — Sharpe using only downside deviation
|
||||
- `trades` — list of individual round-trip trades
|
||||
- `equity_curve` — portfolio value over time
|
||||
|
||||
5. **Iterate**: edit with `python_edit`, re-run backtest, compare results. Use `get_backtest_results` to compare multiple runs.
|
||||
|
||||
6. **Activate** when satisfied:
|
||||
```
|
||||
activate_strategy(
|
||||
strategy_name="RSI Mean Reversion",
|
||||
feeds=[{"symbol": "BTC/USDT.BINANCE", "period_seconds": 900}],
|
||||
allocation=5000.0,
|
||||
paper=True
|
||||
)
|
||||
```
|
||||
|
||||
### Monitoring active strategies
|
||||
|
||||
```
|
||||
list_active_strategies() # See all running strategies and PnL
|
||||
get_strategy_trades(strategy_name) # View recent trade log
|
||||
get_strategy_events(strategy_name) # View fills, errors, PnL updates
|
||||
deactivate_strategy(strategy_name) # Stop and get final PnL
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Section F — Important Rules
|
||||
|
||||
1. **Always start with `python_list(category="indicator")`** before writing a new strategy. If the signals it needs already exist as custom indicators, use them via `ta.custom_*` rather than duplicating the computation.
|
||||
|
||||
2. **Wait for validation output** after `python_write` or `python_edit`. If the harness reports an error, fix it before running a backtest.
|
||||
|
||||
3. **Size positions conservatively** based on `self.config.initial_capital`. A typical trade quantity is `0.001–0.01 * initial_capital / price`.
|
||||
|
||||
4. **Guard for insufficient data**: always check `len(df) >= min_required` before computing indicators that need a lookback period.
|
||||
|
||||
5. **Multi-feed strategies**: access each feed by its exact feed key. Missing feeds (not yet warmed up) will be absent from `dfs` — always use `.get()` and check for `None`.
|
||||
|
||||
6. **Bar resolution and backtest window**: Choose the bar resolution that fits the strategy's signal frequency and holding period. Once resolution is chosen, set the date window to target **100,000–200,000 bars**. **Never request more than 5 years of data.** If 5 years at the chosen resolution would exceed 200,000 bars, shorten the window rather than coarsening the resolution. Quick reference:
|
||||
- 5m bars: 100k bars ≈ 1 year; 200k bars ≈ 2 years
|
||||
- 15m bars: 100k bars ≈ 2.9 years; 200k bars ≈ 5 years (at limit)
|
||||
- 1h bars: 100k bars ≈ 11.4 years → cap at 5 years (≈ 43,800 bars)
|
||||
- 4h bars: 100k bars ≈ 45 years → cap at 5 years (≈ 10,950 bars)
|
||||
|
||||
7. **Never `import` from `dexorder` inside `evaluate()`** — the strategy file is exec'd in a sandbox with PandasStrategy and pandas_ta pre-loaded. Standard library and pandas/numpy/pandas_ta are available.
|
||||
@@ -3,6 +3,7 @@ import type { BaseChatModel } from '@langchain/core/language_models/chat_models'
|
||||
import { SystemMessage } from '@langchain/core/messages';
|
||||
import { createReactAgent } from '@langchain/langgraph/prebuilt';
|
||||
import type { FastifyBaseLogger } from 'fastify';
|
||||
import type { HarnessEvent } from '../../harness-events.js';
|
||||
|
||||
/**
|
||||
* Web Explore Subagent
|
||||
@@ -66,6 +67,52 @@ export class WebExploreSubagent extends BaseSubagent {
|
||||
|
||||
return finalText;
|
||||
}
|
||||
|
||||
async *streamEvents(context: SubagentContext, instruction: string, signal?: AbortSignal): AsyncGenerator<HarnessEvent, string> {
|
||||
this.logger.info({ subagent: this.getName() }, 'streamEvents starting');
|
||||
|
||||
const initialMessages = this.buildMessages(context, instruction);
|
||||
const systemMessage = initialMessages[0];
|
||||
const humanMessage = initialMessages[initialMessages.length - 1];
|
||||
|
||||
const agent = createReactAgent({
|
||||
llm: this.model,
|
||||
tools: this.tools,
|
||||
prompt: systemMessage as SystemMessage,
|
||||
});
|
||||
|
||||
const stream = agent.stream(
|
||||
{ messages: [humanMessage] },
|
||||
{ streamMode: ['messages', 'updates'], recursionLimit: 15, signal }
|
||||
);
|
||||
|
||||
let finalText = '';
|
||||
|
||||
for await (const [mode, data] of await stream) {
|
||||
if (signal?.aborted) break;
|
||||
if (mode === 'messages') {
|
||||
for (const chunk of WebExploreSubagent.extractStreamChunks(data, this.config.name)) {
|
||||
yield chunk;
|
||||
}
|
||||
} else if (mode === 'updates') {
|
||||
if ((data as any).agent?.messages) {
|
||||
for (const msg of (data as any).agent.messages as any[]) {
|
||||
if (msg.tool_calls?.length) {
|
||||
for (const tc of msg.tool_calls) {
|
||||
yield { type: 'subagent_tool_call', agentName: this.config.name, toolName: tc.name, label: tc.name };
|
||||
}
|
||||
} else {
|
||||
const content = WebExploreSubagent.extractFinalText(msg);
|
||||
if (content) finalText = content;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
this.logger.info({ textLength: finalText.length }, 'streamEvents finished');
|
||||
return finalText;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
@@ -16,6 +16,8 @@ import { ContainerManager } from './k8s/container-manager.js';
|
||||
import { ZMQRelayClient } from './clients/zmq-relay-client.js';
|
||||
import { IcebergClient } from './clients/iceberg-client.js';
|
||||
import { ConversationStore } from './harness/memory/conversation-store.js';
|
||||
import { BlobStore } from './harness/memory/blob-store.js';
|
||||
import { ConversationService } from './services/conversation-service.js';
|
||||
import { AgentHarness, type HarnessSessionConfig } from './harness/agent-harness.js';
|
||||
import { OHLCService } from './services/ohlc-service.js';
|
||||
import { SymbolIndexService } from './services/symbol-index-service.js';
|
||||
@@ -369,12 +371,17 @@ try {
|
||||
const conversationStore = new ConversationStore(redis, app.log, icebergClient);
|
||||
app.log.debug('Conversation store initialized');
|
||||
|
||||
const blobStore = new BlobStore(icebergClient, app.log);
|
||||
const conversationService = new ConversationService(conversationStore, blobStore, app.log);
|
||||
app.log.debug('Blob store and conversation service initialized');
|
||||
|
||||
// Harness factory: captures infrastructure deps; channel handlers stay infrastructure-free
|
||||
function createHarness(sessionConfig: HarnessSessionConfig): AgentHarness {
|
||||
return new AgentHarness({
|
||||
...sessionConfig,
|
||||
providerConfig: config.providerConfig,
|
||||
conversationStore,
|
||||
blobStore,
|
||||
historyLimit: config.conversationHistoryLimit,
|
||||
});
|
||||
}
|
||||
@@ -391,6 +398,7 @@ const websocketHandler = new WebSocketHandler({
|
||||
createHarness,
|
||||
ohlcService, // Optional
|
||||
symbolIndexService, // Optional
|
||||
conversationService, // Optional - for history replay on reconnect
|
||||
});
|
||||
app.log.debug('WebSocket handler initialized');
|
||||
|
||||
@@ -614,6 +622,19 @@ try {
|
||||
mcpTools: [],
|
||||
});
|
||||
|
||||
// Strategy subagent: all strategy-related MCP tools
|
||||
toolRegistry.registerAgentTools({
|
||||
agentName: 'strategy',
|
||||
platformTools: [],
|
||||
mcpTools: [
|
||||
'python_write', 'python_edit', 'python_read', 'python_list',
|
||||
'python_log', 'python_revert',
|
||||
'backtest_strategy', 'activate_strategy', 'deactivate_strategy',
|
||||
'list_active_strategies', 'get_backtest_results',
|
||||
'get_strategy_trades', 'get_strategy_events',
|
||||
],
|
||||
});
|
||||
|
||||
app.log.info(
|
||||
{
|
||||
agents: toolRegistry.getRegisteredAgents(),
|
||||
|
||||
59
gateway/src/services/conversation-service.ts
Normal file
59
gateway/src/services/conversation-service.ts
Normal file
@@ -0,0 +1,59 @@
|
||||
import type { FastifyBaseLogger } from 'fastify';
|
||||
import type { ConversationStore } from '../harness/memory/conversation-store.js';
|
||||
import type { BlobStore, StoredBlob } from '../harness/memory/blob-store.js';
|
||||
|
||||
export interface EnrichedMessage {
|
||||
id: string;
|
||||
userId: string;
|
||||
sessionId: string;
|
||||
role: 'user' | 'assistant';
|
||||
content: string;
|
||||
timestamp: number; // microseconds
|
||||
files: StoredBlob[];
|
||||
}
|
||||
|
||||
/**
|
||||
* Generic conversation history service.
|
||||
*
|
||||
* Combines text messages (ConversationStore) with binary blobs (BlobStore)
|
||||
* into enriched message records. Used by:
|
||||
* - WebSocket handler: replay history on reconnect
|
||||
* - Future admin panel: conversation browser
|
||||
*/
|
||||
export class ConversationService {
|
||||
constructor(
|
||||
private conversationStore: ConversationStore,
|
||||
private blobStore: BlobStore,
|
||||
// eslint-disable-next-line @typescript-eslint/no-unused-vars
|
||||
_logger: FastifyBaseLogger
|
||||
) {}
|
||||
|
||||
async getHistory(
|
||||
userId: string,
|
||||
sessionId: string,
|
||||
limit = 50,
|
||||
channelType = 'websocket'
|
||||
): Promise<EnrichedMessage[]> {
|
||||
const messages = await this.conversationStore.getFullHistory(userId, sessionId, limit, channelType);
|
||||
const chatMessages = messages.filter(m => m.role === 'user' || m.role === 'assistant');
|
||||
|
||||
return Promise.all(
|
||||
chatMessages.map(async (m) => {
|
||||
const blobRefs = m.metadata?.blobs as Array<{ id: string; mimeType: string; caption?: string }> | undefined;
|
||||
const files = blobRefs?.length
|
||||
? await this.blobStore.getBlobsByIds(userId, sessionId, blobRefs.map(b => b.id))
|
||||
: [];
|
||||
|
||||
return {
|
||||
id: m.id,
|
||||
userId: m.userId,
|
||||
sessionId: m.sessionId,
|
||||
role: m.role as 'user' | 'assistant',
|
||||
content: m.content,
|
||||
timestamp: m.timestamp,
|
||||
files,
|
||||
};
|
||||
})
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -16,7 +16,8 @@
|
||||
|
||||
import type { FastifyBaseLogger } from 'fastify';
|
||||
import type { IcebergClient } from '../clients/iceberg-client.js';
|
||||
import type { ZMQRelayClient } from '../clients/zmq-relay-client.js';
|
||||
import type { ZMQRelayClient, BarUpdateCallback } from '../clients/zmq-relay-client.js';
|
||||
export type { BarUpdateCallback } from '../clients/zmq-relay-client.js';
|
||||
import type {
|
||||
HistoryResult,
|
||||
SymbolInfo,
|
||||
@@ -53,6 +54,23 @@ export class OHLCService {
|
||||
this.logger = config.logger;
|
||||
}
|
||||
|
||||
/**
|
||||
* Subscribe to realtime OHLC bar updates for a ticker+period.
|
||||
* ZMQ subscribe is issued on the first call for a given topic; subsequent calls
|
||||
* for the same topic only add the callback (no extra ZMQ events).
|
||||
*/
|
||||
subscribeToTicker(ticker: string, periodSeconds: number, callback: BarUpdateCallback): void {
|
||||
this.relayClient.subscribeToTicker(ticker, periodSeconds, callback);
|
||||
}
|
||||
|
||||
/**
|
||||
* Unsubscribe a callback from realtime OHLC bar updates.
|
||||
* ZMQ unsubscribe is issued when the last callback for a topic is removed.
|
||||
*/
|
||||
unsubscribeFromTicker(ticker: string, periodSeconds: number, callback: BarUpdateCallback): void {
|
||||
this.relayClient.unsubscribeFromTicker(ticker, periodSeconds, callback);
|
||||
}
|
||||
|
||||
/**
|
||||
* Fetch OHLC data with smart caching
|
||||
*
|
||||
|
||||
@@ -28,23 +28,29 @@ export function createGetChartDataTool(config: GetChartDataToolConfig): DynamicS
|
||||
|
||||
**IMPORTANT: Use this tool ONLY for quick, casual data viewing. For any analysis, plotting, statistics, or deep research, use the 'research' tool instead.**
|
||||
|
||||
**Hard limit: returns at most 500 bars (the most recent 500). This tool is not suitable for analysis requiring longer sequences — use the 'research' tool for that.**
|
||||
|
||||
Parameters:
|
||||
- ticker (optional): Market symbol (defaults to workspace chartState.symbol)
|
||||
- ticker (optional): Market symbol in SYMBOL.EXCHANGE format, e.g. "BTC/USDT.BINANCE" (defaults to workspace chartState.symbol)
|
||||
- period (optional): OHLC period in seconds (defaults to workspace chartState.period)
|
||||
- from_time (optional): Start time as Unix timestamp (number or string like "1774126800") OR date string like "2 days ago", "2024-01-01" (defaults to workspace chartState.start_time)
|
||||
- to_time (optional): End time as Unix timestamp (number or string like "1774732500") OR date string like "now", "yesterday" (defaults to workspace chartState.end_time)
|
||||
- countback (optional): Limit number of bars returned
|
||||
- countback (optional): Limit number of bars returned (max 500)
|
||||
- columns (optional): Extra columns beyond OHLC: ["volume", "buy_vol", "sell_vol", "open_time", "high_time", "low_time", "close_time", "open_interest"]`,
|
||||
schema: z.object({
|
||||
ticker: z.string().optional().describe('Market symbol (defaults to workspace chartState.symbol)'),
|
||||
period: z.number().optional().describe('OHLC period in seconds (defaults to workspace chartState.period)'),
|
||||
from_time: z.union([z.number(), z.string()]).optional().describe('Start time: Unix seconds OR date string (defaults to workspace chartState.start_time)'),
|
||||
to_time: z.union([z.number(), z.string()]).optional().describe('End time: Unix seconds OR date string (defaults to workspace chartState.end_time)'),
|
||||
countback: z.number().optional().describe('Limit number of bars returned'),
|
||||
countback: z.number().optional().describe('Limit number of bars returned (max 500)'),
|
||||
columns: z.array(z.enum(['volume', 'buy_vol', 'sell_vol', 'open_time', 'high_time', 'low_time', 'close_time', 'open_interest'])).optional().describe('Extra columns beyond OHLC'),
|
||||
}),
|
||||
func: async ({ ticker, period, from_time, to_time, countback, columns }) => {
|
||||
logger.debug({ ticker, period, from_time, to_time, countback, columns }, 'Executing get_chart_data tool');
|
||||
const MAX_BARS = 500;
|
||||
// Enforce hard cap — never return more than MAX_BARS bars
|
||||
const effectiveCountback = countback !== undefined ? Math.min(countback, MAX_BARS) : MAX_BARS;
|
||||
|
||||
logger.debug({ ticker, period, from_time, to_time, countback: effectiveCountback, columns }, 'Executing get_chart_data tool');
|
||||
|
||||
try {
|
||||
// Get workspace chart state
|
||||
@@ -86,7 +92,7 @@ Parameters:
|
||||
finalPeriod,
|
||||
finalFromTime,
|
||||
finalToTime,
|
||||
countback
|
||||
effectiveCountback
|
||||
);
|
||||
|
||||
if (historyResult.noData || !historyResult.bars || historyResult.bars.length === 0) {
|
||||
@@ -98,8 +104,13 @@ Parameters:
|
||||
});
|
||||
}
|
||||
|
||||
// Enforce hard cap — keep the most recent bars
|
||||
const sourceBars = historyResult.bars.length > MAX_BARS
|
||||
? historyResult.bars.slice(-MAX_BARS)
|
||||
: historyResult.bars;
|
||||
|
||||
// Filter/format bars with requested columns
|
||||
const bars = historyResult.bars.map(bar => {
|
||||
const bars = sourceBars.map(bar => {
|
||||
const result: any = {
|
||||
time: bar.time,
|
||||
open: bar.open,
|
||||
|
||||
@@ -3,6 +3,7 @@ import { z } from 'zod';
|
||||
import type { FastifyBaseLogger } from 'fastify';
|
||||
import type { IndicatorSubagent } from '../../harness/subagents/indicator/index.js';
|
||||
import type { SubagentContext } from '../../harness/subagents/base-subagent.js';
|
||||
import type { HarnessEvent } from '../../harness/harness-events.js';
|
||||
|
||||
export interface IndicatorAgentToolConfig {
|
||||
indicatorSubagent: IndicatorSubagent;
|
||||
@@ -14,10 +15,20 @@ export interface IndicatorAgentToolConfig {
|
||||
* Creates a LangChain tool that delegates to the indicator subagent.
|
||||
* Mirrors the pattern of research-agent.tool.ts.
|
||||
*/
|
||||
export function createIndicatorAgentTool(config: IndicatorAgentToolConfig): DynamicStructuredTool {
|
||||
export function createIndicatorAgentTool(config: IndicatorAgentToolConfig): DynamicStructuredTool & { streamFunc: (args: { instruction: string }) => AsyncGenerator<HarnessEvent, string> } {
|
||||
const { indicatorSubagent, context, logger } = config;
|
||||
|
||||
return new DynamicStructuredTool({
|
||||
async function* streamFunc({ instruction }: { instruction: string }, signal?: AbortSignal): AsyncGenerator<HarnessEvent, string> {
|
||||
logger.info({ instruction: instruction.substring(0, 100) }, 'Streaming indicator subagent');
|
||||
const gen = indicatorSubagent.streamEvents(context, instruction, signal);
|
||||
let step: IteratorResult<HarnessEvent, string>;
|
||||
while (!(step = await gen.next()).done) {
|
||||
yield step.value;
|
||||
}
|
||||
return step.value;
|
||||
}
|
||||
|
||||
const tool = new DynamicStructuredTool({
|
||||
name: 'indicator',
|
||||
description: `Delegate to the indicator subagent for all indicator-related tasks on the chart.
|
||||
|
||||
@@ -50,4 +61,6 @@ NEVER modify the indicators workspace store directly.`,
|
||||
}
|
||||
},
|
||||
});
|
||||
|
||||
return Object.assign(tool, { streamFunc });
|
||||
}
|
||||
|
||||
@@ -3,6 +3,7 @@ import { z } from 'zod';
|
||||
import type { FastifyBaseLogger } from 'fastify';
|
||||
import type { ResearchSubagent } from '../../harness/subagents/research/index.js';
|
||||
import type { SubagentContext } from '../../harness/subagents/base-subagent.js';
|
||||
import type { HarnessEvent } from '../../harness/harness-events.js';
|
||||
|
||||
export interface ResearchAgentToolConfig {
|
||||
researchSubagent: ResearchSubagent;
|
||||
@@ -15,10 +16,24 @@ export interface ResearchAgentToolConfig {
|
||||
* This is the standard LangChain pattern for exposing a subagent as a tool
|
||||
* to a parent agent.
|
||||
*/
|
||||
export function createResearchAgentTool(config: ResearchAgentToolConfig): DynamicStructuredTool {
|
||||
export function createResearchAgentTool(config: ResearchAgentToolConfig): DynamicStructuredTool & { streamFunc: (args: { name: string; instruction: string }) => AsyncGenerator<HarnessEvent, string> } {
|
||||
const { researchSubagent, context, logger } = config;
|
||||
|
||||
return new DynamicStructuredTool({
|
||||
const prompt = (name: string, instruction: string) => `Research script name: "${name}"\n\n${instruction}`;
|
||||
|
||||
async function* streamFunc({ name, instruction }: { name: string; instruction: string }, signal?: AbortSignal): AsyncGenerator<HarnessEvent, string> {
|
||||
logger.info({ name, instruction: instruction.substring(0, 100) }, 'Streaming research subagent');
|
||||
const gen = researchSubagent.streamEvents(context, prompt(name, instruction), signal);
|
||||
let step: IteratorResult<HarnessEvent, string>;
|
||||
while (!(step = await gen.next()).done) {
|
||||
yield step.value;
|
||||
}
|
||||
const finalText = step.value;
|
||||
const images = researchSubagent.getLastImages();
|
||||
return JSON.stringify({ text: finalText, images });
|
||||
}
|
||||
|
||||
const tool = new DynamicStructuredTool({
|
||||
name: 'research',
|
||||
description: `Delegate to the research subagent for data analysis, charting, statistics, and Python script execution.
|
||||
|
||||
@@ -36,21 +51,15 @@ The research subagent will write and execute Python scripts, capture output and
|
||||
func: async ({ name, instruction }: { name: string; instruction: string }): Promise<string> => {
|
||||
logger.info({ name, instruction: instruction.substring(0, 100) }, 'Delegating to research subagent');
|
||||
|
||||
const prompt = `Research script name: "${name}"\n\n${instruction}`;
|
||||
|
||||
try {
|
||||
const result = await researchSubagent.executeWithImages(context, prompt);
|
||||
|
||||
// Return in the format that AgentHarness.processToolResult() knows how to handle
|
||||
// (extracts images and passes them to channelAdapter)
|
||||
return JSON.stringify({
|
||||
text: result.text,
|
||||
images: result.images,
|
||||
});
|
||||
const result = await researchSubagent.executeWithImages(context, prompt(name, instruction));
|
||||
return JSON.stringify({ text: result.text, images: result.images });
|
||||
} catch (error) {
|
||||
logger.error({ error, errorMessage: (error as Error)?.message }, 'Research subagent failed');
|
||||
throw error;
|
||||
}
|
||||
},
|
||||
});
|
||||
|
||||
return Object.assign(tool, { streamFunc });
|
||||
}
|
||||
|
||||
66
gateway/src/tools/platform/strategy-agent.tool.ts
Normal file
66
gateway/src/tools/platform/strategy-agent.tool.ts
Normal file
@@ -0,0 +1,66 @@
|
||||
import { DynamicStructuredTool } from '@langchain/core/tools';
|
||||
import { z } from 'zod';
|
||||
import type { FastifyBaseLogger } from 'fastify';
|
||||
import type { StrategySubagent } from '../../harness/subagents/strategy/index.js';
|
||||
import type { SubagentContext } from '../../harness/subagents/base-subagent.js';
|
||||
import type { HarnessEvent } from '../../harness/harness-events.js';
|
||||
|
||||
export interface StrategyAgentToolConfig {
|
||||
strategySubagent: StrategySubagent;
|
||||
context: SubagentContext;
|
||||
logger: FastifyBaseLogger;
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a LangChain tool that delegates to the strategy subagent.
|
||||
* Mirrors the pattern of indicator-agent.tool.ts.
|
||||
*/
|
||||
export function createStrategyAgentTool(config: StrategyAgentToolConfig): DynamicStructuredTool & { streamFunc: (args: { instruction: string }, signal?: AbortSignal) => AsyncGenerator<HarnessEvent, string> } {
|
||||
const { strategySubagent, context, logger } = config;
|
||||
|
||||
async function* streamFunc({ instruction }: { instruction: string }, signal?: AbortSignal): AsyncGenerator<HarnessEvent, string> {
|
||||
logger.info({ instruction: instruction.substring(0, 100) }, 'Streaming strategy subagent');
|
||||
const gen = strategySubagent.streamEvents(context, instruction, signal);
|
||||
let step: IteratorResult<HarnessEvent, string>;
|
||||
while (!(step = await gen.next()).done) {
|
||||
yield step.value;
|
||||
}
|
||||
return step.value;
|
||||
}
|
||||
|
||||
const tool = new DynamicStructuredTool({
|
||||
name: 'strategy',
|
||||
description: `Delegate to the strategy subagent for all trading strategy tasks.
|
||||
|
||||
Use this tool for:
|
||||
- Writing new PandasStrategy classes ("create a strategy that...")
|
||||
- Editing or improving existing strategies
|
||||
- Running backtests on a strategy
|
||||
- Interpreting backtest results (Sharpe ratio, drawdown, trade list)
|
||||
- Activating or deactivating strategies for paper trading
|
||||
- Monitoring running strategy PnL and trade logs
|
||||
- Checking which strategies already exist
|
||||
|
||||
ALWAYS use this tool for any request about trading strategies, backtesting, or strategy activation.
|
||||
NEVER write strategy Python code or call backtest_strategy directly — delegate here instead.`,
|
||||
schema: z.object({
|
||||
instruction: z.string().describe(
|
||||
'The strategy task to perform. Be specific: include the strategy name, ' +
|
||||
'desired signals (e.g. RSI < 30 = buy), timeframe, and symbol if known. ' +
|
||||
'For backtest requests include the date range and starting capital.'
|
||||
),
|
||||
}),
|
||||
func: async ({ instruction }: { instruction: string }): Promise<string> => {
|
||||
logger.info({ instruction: instruction.substring(0, 100) }, 'Delegating to strategy subagent');
|
||||
|
||||
try {
|
||||
return await strategySubagent.execute(context, instruction);
|
||||
} catch (error) {
|
||||
logger.error({ error, errorMessage: (error as Error)?.message }, 'Strategy subagent failed');
|
||||
throw error;
|
||||
}
|
||||
},
|
||||
});
|
||||
|
||||
return Object.assign(tool, { streamFunc });
|
||||
}
|
||||
@@ -3,6 +3,7 @@ import { z } from 'zod';
|
||||
import type { FastifyBaseLogger } from 'fastify';
|
||||
import type { WebExploreSubagent } from '../../harness/subagents/web-explore/index.js';
|
||||
import type { SubagentContext } from '../../harness/subagents/base-subagent.js';
|
||||
import type { HarnessEvent } from '../../harness/harness-events.js';
|
||||
|
||||
export interface WebExploreAgentToolConfig {
|
||||
webExploreSubagent: WebExploreSubagent;
|
||||
@@ -14,10 +15,20 @@ export interface WebExploreAgentToolConfig {
|
||||
* Creates a LangChain tool that delegates to the web-explore subagent.
|
||||
* The subagent decides whether to use web search or arXiv based on the instruction.
|
||||
*/
|
||||
export function createWebExploreAgentTool(config: WebExploreAgentToolConfig): DynamicStructuredTool {
|
||||
export function createWebExploreAgentTool(config: WebExploreAgentToolConfig): DynamicStructuredTool & { streamFunc: (args: { instruction: string }, signal?: AbortSignal) => AsyncGenerator<HarnessEvent, string> } {
|
||||
const { webExploreSubagent, context, logger } = config;
|
||||
|
||||
return new DynamicStructuredTool({
|
||||
async function* streamFunc({ instruction }: { instruction: string }, signal?: AbortSignal): AsyncGenerator<HarnessEvent, string> {
|
||||
logger.info({ instruction: instruction.substring(0, 100) }, 'Streaming web-explore subagent');
|
||||
const gen = webExploreSubagent.streamEvents(context, instruction, signal);
|
||||
let step: IteratorResult<HarnessEvent, string>;
|
||||
while (!(step = await gen.next()).done) {
|
||||
yield step.value;
|
||||
}
|
||||
return step.value;
|
||||
}
|
||||
|
||||
const tool = new DynamicStructuredTool({
|
||||
name: 'web_explore',
|
||||
description: `Search the web or academic databases and return a summarized answer.
|
||||
|
||||
@@ -46,4 +57,6 @@ The subagent will search the web (or arXiv for academic queries), fetch relevant
|
||||
}
|
||||
},
|
||||
});
|
||||
|
||||
return Object.assign(tool, { streamFunc });
|
||||
}
|
||||
|
||||
@@ -103,6 +103,16 @@ export const DEFAULT_STORES: StoreConfig[] = [
|
||||
persistent: true,
|
||||
initialState: () => ({}),
|
||||
},
|
||||
{
|
||||
name: 'strategy_types',
|
||||
persistent: true,
|
||||
initialState: () => ({}),
|
||||
},
|
||||
{
|
||||
name: 'research_types',
|
||||
persistent: true,
|
||||
initialState: () => ({}),
|
||||
},
|
||||
{
|
||||
name: 'channelState',
|
||||
persistent: false,
|
||||
|
||||
@@ -47,24 +47,22 @@ function loadConfig() {
|
||||
logger.warn({ error: error.message }, 'Could not load secrets');
|
||||
}
|
||||
|
||||
// Merge config and secrets
|
||||
return {
|
||||
// Flink ZMQ endpoints
|
||||
flink_hostname: config.flink_hostname || 'localhost',
|
||||
ingestor_work_port: config.ingestor_work_port || 5555,
|
||||
ingestor_control_port: config.ingestor_control_port || 5556,
|
||||
ingestor_broker_port: config.ingestor_broker_port || 5567,
|
||||
|
||||
// Kafka configuration
|
||||
kafka_brokers: config.kafka_brokers || ['localhost:9092'],
|
||||
kafka_topic: 'market-ohlc',
|
||||
kafka_ohlc_topic: config.kafka_ohlc_topic || 'market-ohlc',
|
||||
kafka_tick_topic: config.kafka_tick_topic || 'market-tick',
|
||||
|
||||
// Worker configuration
|
||||
max_concurrent: config.max_concurrent || 10,
|
||||
poll_interval_ms: config.poll_interval_ms || 10000,
|
||||
|
||||
// Symbol metadata configuration
|
||||
supported_exchanges: config.supported_exchanges || ['binance', 'coinbase', 'kraken'],
|
||||
symbol_metadata_interval_ms: config.symbol_metadata_interval_ms || 6 * 60 * 60 * 1000, // 6 hours
|
||||
symbol_metadata_interval_ms: config.symbol_metadata_interval_ms || 6 * 60 * 60 * 1000,
|
||||
|
||||
...secrets
|
||||
};
|
||||
@@ -76,11 +74,7 @@ class IngestorWorker {
|
||||
this.logger = logger;
|
||||
|
||||
this.zmqClient = new ZmqClient(config, logger.child({ component: 'zmq' }));
|
||||
this.kafkaProducer = new KafkaProducer(
|
||||
config,
|
||||
logger.child({ component: 'kafka' })
|
||||
);
|
||||
// Create metadata generator first so ccxtFetcher can use it
|
||||
this.kafkaProducer = new KafkaProducer(config, logger.child({ component: 'kafka' }));
|
||||
this.metadataGenerator = new SymbolMetadataGenerator(
|
||||
config,
|
||||
this.kafkaProducer,
|
||||
@@ -94,33 +88,27 @@ class IngestorWorker {
|
||||
this.realtimePoller = new RealtimePoller(
|
||||
this.ccxtFetcher,
|
||||
this.kafkaProducer,
|
||||
this.zmqClient,
|
||||
logger.child({ component: 'poller' })
|
||||
);
|
||||
|
||||
// Track active requests
|
||||
this.activeRequests = new Map();
|
||||
this.isShutdown = false;
|
||||
// jobId → active realtime subscription (for stop handling)
|
||||
this.activeRealtime = new Set();
|
||||
|
||||
// Metadata generation interval
|
||||
this.metadataIntervalMs = config.symbol_metadata_interval_ms;
|
||||
this.isShutdown = false;
|
||||
this.metadataInterval = null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Start the ingestor worker
|
||||
*/
|
||||
async start() {
|
||||
this.logger.info('Starting CCXT ingestor worker');
|
||||
|
||||
// Connect to services
|
||||
await this.kafkaProducer.connect();
|
||||
await this.zmqClient.connect();
|
||||
|
||||
// Start control message listener
|
||||
this.zmqClient.startControlListener(msg => this.handleControlMessage(msg));
|
||||
// Wire event callbacks before connecting so we don't miss early messages
|
||||
this.zmqClient.onWorkAssign = req => this.handleWorkAssign(req);
|
||||
this.zmqClient.onWorkStop = jobId => this.handleWorkStop(jobId);
|
||||
|
||||
// Start work loop
|
||||
this.workLoop();
|
||||
await this.zmqClient.connect(); // also sends WorkerReady
|
||||
|
||||
// Generate symbol metadata on startup
|
||||
this.logger.info('Generating initial symbol metadata');
|
||||
@@ -140,281 +128,126 @@ class IngestorWorker {
|
||||
} catch (error) {
|
||||
this.logger.error({ error: error.message }, 'Failed to generate periodic symbol metadata');
|
||||
}
|
||||
}, this.metadataIntervalMs);
|
||||
}, this.config.symbol_metadata_interval_ms);
|
||||
|
||||
this.logger.info('Ingestor worker started successfully');
|
||||
}
|
||||
|
||||
/**
|
||||
* Main work loop - pull and process data requests
|
||||
* Handle a WorkAssign message dispatched by Flink IngestorBroker.
|
||||
* Called from the ZmqClient receive loop — do not block.
|
||||
*/
|
||||
async workLoop() {
|
||||
while (!this.isShutdown) {
|
||||
try {
|
||||
// Check if we can handle more requests
|
||||
if (this.activeRequests.size >= this.config.max_concurrent) {
|
||||
await new Promise(resolve => setTimeout(resolve, 1000));
|
||||
continue;
|
||||
}
|
||||
handleWorkAssign(request) {
|
||||
const { jobId, requestId, type, ticker } = request;
|
||||
|
||||
// Pull next data request
|
||||
const request = await this.zmqClient.pullDataRequest();
|
||||
if (!request) {
|
||||
continue;
|
||||
}
|
||||
this.logger.info({ jobId, requestId, type, ticker }, 'Received WorkAssign');
|
||||
|
||||
// Handle request asynchronously
|
||||
this.handleDataRequest(request).catch(error => {
|
||||
this.logger.error(
|
||||
{ error: error.message, requestId: request.requestId },
|
||||
'Error handling data request'
|
||||
);
|
||||
});
|
||||
} catch (error) {
|
||||
if (!this.isShutdown) {
|
||||
this.logger.error({ error: error.message }, 'Error in work loop');
|
||||
await new Promise(resolve => setTimeout(resolve, 1000));
|
||||
}
|
||||
}
|
||||
// HISTORICAL_OHLC = 0 (proto3 default, may appear as undefined or 'HISTORICAL_OHLC')
|
||||
const isHistorical = !type || type === 'HISTORICAL_OHLC' || type === 0;
|
||||
const isRealtime = type === 'REALTIME_TICKS' || type === 1;
|
||||
|
||||
if (isHistorical) {
|
||||
this.handleHistoricalRequest(request).catch(err => {
|
||||
this.logger.error({ jobId, requestId, error: err.message }, 'Unexpected error in historical handler');
|
||||
});
|
||||
} else if (isRealtime) {
|
||||
this.handleRealtimeRequest(request);
|
||||
} else {
|
||||
this.logger.warn({ jobId, type }, 'Unknown request type — rejecting');
|
||||
this.zmqClient.sendReject(jobId, `Unknown request type: ${type}`).catch(() => {});
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Handle a data request
|
||||
* Handle WorkStop sent by Flink (e.g., all subscribers left).
|
||||
*/
|
||||
async handleDataRequest(request) {
|
||||
const { requestId: request_id, type, ticker } = request;
|
||||
|
||||
this.logger.info({ request_id, type, ticker, fullRequest: request }, 'Handling data request');
|
||||
|
||||
this.activeRequests.set(request_id, request);
|
||||
|
||||
try {
|
||||
// HISTORICAL_OHLC = 0 is the proto3 default and is omitted from the wire,
|
||||
// so protobufjs decodes it as undefined. Treat undefined as HISTORICAL_OHLC.
|
||||
const isHistorical = type === undefined || type === 'HISTORICAL_OHLC' || type === 0;
|
||||
const isRealtime = type === 'REALTIME_TICKS' || type === 1;
|
||||
|
||||
if (isHistorical) {
|
||||
await this.handleHistoricalRequest(request);
|
||||
} else if (isRealtime) {
|
||||
await this.handleRealtimeRequest(request);
|
||||
} else {
|
||||
this.logger.warn({ request_id, type, typeOf: typeof type, fullRequest: request }, 'Unknown request type');
|
||||
}
|
||||
} finally {
|
||||
// For historical requests, remove from active requests when done
|
||||
const isHistorical = type === undefined || type === 'HISTORICAL_OHLC' || type === 0;
|
||||
if (isHistorical) {
|
||||
this.activeRequests.delete(request_id);
|
||||
}
|
||||
}
|
||||
handleWorkStop(jobId) {
|
||||
this.logger.info({ jobId }, 'Received WorkStop — cancelling realtime subscription');
|
||||
this.realtimePoller.cancelSubscription(jobId);
|
||||
this.activeRealtime.delete(jobId);
|
||||
// No WorkComplete needed — Flink sent the stop, it already knows
|
||||
}
|
||||
|
||||
/**
|
||||
* Handle historical OHLC request
|
||||
* ASYNC ARCHITECTURE: No response sent back. Data written to Kafka only.
|
||||
* Flink will process from Kafka, write to Iceberg, and publish notification.
|
||||
* Fetch historical OHLC data and write to Kafka.
|
||||
* Sends WorkComplete when done (success or error).
|
||||
*/
|
||||
async handleHistoricalRequest(request) {
|
||||
const { requestId: request_id, ticker, historical, clientId: client_id } = request;
|
||||
const { startTime: start_time, endTime: end_time, periodSeconds: period_seconds, limit } = historical;
|
||||
const { jobId, requestId, ticker, historical, clientId: client_id } = request;
|
||||
const { startTime: start_time, endTime: end_time, periodSeconds: period_seconds, limit } = historical || {};
|
||||
|
||||
this.logger.info(
|
||||
{ request_id, ticker, period_seconds, client_id },
|
||||
'Processing historical OHLC request (async mode - write to Kafka only)'
|
||||
);
|
||||
this.logger.info({ jobId, requestId, ticker, period_seconds }, 'Processing historical OHLC request');
|
||||
|
||||
try {
|
||||
// Fetch historical data from exchange
|
||||
const candles = await this.ccxtFetcher.fetchHistoricalOHLC(
|
||||
ticker,
|
||||
start_time,
|
||||
end_time,
|
||||
period_seconds,
|
||||
limit
|
||||
ticker, start_time, end_time, period_seconds, limit
|
||||
);
|
||||
|
||||
this.logger.info(
|
||||
{ request_id, ticker, count: candles.length },
|
||||
'Fetched data from exchange'
|
||||
);
|
||||
this.logger.info({ jobId, requestId, ticker, count: candles.length }, 'Fetched from exchange');
|
||||
|
||||
// Write to Kafka - THIS IS THE ONLY OUTPUT
|
||||
// Flink will:
|
||||
// 1. Read from Kafka
|
||||
// 2. Write to Iceberg
|
||||
// 3. Publish HistoryReadyNotification
|
||||
// 4. Client receives notification via relay pub/sub
|
||||
if (candles.length > 0) {
|
||||
// Add metadata to first candle for Flink tracking
|
||||
const enrichedCandles = candles.map((candle, idx) => ({
|
||||
...candle,
|
||||
__metadata: idx === 0 ? {
|
||||
request_id,
|
||||
client_id,
|
||||
ticker,
|
||||
period_seconds,
|
||||
start_time,
|
||||
end_time
|
||||
} : undefined
|
||||
}));
|
||||
|
||||
await this.kafkaProducer.writeOHLCs(this.config.kafka_topic, enrichedCandles);
|
||||
} else {
|
||||
// Write a marker message even if no data found
|
||||
// Flink will see this and publish a NOT_FOUND notification
|
||||
await this.kafkaProducer.writeMarker(this.config.kafka_topic, {
|
||||
request_id,
|
||||
client_id,
|
||||
ticker,
|
||||
period_seconds,
|
||||
start_time,
|
||||
end_time,
|
||||
status: 'NOT_FOUND',
|
||||
message: 'No data available for requested period'
|
||||
});
|
||||
}
|
||||
|
||||
this.logger.info(
|
||||
{ request_id, ticker, count: candles.length },
|
||||
'Completed historical OHLC request - data written to Kafka'
|
||||
);
|
||||
|
||||
// NO RESPONSE SENT - Relay is stateless, client waits for pub/sub notification
|
||||
|
||||
} catch (error) {
|
||||
this.logger.error(
|
||||
{
|
||||
errorType: error.constructor?.name,
|
||||
error: error.message,
|
||||
errorUrl: error.url,
|
||||
request_id,
|
||||
ticker,
|
||||
stack: error.stack
|
||||
},
|
||||
'Failed to process historical request'
|
||||
);
|
||||
|
||||
// Write error marker to Kafka so Flink can notify client
|
||||
try {
|
||||
await this.kafkaProducer.writeMarker(this.config.kafka_topic, {
|
||||
request_id,
|
||||
client_id,
|
||||
ticker,
|
||||
period_seconds,
|
||||
start_time,
|
||||
end_time,
|
||||
status: 'ERROR',
|
||||
error_message: error.message
|
||||
});
|
||||
} catch (kafkaError) {
|
||||
this.logger.error(
|
||||
{ error: kafkaError.message, request_id },
|
||||
'Failed to write error marker to Kafka'
|
||||
);
|
||||
}
|
||||
|
||||
// Do not throw - request is handled, Flink will notify client of error
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Handle realtime tick subscription request
|
||||
*/
|
||||
async handleRealtimeRequest(request) {
|
||||
const { requestId: request_id, ticker } = request;
|
||||
|
||||
this.logger.info(
|
||||
{ request_id, ticker },
|
||||
'Processing realtime subscription request'
|
||||
);
|
||||
|
||||
try {
|
||||
// Start realtime polling
|
||||
this.realtimePoller.startSubscription(
|
||||
request_id,
|
||||
ticker,
|
||||
this.config.kafka_topic
|
||||
);
|
||||
} catch (error) {
|
||||
this.logger.error(
|
||||
{ error: error.message, request_id, ticker },
|
||||
'Failed to start realtime subscription'
|
||||
);
|
||||
this.activeRequests.delete(request_id);
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Handle control messages from Flink
|
||||
*/
|
||||
async handleControlMessage(message) {
|
||||
const { action, requestId: request_id } = message;
|
||||
|
||||
this.logger.info({ action, request_id }, 'Received control message');
|
||||
|
||||
switch (action) {
|
||||
case 'CANCEL':
|
||||
if (request_id) {
|
||||
// Cancel specific request
|
||||
this.realtimePoller.cancelSubscription(request_id);
|
||||
this.activeRequests.delete(request_id);
|
||||
const metadata = { request_id: requestId, client_id, ticker, period_seconds, start_time, end_time };
|
||||
const PAGE_SIZE = 1000;
|
||||
for (let i = 0; i < candles.length; i += PAGE_SIZE) {
|
||||
const page = candles.slice(i, i + PAGE_SIZE);
|
||||
const isLastPage = (i + PAGE_SIZE) >= candles.length;
|
||||
await this.kafkaProducer.writeOHLCs(this.config.kafka_ohlc_topic, page, metadata, isLastPage);
|
||||
}
|
||||
break;
|
||||
this.logger.info({ jobId, requestId, ticker, count: candles.length, pages: Math.ceil(candles.length / PAGE_SIZE) }, 'Wrote all pages to Kafka');
|
||||
} else {
|
||||
await this.kafkaProducer.writeMarker(this.config.kafka_ohlc_topic, {
|
||||
request_id: requestId, client_id, ticker, period_seconds, start_time, end_time,
|
||||
status: 'NOT_FOUND', message: 'No data available for requested period'
|
||||
});
|
||||
}
|
||||
|
||||
case 'SHUTDOWN':
|
||||
this.logger.info('Received shutdown signal');
|
||||
await this.shutdown();
|
||||
break;
|
||||
this.logger.info({ jobId, requestId, ticker }, 'Historical request complete — sending WorkComplete');
|
||||
await this.zmqClient.sendComplete(jobId, true);
|
||||
|
||||
case 'CONFIG_UPDATE':
|
||||
// Handle config update if needed
|
||||
this.logger.info('Received config update');
|
||||
break;
|
||||
} catch (error) {
|
||||
this.logger.error({ jobId, requestId, ticker, error: error.message }, 'Historical request failed');
|
||||
|
||||
case 'HEARTBEAT':
|
||||
// Just acknowledge heartbeat
|
||||
break;
|
||||
try {
|
||||
await this.kafkaProducer.writeMarker(this.config.kafka_ohlc_topic, {
|
||||
request_id: requestId, client_id, ticker, period_seconds, start_time, end_time,
|
||||
status: 'ERROR', error_message: error.message
|
||||
});
|
||||
} catch (kafkaErr) {
|
||||
this.logger.error({ jobId, error: kafkaErr.message }, 'Failed to write error marker to Kafka');
|
||||
}
|
||||
|
||||
default:
|
||||
this.logger.warn({ action }, 'Unknown control action');
|
||||
await this.zmqClient.sendComplete(jobId, false, error.message);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Get worker status
|
||||
* Start realtime tick polling for a job dispatched by Flink.
|
||||
* Ticks flow: exchange → Kafka market-tick → Flink → OHLC bars → clients.
|
||||
*/
|
||||
handleRealtimeRequest(request) {
|
||||
const { jobId, requestId, ticker } = request;
|
||||
this.logger.info({ jobId, requestId, ticker }, 'Processing realtime subscription request');
|
||||
|
||||
this.activeRealtime.add(jobId);
|
||||
this.realtimePoller.startSubscription(jobId, requestId, ticker, this.config.kafka_tick_topic);
|
||||
}
|
||||
|
||||
getStatus() {
|
||||
return {
|
||||
activeRequests: this.activeRequests.size,
|
||||
maxConcurrent: this.config.max_concurrent,
|
||||
activeRealtime: this.activeRealtime.size,
|
||||
pollerStats: this.realtimePoller.getStats(),
|
||||
metadataStatus: this.metadataGenerator.getStatus()
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Shutdown worker gracefully
|
||||
*/
|
||||
async shutdown() {
|
||||
if (this.isShutdown) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (this.isShutdown) return;
|
||||
this.isShutdown = true;
|
||||
this.logger.info('Shutting down ingestor worker');
|
||||
|
||||
// Stop metadata generation interval
|
||||
if (this.metadataInterval) {
|
||||
clearInterval(this.metadataInterval);
|
||||
}
|
||||
if (this.metadataInterval) clearInterval(this.metadataInterval);
|
||||
|
||||
// Stop polling
|
||||
this.realtimePoller.shutdown();
|
||||
|
||||
// Close connections
|
||||
await this.ccxtFetcher.close();
|
||||
await this.metadataGenerator.close();
|
||||
await this.kafkaProducer.disconnect();
|
||||
@@ -430,31 +263,23 @@ async function main() {
|
||||
const config = loadConfig();
|
||||
const worker = new IngestorWorker(config, logger);
|
||||
|
||||
// Handle shutdown signals
|
||||
process.on('SIGINT', () => worker.shutdown());
|
||||
process.on('SIGTERM', () => worker.shutdown());
|
||||
|
||||
// Handle errors
|
||||
process.on('uncaughtException', error => {
|
||||
logger.error({ error }, 'Uncaught exception');
|
||||
worker.shutdown();
|
||||
});
|
||||
|
||||
process.on('unhandledRejection', (reason, promise) => {
|
||||
process.on('unhandledRejection', reason => {
|
||||
logger.error({ reason }, 'Unhandled rejection');
|
||||
});
|
||||
|
||||
// Start worker
|
||||
await worker.start();
|
||||
|
||||
// Log status periodically
|
||||
setInterval(() => {
|
||||
const status = worker.getStatus();
|
||||
logger.info({ status }, 'Worker status');
|
||||
logger.info({ status: worker.getStatus() }, 'Worker status');
|
||||
}, 60000);
|
||||
}
|
||||
|
||||
// Run
|
||||
main().catch(error => {
|
||||
logger.error({ error }, 'Fatal error');
|
||||
process.exit(1);
|
||||
|
||||
@@ -116,12 +116,17 @@ export class KafkaProducer {
|
||||
}
|
||||
|
||||
/**
|
||||
* Write multiple OHLC candles to Kafka as an OHLCBatch message
|
||||
* Uses protobuf encoding with metadata in batch wrapper
|
||||
* Write multiple OHLC candles to Kafka as an OHLCBatch message.
|
||||
*
|
||||
* Historical mode: pass explicit metadata and isLastPage flag.
|
||||
* Realtime mode: omit metadata (null/undefined) — writes individual OHLC messages instead.
|
||||
*
|
||||
* @param {string} topic - Kafka topic name
|
||||
* @param {Array<object>} ohlcData - Array of OHLC data objects (may include __metadata in first record)
|
||||
* @param {Array<object>} ohlcData - Array of OHLC candle objects
|
||||
* @param {object|null} metadata - Request metadata for historical batches; null for realtime
|
||||
* @param {boolean} isLastPage - True if this is the final page of a historical query
|
||||
*/
|
||||
async writeOHLCs(topic, ohlcData) {
|
||||
async writeOHLCs(topic, ohlcData, metadata = null, isLastPage = false) {
|
||||
if (!this.isConnected) {
|
||||
throw new Error('Kafka producer not connected');
|
||||
}
|
||||
@@ -130,12 +135,8 @@ export class KafkaProducer {
|
||||
return;
|
||||
}
|
||||
|
||||
// Extract metadata from first record if present
|
||||
const firstCandle = ohlcData[0];
|
||||
const metadata = firstCandle.__metadata;
|
||||
|
||||
if (!metadata) {
|
||||
// No metadata - write individual OHLC messages (realtime mode)
|
||||
// Realtime mode — write individual OHLC messages (no batch wrapper)
|
||||
const messages = ohlcData.map(candle => {
|
||||
const protoCandle = {
|
||||
timestamp: candle.timestamp,
|
||||
@@ -156,10 +157,7 @@ export class KafkaProducer {
|
||||
};
|
||||
});
|
||||
|
||||
await this.producer.send({
|
||||
topic,
|
||||
messages
|
||||
});
|
||||
await this.producer.send({ topic, messages });
|
||||
|
||||
this.logger.debug(
|
||||
{ count: ohlcData.length, topic, type: 'individual' },
|
||||
@@ -168,7 +166,7 @@ export class KafkaProducer {
|
||||
return;
|
||||
}
|
||||
|
||||
// Historical mode - write as OHLCBatch with metadata
|
||||
// Historical mode — write as OHLCBatch with metadata
|
||||
const batch = {
|
||||
metadata: {
|
||||
requestId: metadata.request_id,
|
||||
@@ -178,7 +176,8 @@ export class KafkaProducer {
|
||||
startTime: metadata.start_time,
|
||||
endTime: metadata.end_time,
|
||||
status: metadata.status || 'OK',
|
||||
errorMessage: metadata.error_message
|
||||
errorMessage: metadata.error_message,
|
||||
isLastPage
|
||||
},
|
||||
rows: ohlcData.map(candle => {
|
||||
const row = {
|
||||
@@ -194,22 +193,16 @@ export class KafkaProducer {
|
||||
})
|
||||
};
|
||||
|
||||
// Encode as protobuf OHLCBatch with ZMQ envelope
|
||||
const [frame1, frame2] = encodeMessage(MessageTypeId.OHLC_BATCH, batch, OHLCBatch);
|
||||
const value = Buffer.concat([frame1, frame2]);
|
||||
|
||||
await this.producer.send({
|
||||
topic,
|
||||
messages: [
|
||||
{
|
||||
key: metadata.ticker,
|
||||
value
|
||||
}
|
||||
]
|
||||
messages: [{ key: metadata.ticker, value }]
|
||||
});
|
||||
|
||||
this.logger.debug(
|
||||
{ request_id: metadata.request_id, count: ohlcData.length, topic, type: 'batch' },
|
||||
{ request_id: metadata.request_id, count: ohlcData.length, isLastPage, topic },
|
||||
'Wrote OHLCBatch to Kafka'
|
||||
);
|
||||
}
|
||||
@@ -225,7 +218,8 @@ export class KafkaProducer {
|
||||
throw new Error('Kafka producer not connected');
|
||||
}
|
||||
|
||||
// Create an empty OHLCBatch with status in metadata
|
||||
// Create an empty OHLCBatch with status in metadata.
|
||||
// Markers are always the terminal message for a request (is_last_page = true).
|
||||
const batch = {
|
||||
metadata: {
|
||||
requestId: marker.request_id,
|
||||
@@ -235,7 +229,8 @@ export class KafkaProducer {
|
||||
startTime: marker.start_time,
|
||||
endTime: marker.end_time,
|
||||
status: marker.status, // 'NOT_FOUND' or 'ERROR'
|
||||
errorMessage: marker.error_message || marker.message
|
||||
errorMessage: marker.error_message || marker.message,
|
||||
isLastPage: true
|
||||
},
|
||||
rows: [] // Empty rows array indicates marker message
|
||||
};
|
||||
|
||||
@@ -1,33 +1,40 @@
|
||||
// Realtime tick data poller using 10-second polling
|
||||
// Realtime tick data poller — polls exchange every 10s, writes ticks to market-tick Kafka topic.
|
||||
// Heartbeats every 5s so Flink IngestorBroker knows the job is alive.
|
||||
export class RealtimePoller {
|
||||
constructor(ccxtFetcher, kafkaProducer, logger) {
|
||||
constructor(ccxtFetcher, kafkaProducer, zmqClient, logger) {
|
||||
this.ccxtFetcher = ccxtFetcher;
|
||||
this.kafkaProducer = kafkaProducer;
|
||||
this.zmqClient = zmqClient;
|
||||
this.logger = logger;
|
||||
|
||||
// Active subscriptions: requestId -> subscription info
|
||||
// Active subscriptions: jobId -> subscription info
|
||||
this.subscriptions = new Map();
|
||||
|
||||
// Poll interval in milliseconds (10 seconds)
|
||||
this.pollInterval = 10000;
|
||||
|
||||
// Main polling loop
|
||||
// Heartbeat interval (5 seconds)
|
||||
this.heartbeatInterval = 5000;
|
||||
|
||||
this.pollingLoop = null;
|
||||
this.heartbeatLoop = null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Start a realtime subscription
|
||||
* @param {string} requestId - Unique request ID
|
||||
* @param {string} ticker - Ticker to subscribe to
|
||||
* @param {string} kafkaTopic - Kafka topic to write to
|
||||
* Start a realtime subscription for a job dispatched by IngestorBroker.
|
||||
* @param {string} jobId - Broker-assigned job ID (for heartbeats and COMPLETE)
|
||||
* @param {string} requestId - Original request ID (for metadata)
|
||||
* @param {string} ticker - Ticker to subscribe to
|
||||
* @param {string} kafkaTopic - Kafka topic to write ticks to (market-tick)
|
||||
*/
|
||||
startSubscription(requestId, ticker, kafkaTopic) {
|
||||
if (this.subscriptions.has(requestId)) {
|
||||
this.logger.warn({ requestId }, 'Subscription already exists');
|
||||
startSubscription(jobId, requestId, ticker, kafkaTopic) {
|
||||
if (this.subscriptions.has(jobId)) {
|
||||
this.logger.warn({ jobId }, 'Subscription already exists');
|
||||
return;
|
||||
}
|
||||
|
||||
const subscription = {
|
||||
jobId,
|
||||
requestId,
|
||||
ticker,
|
||||
kafkaTopic,
|
||||
@@ -36,93 +43,81 @@ export class RealtimePoller {
|
||||
errorCount: 0
|
||||
};
|
||||
|
||||
this.subscriptions.set(requestId, subscription);
|
||||
this.subscriptions.set(jobId, subscription);
|
||||
this.logger.info({ jobId, requestId, ticker, kafkaTopic }, 'Started realtime subscription');
|
||||
|
||||
this.logger.info(
|
||||
{ requestId, ticker, kafkaTopic },
|
||||
'Started realtime subscription'
|
||||
);
|
||||
|
||||
// Start polling loop if not already running
|
||||
if (!this.pollingLoop) {
|
||||
this.startPollingLoop();
|
||||
}
|
||||
if (!this.heartbeatLoop) {
|
||||
this.startHeartbeatLoop();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Cancel a realtime subscription
|
||||
* @param {string} requestId - Request ID to cancel
|
||||
* Stop a realtime subscription. Called when Flink sends WorkStop or on error.
|
||||
* Does NOT send WorkComplete — caller is responsible for that.
|
||||
*/
|
||||
cancelSubscription(requestId) {
|
||||
const subscription = this.subscriptions.get(requestId);
|
||||
cancelSubscription(jobId) {
|
||||
const subscription = this.subscriptions.get(jobId);
|
||||
if (subscription) {
|
||||
subscription.isActive = false;
|
||||
this.subscriptions.delete(requestId);
|
||||
|
||||
this.logger.info(
|
||||
{ requestId, ticker: subscription.ticker },
|
||||
'Cancelled realtime subscription'
|
||||
);
|
||||
this.subscriptions.delete(jobId);
|
||||
this.logger.info({ jobId, ticker: subscription.ticker }, 'Cancelled realtime subscription');
|
||||
}
|
||||
|
||||
// Stop polling loop if no active subscriptions
|
||||
if (this.subscriptions.size === 0 && this.pollingLoop) {
|
||||
clearInterval(this.pollingLoop);
|
||||
this.pollingLoop = null;
|
||||
this.logger.info('Stopped polling loop - no active subscriptions');
|
||||
if (this.subscriptions.size === 0) {
|
||||
if (this.pollingLoop) {
|
||||
clearInterval(this.pollingLoop);
|
||||
this.pollingLoop = null;
|
||||
}
|
||||
if (this.heartbeatLoop) {
|
||||
clearInterval(this.heartbeatLoop);
|
||||
this.heartbeatLoop = null;
|
||||
}
|
||||
this.logger.info('Stopped polling/heartbeat loops — no active subscriptions');
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Start the main polling loop
|
||||
*/
|
||||
startPollingLoop() {
|
||||
this.logger.info({ interval: this.pollInterval }, 'Starting polling loop');
|
||||
|
||||
this.pollingLoop = setInterval(async () => {
|
||||
await this.pollAllSubscriptions();
|
||||
}, this.pollInterval);
|
||||
|
||||
// Do an immediate poll
|
||||
this.pollingLoop = setInterval(() => this.pollAllSubscriptions(), this.pollInterval);
|
||||
// Immediate first poll
|
||||
this.pollAllSubscriptions();
|
||||
}
|
||||
|
||||
/**
|
||||
* Poll all active subscriptions
|
||||
*/
|
||||
async pollAllSubscriptions() {
|
||||
const subscriptions = Array.from(this.subscriptions.values());
|
||||
|
||||
// Poll subscriptions in parallel
|
||||
await Promise.allSettled(
|
||||
subscriptions.map(sub => this.pollSubscription(sub))
|
||||
);
|
||||
startHeartbeatLoop() {
|
||||
this.logger.info({ interval: this.heartbeatInterval }, 'Starting heartbeat loop');
|
||||
this.heartbeatLoop = setInterval(async () => {
|
||||
for (const { jobId } of this.subscriptions.values()) {
|
||||
try {
|
||||
await this.zmqClient.sendHeartbeat(jobId);
|
||||
} catch (err) {
|
||||
this.logger.error({ jobId, error: err.message }, 'Failed to send heartbeat');
|
||||
}
|
||||
}
|
||||
}, this.heartbeatInterval);
|
||||
}
|
||||
|
||||
/**
|
||||
* Poll a single subscription
|
||||
* @param {object} subscription - Subscription object
|
||||
*/
|
||||
async pollSubscription(subscription) {
|
||||
if (!subscription.isActive) {
|
||||
return;
|
||||
}
|
||||
async pollAllSubscriptions() {
|
||||
const subscriptions = Array.from(this.subscriptions.values());
|
||||
await Promise.allSettled(subscriptions.map(sub => this.pollSubscription(sub)));
|
||||
}
|
||||
|
||||
const { requestId, ticker, kafkaTopic, lastTimestamp } = subscription;
|
||||
async pollSubscription(subscription) {
|
||||
if (!subscription.isActive) return;
|
||||
|
||||
const { jobId, requestId, ticker, kafkaTopic, lastTimestamp } = subscription;
|
||||
|
||||
try {
|
||||
// Fetch trades since last timestamp
|
||||
const trades = await this.ccxtFetcher.fetchRecentTrades(
|
||||
ticker,
|
||||
lastTimestamp
|
||||
);
|
||||
const trades = await this.ccxtFetcher.fetchRecentTrades(ticker, lastTimestamp);
|
||||
|
||||
if (trades.length === 0) {
|
||||
this.logger.debug({ requestId, ticker }, 'No new trades');
|
||||
this.logger.debug({ jobId, ticker }, 'No new trades');
|
||||
return;
|
||||
}
|
||||
|
||||
// Filter out trades we've already seen
|
||||
// Skip trades we've already seen (timestamp-based dedup)
|
||||
let newTrades = trades;
|
||||
if (lastTimestamp) {
|
||||
const lastTs = BigInt(lastTimestamp);
|
||||
@@ -130,88 +125,59 @@ export class RealtimePoller {
|
||||
}
|
||||
|
||||
if (newTrades.length > 0) {
|
||||
// Write trades to Kafka
|
||||
await this.kafkaProducer.writeTicks(kafkaTopic, newTrades);
|
||||
|
||||
// Update last timestamp
|
||||
const latestTrade = newTrades[newTrades.length - 1];
|
||||
subscription.lastTimestamp = latestTrade.timestamp;
|
||||
|
||||
this.logger.info(
|
||||
{
|
||||
requestId,
|
||||
ticker,
|
||||
count: newTrades.length,
|
||||
kafkaTopic
|
||||
},
|
||||
'Wrote new trades to Kafka'
|
||||
);
|
||||
subscription.lastTimestamp = newTrades[newTrades.length - 1].timestamp;
|
||||
this.logger.info({ jobId, ticker, count: newTrades.length, kafkaTopic }, 'Wrote ticks to Kafka');
|
||||
}
|
||||
|
||||
// Reset error count on success
|
||||
subscription.errorCount = 0;
|
||||
} catch (error) {
|
||||
subscription.errorCount++;
|
||||
|
||||
this.logger.error(
|
||||
{
|
||||
error: error.message,
|
||||
requestId,
|
||||
ticker,
|
||||
errorCount: subscription.errorCount
|
||||
},
|
||||
{ error: error.message, jobId, ticker, errorCount: subscription.errorCount },
|
||||
'Error polling subscription'
|
||||
);
|
||||
|
||||
// Cancel subscription after too many errors
|
||||
// After 5 consecutive errors, give up and notify Flink
|
||||
if (subscription.errorCount >= 5) {
|
||||
this.logger.error(
|
||||
{ requestId, ticker },
|
||||
'Cancelling subscription due to repeated errors'
|
||||
);
|
||||
this.cancelSubscription(requestId);
|
||||
this.logger.error({ jobId, ticker }, 'Cancelling subscription due to repeated errors');
|
||||
this.cancelSubscription(jobId);
|
||||
try {
|
||||
await this.zmqClient.sendComplete(jobId, false, `Polling failed after 5 errors: ${error.message}`);
|
||||
} catch (zmqErr) {
|
||||
this.logger.error({ jobId, error: zmqErr.message }, 'Failed to send WorkComplete after error');
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Get subscription statistics
|
||||
*/
|
||||
getStats() {
|
||||
const stats = {
|
||||
return {
|
||||
totalSubscriptions: this.subscriptions.size,
|
||||
subscriptions: []
|
||||
};
|
||||
|
||||
for (const [requestId, sub] of this.subscriptions) {
|
||||
stats.subscriptions.push({
|
||||
requestId,
|
||||
subscriptions: Array.from(this.subscriptions.values()).map(sub => ({
|
||||
jobId: sub.jobId,
|
||||
requestId: sub.requestId,
|
||||
ticker: sub.ticker,
|
||||
isActive: sub.isActive,
|
||||
errorCount: sub.errorCount,
|
||||
lastTimestamp: sub.lastTimestamp
|
||||
});
|
||||
}
|
||||
|
||||
return stats;
|
||||
}))
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Shutdown poller and cancel all subscriptions
|
||||
*/
|
||||
shutdown() {
|
||||
this.logger.info('Shutting down realtime poller');
|
||||
|
||||
if (this.pollingLoop) {
|
||||
clearInterval(this.pollingLoop);
|
||||
this.pollingLoop = null;
|
||||
}
|
||||
|
||||
// Mark all subscriptions as inactive
|
||||
if (this.heartbeatLoop) {
|
||||
clearInterval(this.heartbeatLoop);
|
||||
this.heartbeatLoop = null;
|
||||
}
|
||||
for (const subscription of this.subscriptions.values()) {
|
||||
subscription.isActive = false;
|
||||
}
|
||||
|
||||
this.subscriptions.clear();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,116 +1,204 @@
|
||||
// ZeroMQ client for connecting to Flink control channels
|
||||
// ZeroMQ DEALER client connecting to Flink IngestorBroker (ROUTER, port 5567)
|
||||
import * as zmq from 'zeromq';
|
||||
import { decodeMessage } from './proto/messages.js';
|
||||
import {
|
||||
DataRequest,
|
||||
WorkerReady, WorkComplete, WorkHeartbeat, WorkReject, WorkStop,
|
||||
MessageTypeId, PROTOCOL_VERSION
|
||||
} from './proto/messages.js';
|
||||
|
||||
const PROTOCOL_VERSION_BUF = Buffer.from([PROTOCOL_VERSION]);
|
||||
|
||||
/**
|
||||
* Encodes a broker protocol message for sending over DEALER.
|
||||
* Frame layout (DEALER → ROUTER):
|
||||
* Frame 0: empty delimiter (required for ROUTER peering)
|
||||
* Frame 1: [0x01] version
|
||||
* Frame 2: [typeId][protobuf bytes]
|
||||
*/
|
||||
function encodeBrokerMessage(typeId, messageData, MessageType) {
|
||||
const protoBytes = MessageType.encode(MessageType.create(messageData)).finish();
|
||||
const frame2 = Buffer.concat([Buffer.from([typeId]), Buffer.from(protoBytes)]);
|
||||
return [Buffer.alloc(0), PROTOCOL_VERSION_BUF, frame2];
|
||||
}
|
||||
|
||||
export class ZmqClient {
|
||||
constructor(config, logger) {
|
||||
this.config = config;
|
||||
this.logger = logger;
|
||||
|
||||
// Work queue - SUB socket to receive data requests with exchange prefix filtering
|
||||
this.workSocket = null;
|
||||
|
||||
// NOTE: NO RESPONSE SOCKET - Async architecture via Kafka!
|
||||
// Ingestors write data to Kafka only
|
||||
// Flink processes and publishes notifications
|
||||
|
||||
this.dealerSocket = null;
|
||||
this.isShutdown = false;
|
||||
this.supportedExchanges = config.supported_exchanges || ['BINANCE', 'COINBASE'];
|
||||
this.activeJobId = null;
|
||||
this._idleHeartbeatInterval = null;
|
||||
|
||||
this.supportedExchanges = (config.supported_exchanges || ['BINANCE', 'COINBASE'])
|
||||
.map(e => e.toUpperCase());
|
||||
|
||||
// Callbacks set by IngestorWorker
|
||||
this.onWorkAssign = null; // (DataRequest) => void
|
||||
this.onWorkStop = null; // (jobId) => void
|
||||
}
|
||||
|
||||
/**
|
||||
* Connect to Relay ZMQ endpoints
|
||||
* Connect DEALER socket to Flink IngestorBroker (ROUTER).
|
||||
* Sends WorkerReady immediately so Flink knows this worker is available.
|
||||
*/
|
||||
async connect() {
|
||||
const { flink_hostname, ingestor_work_port } = this.config;
|
||||
const { flink_hostname, ingestor_broker_port = 5567 } = this.config;
|
||||
|
||||
// Connect to work queue (SUB with exchange prefix filtering)
|
||||
this.workSocket = new zmq.Subscriber();
|
||||
const workEndpoint = `tcp://${flink_hostname}:${ingestor_work_port}`;
|
||||
await this.workSocket.connect(workEndpoint);
|
||||
this.dealerSocket = new zmq.Dealer();
|
||||
const endpoint = `tcp://${flink_hostname}:${ingestor_broker_port}`;
|
||||
await this.dealerSocket.connect(endpoint);
|
||||
this.logger.info(`Connected DEALER to Flink IngestorBroker at ${endpoint}`);
|
||||
|
||||
// Subscribe to each supported exchange suffix (Nautilus format: "BTC/USDT.BINANCE")
|
||||
for (const exchange of this.supportedExchanges) {
|
||||
const prefix = `${exchange}.`;
|
||||
this.workSocket.subscribe(prefix);
|
||||
this.logger.info(`Subscribed to exchange prefix: ${prefix}`);
|
||||
}
|
||||
this.logger.info(`Connected to work queue at ${workEndpoint}`);
|
||||
this.logger.info('ASYNC MODE: No response socket - data flows via Kafka → Flink → pub/sub notification');
|
||||
// Register as available
|
||||
await this.sendReady();
|
||||
|
||||
// Periodically re-send WorkerReady when idle, to recover from missed initial registration
|
||||
this._idleHeartbeatInterval = setInterval(() => {
|
||||
if (this.activeJobId === null && !this.isShutdown) {
|
||||
this.sendReady().catch(err =>
|
||||
this.logger.warn({ error: err.message }, 'Failed to re-send WorkerReady'));
|
||||
}
|
||||
}, 30_000);
|
||||
|
||||
// Start receiving work in background
|
||||
this._receiveLoop();
|
||||
}
|
||||
|
||||
/**
|
||||
* Pull a data request from the work queue
|
||||
* @returns {Promise<object>} Decoded DataRequest message
|
||||
* Send WorkerReady — called on connect and after each COMPLETE.
|
||||
*/
|
||||
async pullDataRequest() {
|
||||
if (this.isShutdown) {
|
||||
return null;
|
||||
}
|
||||
async sendReady() {
|
||||
const frames = encodeBrokerMessage(
|
||||
MessageTypeId.WORKER_READY,
|
||||
{ exchanges: this.supportedExchanges },
|
||||
WorkerReady
|
||||
);
|
||||
await this.dealerSocket.send(frames);
|
||||
this.logger.info({ exchanges: this.supportedExchanges }, 'Sent WorkerReady');
|
||||
}
|
||||
|
||||
/**
|
||||
* Send WorkComplete after a historical job finishes.
|
||||
* Automatically sends WorkerReady so Flink returns us to the free pool.
|
||||
*/
|
||||
async sendComplete(jobId, success, errorMessage) {
|
||||
this.activeJobId = null;
|
||||
const frames = encodeBrokerMessage(
|
||||
MessageTypeId.WORK_COMPLETE,
|
||||
{
|
||||
jobId,
|
||||
success,
|
||||
...(errorMessage ? { errorMessage } : {})
|
||||
},
|
||||
WorkComplete
|
||||
);
|
||||
await this.dealerSocket.send(frames);
|
||||
this.logger.info({ jobId, success }, 'Sent WorkComplete');
|
||||
|
||||
// Return to free pool
|
||||
await this.sendReady();
|
||||
}
|
||||
|
||||
/**
|
||||
* Send WorkHeartbeat for an active realtime job.
|
||||
*/
|
||||
async sendHeartbeat(jobId) {
|
||||
const frames = encodeBrokerMessage(
|
||||
MessageTypeId.WORK_HEARTBEAT,
|
||||
{ jobId },
|
||||
WorkHeartbeat
|
||||
);
|
||||
await this.dealerSocket.send(frames);
|
||||
this.logger.debug({ jobId }, 'Sent WorkHeartbeat');
|
||||
}
|
||||
|
||||
/**
|
||||
* Send WorkReject if we cannot handle the dispatched job.
|
||||
*/
|
||||
async sendReject(jobId, reason) {
|
||||
const frames = encodeBrokerMessage(
|
||||
MessageTypeId.WORK_REJECT,
|
||||
{ jobId, reason },
|
||||
WorkReject
|
||||
);
|
||||
await this.dealerSocket.send(frames);
|
||||
this.logger.warn({ jobId, reason }, 'Sent WorkReject');
|
||||
}
|
||||
|
||||
/**
|
||||
* Background loop: receive WorkAssign (DataRequest) or WorkStop from Flink.
|
||||
* ROUTER→DEALER frame layout: [empty][version][typeId+payload]
|
||||
*/
|
||||
async _receiveLoop() {
|
||||
try {
|
||||
const frames = await this.workSocket.receive();
|
||||
this.logger.info({
|
||||
frameCount: frames.length,
|
||||
frame0Len: frames[0]?.length,
|
||||
frame1Len: frames[1]?.length,
|
||||
frame2Len: frames[2]?.length,
|
||||
frame0: frames[0]?.toString('utf8').substring(0, 50),
|
||||
frame1Hex: frames[1]?.toString('hex').substring(0, 20),
|
||||
frame2Hex: frames[2]?.toString('hex').substring(0, 20)
|
||||
}, 'Received raw ZMQ frames');
|
||||
for await (const frames of this.dealerSocket) {
|
||||
if (this.isShutdown) break;
|
||||
|
||||
// First frame is the topic (exchange prefix), skip it
|
||||
// Remaining frames are: [version_frame, message_frame]
|
||||
if (frames.length < 3) {
|
||||
this.logger.warn({ frameCount: frames.length }, 'Unexpected frame count');
|
||||
return null;
|
||||
try {
|
||||
// frames[0] = empty delimiter, frames[1] = version, frames[2] = type+payload
|
||||
if (frames.length < 3) {
|
||||
this.logger.warn({ frameCount: frames.length }, 'Unexpected frame count from broker');
|
||||
continue;
|
||||
}
|
||||
|
||||
const versionByte = frames[1][0];
|
||||
if (versionByte !== PROTOCOL_VERSION) {
|
||||
this.logger.warn({ versionByte }, 'Unexpected protocol version from broker');
|
||||
continue;
|
||||
}
|
||||
|
||||
const typeId = frames[2][0];
|
||||
const payload = frames[2].slice(1);
|
||||
|
||||
if (typeId === MessageTypeId.WORK_ASSIGN) {
|
||||
// DataRequest protobuf
|
||||
const request = DataRequest.decode(payload);
|
||||
const req = DataRequest.toObject(request, {
|
||||
longs: String, enums: String, bytes: Buffer
|
||||
});
|
||||
this.activeJobId = req.jobId;
|
||||
this.logger.info(
|
||||
{ jobId: req.jobId, requestId: req.requestId, type: req.type, ticker: req.ticker },
|
||||
'Received WorkAssign from broker'
|
||||
);
|
||||
if (this.onWorkAssign) {
|
||||
this.onWorkAssign(req);
|
||||
}
|
||||
|
||||
} else if (typeId === MessageTypeId.WORK_STOP) {
|
||||
const stop = WorkStop.decode(payload);
|
||||
const { jobId } = WorkStop.toObject(stop);
|
||||
this.logger.info({ jobId }, 'Received WorkStop from broker');
|
||||
if (this.onWorkStop) {
|
||||
this.onWorkStop(jobId);
|
||||
}
|
||||
|
||||
} else {
|
||||
this.logger.warn({ typeId: `0x${typeId.toString(16)}` }, 'Unknown message type from broker');
|
||||
}
|
||||
|
||||
} catch (err) {
|
||||
this.logger.error({ error: err.message }, 'Error processing broker message');
|
||||
}
|
||||
}
|
||||
const messageFrames = frames.slice(1); // Skip topic, keep version + message
|
||||
const { version, typeId, message } = decodeMessage(messageFrames);
|
||||
this.logger.info({
|
||||
version,
|
||||
typeId: `0x${typeId.toString(16)}`,
|
||||
requestId: message.requestId,
|
||||
type: message.type,
|
||||
typeOf: typeof message.type,
|
||||
ticker: message.ticker
|
||||
}, 'Decoded data request');
|
||||
return message;
|
||||
} catch (error) {
|
||||
} catch (err) {
|
||||
if (!this.isShutdown) {
|
||||
this.logger.error({ error: error.message, stack: error.stack }, 'Error receiving data request');
|
||||
this.logger.error({ error: err.message }, 'DEALER receive loop error');
|
||||
}
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Start listening for control messages in the background
|
||||
* @param {Function} handler - Callback function to handle control messages
|
||||
*
|
||||
* NOTE: Control channel not implemented yet. This is a stub for future use.
|
||||
* For now, just log and ignore.
|
||||
*/
|
||||
startControlListener(handler) {
|
||||
this.logger.info('Control channel listener stub - not implemented yet');
|
||||
// TODO: Implement control channel when needed
|
||||
// Control messages would be used for:
|
||||
// - Canceling realtime subscriptions
|
||||
// - Graceful shutdown signals
|
||||
// - Configuration updates
|
||||
}
|
||||
|
||||
/**
|
||||
* Shutdown and close connections
|
||||
*/
|
||||
async shutdown() {
|
||||
this.isShutdown = true;
|
||||
this.logger.info('Shutting down ZMQ connections');
|
||||
|
||||
if (this.workSocket) {
|
||||
await this.workSocket.close();
|
||||
if (this._idleHeartbeatInterval) {
|
||||
clearInterval(this._idleHeartbeatInterval);
|
||||
this._idleHeartbeatInterval = null;
|
||||
}
|
||||
this.logger.info('Shutting down ZMQ DEALER connection');
|
||||
if (this.dealerSocket) {
|
||||
this.dealerSocket.close();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -24,6 +24,9 @@ message DataRequest {
|
||||
// Flink uses this to determine notification topic
|
||||
optional string client_id = 6;
|
||||
|
||||
// Job ID assigned by the IngestorBroker for work tracking and heartbeating
|
||||
optional string job_id = 7;
|
||||
|
||||
enum RequestType {
|
||||
HISTORICAL_OHLC = 0;
|
||||
REALTIME_TICKS = 1;
|
||||
@@ -327,3 +330,40 @@ message FieldValue {
|
||||
uint64 timestamp_val = 6;
|
||||
}
|
||||
}
|
||||
|
||||
// ─── Ingestor Broker Protocol (Flink ROUTER ↔ Ingestor DEALER, port 5567) ───
|
||||
// Message type IDs 0x20–0x25
|
||||
|
||||
// Ingestor → Flink: register as available (type 0x20)
|
||||
// Sent on DEALER connect and after every COMPLETE.
|
||||
message WorkerReady {
|
||||
// Exchanges this ingestor supports (e.g. ["BINANCE", "COINBASE"])
|
||||
repeated string exchanges = 1;
|
||||
}
|
||||
|
||||
// Ingestor → Flink: historical job finished (type 0x21)
|
||||
message WorkComplete {
|
||||
string job_id = 1;
|
||||
bool success = 2;
|
||||
optional string error_message = 3;
|
||||
}
|
||||
|
||||
// Ingestor → Flink: realtime job still alive — sent every 5s (type 0x22)
|
||||
message WorkHeartbeat {
|
||||
string job_id = 1;
|
||||
}
|
||||
|
||||
// Ingestor → Flink: unable to handle this job (type 0x23)
|
||||
message WorkReject {
|
||||
string job_id = 1;
|
||||
string reason = 2;
|
||||
}
|
||||
|
||||
// Flink → Ingestor: dispatch a job — wraps DataRequest (type 0x24)
|
||||
// DataRequest.job_id is populated by IngestorBroker
|
||||
// (DataRequest itself is type 0x01; this is the framing type for broker dispatch)
|
||||
|
||||
// Flink → Ingestor: stop a realtime job (type 0x25)
|
||||
message WorkStop {
|
||||
string job_id = 1;
|
||||
}
|
||||
@@ -58,4 +58,8 @@ message OHLCBatchMetadata {
|
||||
|
||||
// Error message if status is ERROR
|
||||
optional string error_message = 8;
|
||||
|
||||
// True on the final page of a historical query (including error/not-found markers).
|
||||
// Flink publishes HistoryReadyNotification only when this is true.
|
||||
bool is_last_page = 9;
|
||||
}
|
||||
|
||||
@@ -16,14 +16,15 @@ pub struct Config {
|
||||
#[serde(default = "default_market_data_pub_port")]
|
||||
pub market_data_pub_port: u16,
|
||||
|
||||
/// Ingestor work queue port (PUB - publish work with exchange prefix)
|
||||
#[serde(default = "default_ingestor_work_port")]
|
||||
pub ingestor_work_port: u16,
|
||||
|
||||
/// Flink market data endpoint (XSUB - relay subscribes to Flink)
|
||||
/// Flink market data endpoint (XSUB - relay subscribes to Flink XPUB)
|
||||
#[serde(default = "default_flink_market_data_endpoint")]
|
||||
pub flink_market_data_endpoint: String,
|
||||
|
||||
/// Flink request endpoint (PUSH - relay forwards client requests to Flink PULL)
|
||||
/// Flink's IngestorBroker binds a PULL socket on port 5566
|
||||
#[serde(default = "default_flink_request_endpoint")]
|
||||
pub flink_request_endpoint: String,
|
||||
|
||||
/// Request timeout in seconds
|
||||
#[serde(default = "default_request_timeout_secs")]
|
||||
pub request_timeout_secs: u64,
|
||||
@@ -45,14 +46,14 @@ fn default_market_data_pub_port() -> u16 {
|
||||
5558
|
||||
}
|
||||
|
||||
fn default_ingestor_work_port() -> u16 {
|
||||
5555
|
||||
}
|
||||
|
||||
fn default_flink_market_data_endpoint() -> String {
|
||||
"tcp://flink-jobmanager:5558".to_string()
|
||||
}
|
||||
|
||||
fn default_flink_request_endpoint() -> String {
|
||||
"tcp://flink-jobmanager:5566".to_string()
|
||||
}
|
||||
|
||||
fn default_request_timeout_secs() -> u64 {
|
||||
30
|
||||
}
|
||||
@@ -67,8 +68,8 @@ impl Default for Config {
|
||||
bind_address: default_bind_address(),
|
||||
client_request_port: default_client_request_port(),
|
||||
market_data_pub_port: default_market_data_pub_port(),
|
||||
ingestor_work_port: default_ingestor_work_port(),
|
||||
flink_market_data_endpoint: default_flink_market_data_endpoint(),
|
||||
flink_request_endpoint: default_flink_request_endpoint(),
|
||||
request_timeout_secs: default_request_timeout_secs(),
|
||||
high_water_mark: default_hwm(),
|
||||
}
|
||||
|
||||
@@ -7,8 +7,6 @@ use tracing::{debug, error, info, warn};
|
||||
const PROTOCOL_VERSION: u8 = 0x01;
|
||||
const MSG_TYPE_SUBMIT_REQUEST: u8 = 0x10;
|
||||
const MSG_TYPE_SUBMIT_RESPONSE: u8 = 0x11;
|
||||
const MSG_TYPE_DATA_REQUEST: u8 = 0x01;
|
||||
const MSG_TYPE_HISTORY_READY: u8 = 0x12;
|
||||
|
||||
pub struct Relay {
|
||||
config: Config,
|
||||
@@ -26,24 +24,21 @@ impl Relay {
|
||||
}
|
||||
|
||||
pub async fn run(self) -> Result<()> {
|
||||
info!("Initializing Stateless ZMQ Relay");
|
||||
info!("Initializing ZMQ Relay");
|
||||
|
||||
// Bind sockets
|
||||
let client_request_socket = self.create_client_request_socket()?;
|
||||
let market_data_frontend = self.create_market_data_frontend()?;
|
||||
let market_data_backend = self.create_market_data_backend()?;
|
||||
let ingestor_work_socket = self.create_ingestor_work_socket()?;
|
||||
let flink_request_socket = self.create_flink_request_socket()?;
|
||||
|
||||
info!("All sockets initialized successfully - relay is STATELESS");
|
||||
info!("No pending requests tracked - all async via pub/sub");
|
||||
info!("All sockets initialized — relay forwards requests to Flink");
|
||||
|
||||
// Run main loop
|
||||
tokio::task::spawn_blocking(move || {
|
||||
Self::proxy_loop(
|
||||
client_request_socket,
|
||||
market_data_frontend,
|
||||
market_data_backend,
|
||||
ingestor_work_socket,
|
||||
flink_request_socket,
|
||||
)
|
||||
})
|
||||
.await?
|
||||
@@ -58,7 +53,6 @@ impl Relay {
|
||||
let endpoint = format!("{}:{}", self.config.bind_address, self.config.client_request_port);
|
||||
socket.bind(&endpoint)?;
|
||||
info!("Client request socket (ROUTER) bound to {}", endpoint);
|
||||
info!(" → Accepts SubmitHistoricalRequest, returns SubmitResponse immediately");
|
||||
|
||||
Ok(socket)
|
||||
}
|
||||
@@ -71,7 +65,7 @@ impl Relay {
|
||||
let endpoint = format!("{}:{}", self.config.bind_address, self.config.market_data_pub_port);
|
||||
socket.bind(&endpoint)?;
|
||||
info!("Market data frontend (XPUB) bound to {}", endpoint);
|
||||
info!(" → Clients subscribe here for HistoryReadyNotification and market data");
|
||||
info!(" → Clients subscribe here; subscription events forwarded to Flink for realtime activation");
|
||||
|
||||
Ok(socket)
|
||||
}
|
||||
@@ -82,20 +76,19 @@ impl Relay {
|
||||
|
||||
socket.connect(&self.config.flink_market_data_endpoint)?;
|
||||
info!("Market data backend (XSUB) connected to {}", self.config.flink_market_data_endpoint);
|
||||
info!(" → Receives HistoryReadyNotification and market data from Flink");
|
||||
info!(" → Receives market data and notifications from Flink");
|
||||
|
||||
Ok(socket)
|
||||
}
|
||||
|
||||
fn create_ingestor_work_socket(&self) -> Result<zmq::Socket> {
|
||||
let socket = self.context.socket(zmq::PUB)?;
|
||||
fn create_flink_request_socket(&self) -> Result<zmq::Socket> {
|
||||
let socket = self.context.socket(zmq::PUSH)?;
|
||||
socket.set_sndhwm(self.config.high_water_mark)?;
|
||||
socket.set_linger(1000)?;
|
||||
|
||||
let endpoint = format!("{}:{}", self.config.bind_address, self.config.ingestor_work_port);
|
||||
socket.bind(&endpoint)?;
|
||||
info!("Ingestor work queue (PUB) bound to {}", endpoint);
|
||||
info!(" → Publishes DataRequest with exchange prefix");
|
||||
socket.connect(&self.config.flink_request_endpoint)?;
|
||||
info!("Flink request socket (PUSH) connected to {}", self.config.flink_request_endpoint);
|
||||
info!(" → Forwards SubmitHistoricalRequest to Flink for dispatch to ingestors");
|
||||
|
||||
Ok(socket)
|
||||
}
|
||||
@@ -104,7 +97,7 @@ impl Relay {
|
||||
client_request_socket: zmq::Socket,
|
||||
market_data_frontend: zmq::Socket,
|
||||
market_data_backend: zmq::Socket,
|
||||
ingestor_work_socket: zmq::Socket,
|
||||
flink_request_socket: zmq::Socket,
|
||||
) -> Result<()> {
|
||||
let mut items = [
|
||||
client_request_socket.as_poll_item(zmq::POLLIN),
|
||||
@@ -112,10 +105,9 @@ impl Relay {
|
||||
market_data_backend.as_poll_item(zmq::POLLIN),
|
||||
];
|
||||
|
||||
info!("Entering stateless proxy loop");
|
||||
info!("Entering relay proxy loop");
|
||||
|
||||
loop {
|
||||
// Poll with 100ms timeout
|
||||
zmq::poll(&mut items, 100)
|
||||
.context("Failed to poll sockets")?;
|
||||
|
||||
@@ -123,21 +115,20 @@ impl Relay {
|
||||
if items[0].is_readable() {
|
||||
if let Err(e) = Self::handle_client_submission(
|
||||
&client_request_socket,
|
||||
&ingestor_work_socket,
|
||||
&flink_request_socket,
|
||||
) {
|
||||
error!("Error handling client submission: {}", e);
|
||||
}
|
||||
}
|
||||
|
||||
// Handle market data subscriptions from clients (XPUB → XSUB)
|
||||
// Proxy client subscription events → Flink (XPUB → XSUB)
|
||||
if items[1].is_readable() {
|
||||
if let Err(e) = Self::proxy_subscription(&market_data_frontend, &market_data_backend) {
|
||||
error!("Error proxying subscription: {}", e);
|
||||
}
|
||||
}
|
||||
|
||||
// Handle market data from Flink (XSUB → XPUB)
|
||||
// This includes HistoryReadyNotification and regular market data
|
||||
// Proxy market data from Flink → clients (XSUB → XPUB)
|
||||
if items[2].is_readable() {
|
||||
if let Err(e) = Self::proxy_market_data(&market_data_backend, &market_data_frontend) {
|
||||
error!("Error proxying market data: {}", e);
|
||||
@@ -148,7 +139,7 @@ impl Relay {
|
||||
|
||||
fn handle_client_submission(
|
||||
client_socket: &zmq::Socket,
|
||||
ingestor_socket: &zmq::Socket,
|
||||
flink_socket: &zmq::Socket,
|
||||
) -> Result<()> {
|
||||
// Receive from client: [identity][empty][version][message]
|
||||
let identity = client_socket.recv_bytes(0)?;
|
||||
@@ -177,7 +168,7 @@ impl Relay {
|
||||
identity,
|
||||
payload,
|
||||
client_socket,
|
||||
ingestor_socket,
|
||||
flink_socket,
|
||||
)?;
|
||||
}
|
||||
_ => {
|
||||
@@ -192,61 +183,27 @@ impl Relay {
|
||||
client_identity: Vec<u8>,
|
||||
payload: &[u8],
|
||||
client_socket: &zmq::Socket,
|
||||
ingestor_socket: &zmq::Socket,
|
||||
flink_socket: &zmq::Socket,
|
||||
) -> Result<()> {
|
||||
// Parse protobuf request
|
||||
// Parse just enough to build the SubmitResponse — relay stays thin
|
||||
let request = proto::SubmitHistoricalRequest::decode(payload)
|
||||
.context("Failed to parse SubmitHistoricalRequest")?;
|
||||
|
||||
let request_id = request.request_id.clone();
|
||||
let ticker = request.ticker.clone();
|
||||
let client_id = request.client_id.clone();
|
||||
|
||||
info!("Handling request submission: request_id={}, ticker={}, client_id={:?}",
|
||||
request_id, ticker, client_id);
|
||||
info!("Forwarding request to Flink: request_id={}, ticker={}", request_id, request.ticker);
|
||||
|
||||
// Extract exchange suffix from ticker (Nautilus format: "BTC/USDT.BINANCE")
|
||||
let exchange_prefix = ticker.rsplitn(2, '.').next()
|
||||
.map(|s| format!("{}.", s))
|
||||
.unwrap_or_else(|| String::from(""));
|
||||
|
||||
if exchange_prefix.is_empty() {
|
||||
warn!("Ticker '{}' missing exchange suffix", ticker);
|
||||
}
|
||||
|
||||
// Build DataRequest protobuf for ingestors
|
||||
let data_request = proto::DataRequest {
|
||||
request_id: request_id.clone(),
|
||||
r#type: proto::data_request::RequestType::HistoricalOhlc as i32,
|
||||
ticker: ticker.clone(),
|
||||
historical: Some(proto::HistoricalParams {
|
||||
start_time: request.start_time,
|
||||
end_time: request.end_time,
|
||||
period_seconds: request.period_seconds,
|
||||
limit: request.limit,
|
||||
}),
|
||||
realtime: None,
|
||||
client_id: client_id.clone(),
|
||||
};
|
||||
|
||||
let mut data_request_bytes = Vec::new();
|
||||
data_request.encode(&mut data_request_bytes)?;
|
||||
|
||||
// Publish to ingestors with exchange prefix
|
||||
// Forward the raw request to Flink via PUSH
|
||||
// Flink builds DataRequest and dispatches to ingestors via IngestorBroker
|
||||
let version_frame = vec![PROTOCOL_VERSION];
|
||||
let mut message_frame = vec![MSG_TYPE_DATA_REQUEST];
|
||||
message_frame.extend_from_slice(&data_request_bytes);
|
||||
let mut message_frame = vec![MSG_TYPE_SUBMIT_REQUEST];
|
||||
message_frame.extend_from_slice(payload);
|
||||
|
||||
ingestor_socket.send(&exchange_prefix, zmq::SNDMORE)?;
|
||||
ingestor_socket.send(&version_frame, zmq::SNDMORE)?;
|
||||
ingestor_socket.send(&message_frame, 0)?;
|
||||
flink_socket.send(&version_frame, zmq::SNDMORE)?;
|
||||
flink_socket.send(&message_frame, 0)?;
|
||||
|
||||
info!("Published to ingestors: prefix={}, request_id={}", exchange_prefix, request_id);
|
||||
|
||||
// Build SubmitResponse protobuf
|
||||
// NOTE: This topic is DETERMINISTIC based on client-generated values.
|
||||
// Client should have already subscribed to this topic BEFORE sending the request
|
||||
// to prevent race condition where notification arrives before client subscribes.
|
||||
// Build SubmitResponse — relay still acks the client immediately
|
||||
let notification_topic = if let Some(cid) = &client_id {
|
||||
format!("RESPONSE:{}", cid)
|
||||
} else {
|
||||
@@ -263,20 +220,16 @@ impl Relay {
|
||||
let mut response_bytes = Vec::new();
|
||||
response.encode(&mut response_bytes)?;
|
||||
|
||||
// Send immediate response to client
|
||||
let version_frame = vec![PROTOCOL_VERSION];
|
||||
let mut message_frame = vec![MSG_TYPE_SUBMIT_RESPONSE];
|
||||
message_frame.extend_from_slice(&response_bytes);
|
||||
let mut resp_message_frame = vec![MSG_TYPE_SUBMIT_RESPONSE];
|
||||
resp_message_frame.extend_from_slice(&response_bytes);
|
||||
|
||||
client_socket.send(&client_identity, zmq::SNDMORE)?;
|
||||
client_socket.send(&[] as &[u8], zmq::SNDMORE)?;
|
||||
client_socket.send(&version_frame, zmq::SNDMORE)?;
|
||||
client_socket.send(&message_frame, 0)?;
|
||||
client_socket.send(&resp_message_frame, 0)?;
|
||||
|
||||
info!("Sent SubmitResponse to client: request_id={}, topic={}", request_id, notification_topic);
|
||||
|
||||
// Relay is now DONE with this request - completely stateless!
|
||||
// Client will receive notification via pub/sub when Flink publishes HistoryReadyNotification
|
||||
info!("Acked client and forwarded to Flink: request_id={}, notification_topic={}", request_id, notification_topic);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -285,7 +238,7 @@ impl Relay {
|
||||
frontend: &zmq::Socket,
|
||||
backend: &zmq::Socket,
|
||||
) -> Result<()> {
|
||||
// Forward subscription message from XPUB to XSUB
|
||||
// Forward subscription event from XPUB to XSUB so Flink can detect realtime interest
|
||||
let msg = frontend.recv_bytes(0)?;
|
||||
backend.send(&msg, 0)?;
|
||||
|
||||
@@ -302,10 +255,7 @@ impl Relay {
|
||||
backend: &zmq::Socket,
|
||||
frontend: &zmq::Socket,
|
||||
) -> Result<()> {
|
||||
// Forward all messages from XSUB to XPUB (zero-copy proxy)
|
||||
// This includes:
|
||||
// - Regular market data (ticks, OHLC)
|
||||
// - HistoryReadyNotification from Flink
|
||||
// Zero-copy proxy: XSUB (Flink) → XPUB (clients)
|
||||
loop {
|
||||
let msg = backend.recv_bytes(0)?;
|
||||
let more = backend.get_rcvmore()?;
|
||||
|
||||
@@ -11,7 +11,6 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Copy dependency specifications
|
||||
COPY setup.py .
|
||||
COPY environment.yml .
|
||||
COPY dexorder/ dexorder/
|
||||
|
||||
@@ -27,9 +26,6 @@ RUN mkdir -p dexorder/generated && \
|
||||
RUN conda env create -f environment.yml -p /build/env && \
|
||||
conda clean -afy
|
||||
|
||||
# Install the local package into the conda environment
|
||||
RUN /build/env/bin/pip install --no-cache-dir .
|
||||
|
||||
# =============================================================================
|
||||
# Runtime stage
|
||||
# =============================================================================
|
||||
@@ -75,7 +71,8 @@ RUN chmod 755 /app/entrypoint.sh && chown root:root /app/entrypoint.sh
|
||||
USER dexorder
|
||||
|
||||
# Environment variables (can be overridden in k8s)
|
||||
ENV PYTHONUNBUFFERED=1 \
|
||||
ENV PYTHONPATH=/app \
|
||||
PYTHONUNBUFFERED=1 \
|
||||
MPLCONFIGDIR=/tmp \
|
||||
NUMBA_CACHE_DIR=/tmp/numba_cache \
|
||||
LOG_LEVEL=INFO \
|
||||
|
||||
@@ -12,6 +12,7 @@ For research scripts, import and use get_api() to access the API:
|
||||
"""
|
||||
|
||||
import logging
|
||||
import threading
|
||||
from typing import Optional
|
||||
|
||||
from dexorder.api.api import API
|
||||
@@ -23,10 +24,13 @@ log = logging.getLogger(__name__)
|
||||
# Global API instance - managed by main.py
|
||||
_global_api: Optional[API] = None
|
||||
|
||||
# Thread-local API — used by harness threads so they don't overwrite the global
|
||||
_thread_local = threading.local()
|
||||
|
||||
|
||||
def get_api() -> API:
|
||||
"""
|
||||
Get the global API instance for accessing market data and charts.
|
||||
Get the API instance for accessing market data and charts.
|
||||
|
||||
Use this in research scripts to access the data and charting APIs.
|
||||
|
||||
@@ -53,15 +57,27 @@ def get_api() -> API:
|
||||
# Create chart
|
||||
fig, ax = api.charting.plot_ohlc(df, title="BTC/USDT")
|
||||
"""
|
||||
# Thread-local takes priority (set by harness threads)
|
||||
api = getattr(_thread_local, 'api', None)
|
||||
if api is not None:
|
||||
return api
|
||||
if _global_api is None:
|
||||
raise RuntimeError("API not initialized")
|
||||
return _global_api
|
||||
|
||||
|
||||
def set_api(api: API) -> None:
|
||||
"""Set the global API instance. Internal use only."""
|
||||
global _global_api
|
||||
_global_api = api
|
||||
"""Set the API instance.
|
||||
|
||||
When called from the main thread, sets the global API used by all threads.
|
||||
When called from a non-main thread (e.g. harness threads), sets a thread-local
|
||||
API so the global is not overwritten.
|
||||
"""
|
||||
if threading.current_thread() is threading.main_thread():
|
||||
global _global_api
|
||||
_global_api = api
|
||||
else:
|
||||
_thread_local.api = api
|
||||
|
||||
|
||||
__all__ = ['API', 'ChartingAPI', 'DataAPI', 'get_api', 'set_api']
|
||||
|
||||
@@ -3,6 +3,12 @@ Conda Package Manager
|
||||
|
||||
Manages dynamic installation and cleanup of conda packages for user components.
|
||||
Scans metadata files to determine required packages and syncs the conda environment.
|
||||
|
||||
Extra packages (user-installed beyond the base container) are tracked in
|
||||
``extra_packages.json`` under ``data_dir`` so they can be removed when no
|
||||
script references them. Packages that are later promoted into the base image
|
||||
(i.e. appear in ``environment.yml``) are silently evicted from tracking
|
||||
rather than uninstalled.
|
||||
"""
|
||||
|
||||
import json
|
||||
@@ -12,6 +18,10 @@ import sys
|
||||
from pathlib import Path
|
||||
from typing import Optional, Set
|
||||
|
||||
# Filename (stored under data_dir, outside the git repo) for tracking
|
||||
# user-installed extra packages.
|
||||
EXTRA_PACKAGES_FILENAME = "extra_packages.json"
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@@ -102,12 +112,35 @@ def get_installed_packages() -> Set[str]:
|
||||
return set()
|
||||
|
||||
|
||||
def install_packages(packages: list[str]) -> dict:
|
||||
def load_extra_packages(data_dir: Path) -> Set[str]:
|
||||
"""Load the set of user-installed extra packages (beyond the base container)."""
|
||||
path = data_dir / EXTRA_PACKAGES_FILENAME
|
||||
if path.exists():
|
||||
try:
|
||||
return set(json.loads(path.read_text()))
|
||||
except Exception as e:
|
||||
log.error(f"Failed to load extra packages: {e}")
|
||||
return set()
|
||||
|
||||
|
||||
def save_extra_packages(data_dir: Path, packages: Set[str]) -> None:
|
||||
"""Persist the set of user-installed extra packages."""
|
||||
path = data_dir / EXTRA_PACKAGES_FILENAME
|
||||
try:
|
||||
path.write_text(json.dumps(sorted(packages)))
|
||||
except Exception as e:
|
||||
log.error(f"Failed to save extra packages: {e}")
|
||||
|
||||
|
||||
def install_packages(packages: list[str], data_dir: Optional[Path] = None) -> dict:
|
||||
"""
|
||||
Install conda packages if not already installed.
|
||||
|
||||
Args:
|
||||
packages: List of package names to install
|
||||
data_dir: If provided, newly installed packages are added to the extra
|
||||
package tracking file (``extra_packages.json``) so they can
|
||||
be cleaned up when no longer needed.
|
||||
|
||||
Returns:
|
||||
dict with:
|
||||
@@ -154,6 +187,10 @@ def install_packages(packages: list[str]) -> dict:
|
||||
|
||||
if result.returncode == 0:
|
||||
log.info(f"Successfully installed packages: {to_install}")
|
||||
if data_dir:
|
||||
extras = load_extra_packages(data_dir)
|
||||
extras.update(to_install)
|
||||
save_extra_packages(data_dir, extras)
|
||||
return {
|
||||
"success": True,
|
||||
"installed": to_install,
|
||||
@@ -324,9 +361,59 @@ def get_base_packages(environment_yml: Path) -> Set[str]:
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Sync Operation
|
||||
# Cleanup and Sync Operations
|
||||
# =============================================================================
|
||||
|
||||
def cleanup_extra_packages(data_dir: Path, environment_yml: Optional[Path] = None) -> dict:
|
||||
"""
|
||||
Remove tracked extra packages that are no longer needed by any script.
|
||||
|
||||
Only packages previously recorded in ``extra_packages.json`` are ever
|
||||
considered for removal — base container packages are never touched.
|
||||
|
||||
Packages that have since been promoted into the base container image
|
||||
(i.e. now appear in ``environment.yml``) are quietly evicted from the
|
||||
tracking file without being uninstalled.
|
||||
|
||||
Args:
|
||||
data_dir: Base data directory (tracking file lives here)
|
||||
environment_yml: Path to environment.yml for base package reconciliation
|
||||
|
||||
Returns:
|
||||
dict with:
|
||||
- success: bool
|
||||
- to_remove: list[str] - packages identified for removal
|
||||
- removed: list[str] - packages actually removed
|
||||
- error: str (if any)
|
||||
"""
|
||||
src_dir = data_dir / "src"
|
||||
required = scan_metadata_packages(src_dir)
|
||||
base = get_base_packages(environment_yml) if environment_yml and environment_yml.exists() else set()
|
||||
extras = load_extra_packages(data_dir)
|
||||
|
||||
# Packages promoted into the base image are no longer "extra" — evict from tracking
|
||||
now_base = extras & base
|
||||
if now_base:
|
||||
log.info(f"Packages promoted to base image, evicting from extra tracking: {now_base}")
|
||||
extras -= now_base
|
||||
|
||||
# Only remove packages that are tracked as extras and no longer referenced by any script
|
||||
to_remove = sorted(extras - required)
|
||||
result: dict = {"success": True, "to_remove": to_remove, "removed": []}
|
||||
|
||||
if to_remove:
|
||||
remove_result = remove_packages(to_remove)
|
||||
result["success"] = remove_result["success"]
|
||||
result["removed"] = remove_result.get("removed", [])
|
||||
if remove_result["success"]:
|
||||
extras -= set(to_remove)
|
||||
else:
|
||||
result["error"] = remove_result.get("error")
|
||||
|
||||
save_extra_packages(data_dir, extras)
|
||||
return result
|
||||
|
||||
|
||||
def sync_packages(data_dir: Path, environment_yml: Optional[Path] = None) -> dict:
|
||||
"""
|
||||
Sync conda packages with metadata requirements.
|
||||
@@ -350,8 +437,8 @@ def sync_packages(data_dir: Path, environment_yml: Optional[Path] = None) -> dic
|
||||
"""
|
||||
log.info("Starting conda package sync")
|
||||
|
||||
# Get required packages from metadata
|
||||
required_packages = scan_metadata_packages(data_dir)
|
||||
# Metadata lives under data_dir/src/category/item/metadata.json
|
||||
required_packages = scan_metadata_packages(data_dir / "src")
|
||||
log.info(f"Required packages from metadata: {required_packages}")
|
||||
|
||||
# Get base packages from environment.yml
|
||||
|
||||
@@ -42,6 +42,7 @@ class IcebergClient:
|
||||
s3_endpoint: Optional[str] = None,
|
||||
s3_access_key: Optional[str] = None,
|
||||
s3_secret_key: Optional[str] = None,
|
||||
s3_region: Optional[str] = None,
|
||||
):
|
||||
"""
|
||||
Initialize Iceberg client.
|
||||
@@ -52,6 +53,7 @@ class IcebergClient:
|
||||
s3_endpoint: S3/MinIO endpoint URL (e.g., "http://localhost:9000")
|
||||
s3_access_key: S3/MinIO access key
|
||||
s3_secret_key: S3/MinIO secret key
|
||||
s3_region: S3/MinIO region (e.g., "us-east-1")
|
||||
"""
|
||||
self.catalog_uri = catalog_uri
|
||||
self.namespace = namespace
|
||||
@@ -64,6 +66,8 @@ class IcebergClient:
|
||||
catalog_props["s3.access-key-id"] = s3_access_key
|
||||
if s3_secret_key:
|
||||
catalog_props["s3.secret-access-key"] = s3_secret_key
|
||||
if s3_region:
|
||||
catalog_props["s3.region"] = s3_region
|
||||
|
||||
self.catalog = load_catalog("trading", **catalog_props)
|
||||
self.table = self.catalog.load_table(f"{namespace}.ohlc")
|
||||
|
||||
@@ -15,6 +15,13 @@ log = logging.getLogger(__name__)
|
||||
# Standard OHLC columns always returned
|
||||
STANDARD_COLUMNS = ["timestamp", "open", "high", "low", "close"]
|
||||
|
||||
# All optional columns from the OHLC proto spec, returned by default when extra_columns=None
|
||||
OHLC_OPTIONAL_COLUMNS = [
|
||||
"volume", "buy_vol", "sell_vol",
|
||||
"open_time", "high_time", "low_time", "close_time",
|
||||
"open_interest",
|
||||
]
|
||||
|
||||
# All valid extra columns available in the Iceberg schema
|
||||
VALID_EXTRA_COLUMNS = {
|
||||
"volume", "buy_vol", "sell_vol",
|
||||
@@ -43,6 +50,7 @@ class DataAPIImpl(DataAPI):
|
||||
s3_endpoint: Optional[str] = None,
|
||||
s3_access_key: Optional[str] = None,
|
||||
s3_secret_key: Optional[str] = None,
|
||||
s3_region: Optional[str] = None,
|
||||
request_timeout: float = 30.0,
|
||||
):
|
||||
"""
|
||||
@@ -56,6 +64,7 @@ class DataAPIImpl(DataAPI):
|
||||
s3_endpoint: S3/MinIO endpoint URL (e.g., "http://minio:9000")
|
||||
s3_access_key: S3/MinIO access key
|
||||
s3_secret_key: S3/MinIO secret key
|
||||
s3_region: S3/MinIO region (e.g., "us-east-1")
|
||||
request_timeout: Default timeout for historical data requests in seconds (default: 30)
|
||||
"""
|
||||
self.ohlc_client = OHLCClient(
|
||||
@@ -66,6 +75,7 @@ class DataAPIImpl(DataAPI):
|
||||
s3_endpoint=s3_endpoint,
|
||||
s3_access_key=s3_access_key,
|
||||
s3_secret_key=s3_secret_key,
|
||||
s3_region=s3_region,
|
||||
)
|
||||
self.request_timeout = request_timeout
|
||||
self._started = False
|
||||
@@ -120,7 +130,9 @@ class DataAPIImpl(DataAPI):
|
||||
|
||||
# Determine which columns to fetch
|
||||
columns_to_fetch = STANDARD_COLUMNS.copy()
|
||||
if extra_columns:
|
||||
if extra_columns is None:
|
||||
columns_to_fetch.extend(OHLC_OPTIONAL_COLUMNS)
|
||||
elif extra_columns:
|
||||
columns_to_fetch.extend(extra_columns)
|
||||
|
||||
# Use OHLCClient which handles smart caching:
|
||||
|
||||
@@ -93,6 +93,82 @@ def _load_strategy_class(impl_path: Path) -> type:
|
||||
# Metrics extraction
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _money_to_float(val) -> float | None:
|
||||
"""Convert a Nautilus Money object or string like '15.32 USDT' to float."""
|
||||
if val is None:
|
||||
return None
|
||||
try:
|
||||
if hasattr(val, "as_decimal"):
|
||||
return float(val.as_decimal())
|
||||
s = str(val).strip()
|
||||
if s and s.lower() not in ("none", "nan"):
|
||||
return float(s.split()[0])
|
||||
except (ValueError, TypeError, IndexError):
|
||||
pass
|
||||
return None
|
||||
|
||||
|
||||
def _ts_to_s(raw) -> int | None:
|
||||
"""Convert a Nautilus nanosecond timestamp to Unix seconds."""
|
||||
try:
|
||||
return int(raw) // 1_000_000_000
|
||||
except (TypeError, ValueError):
|
||||
return None
|
||||
|
||||
|
||||
def _extract_fills(engine) -> pd.DataFrame:
|
||||
"""Return a sorted fills DataFrame from BacktestEngine, or empty DataFrame."""
|
||||
try:
|
||||
df = engine.trader.generate_order_fills_report()
|
||||
if df is not None and len(df) > 0:
|
||||
if "ts_event" in df.columns:
|
||||
df = df.sort_values("ts_event")
|
||||
return df
|
||||
except Exception as exc:
|
||||
log.debug("generate_order_fills_report() failed: %s", exc)
|
||||
return pd.DataFrame()
|
||||
|
||||
|
||||
def _extract_trades(fills_df: pd.DataFrame, initial_capital: float) -> list[dict]:
|
||||
"""
|
||||
Pair fills into round-trip trades: buy → sell or sell → buy.
|
||||
Returns a list of trade dicts (capped at 500 for large backtests).
|
||||
"""
|
||||
if fills_df.empty:
|
||||
return []
|
||||
|
||||
trades: list[dict] = []
|
||||
open_positions: dict[str, dict] = {} # instrument_id -> pending entry
|
||||
|
||||
for _, fill in fills_df.iterrows():
|
||||
instrument = str(fill.get("instrument_id", ""))
|
||||
side = str(fill.get("order_side", "")).upper()
|
||||
qty = _money_to_float(fill.get("last_qty")) or 0.0
|
||||
price = _money_to_float(fill.get("last_px")) or 0.0
|
||||
ts_s = _ts_to_s(fill.get("ts_event"))
|
||||
rpnl = _money_to_float(fill.get("realized_pnl"))
|
||||
|
||||
if rpnl is not None and rpnl != 0.0:
|
||||
# This fill closes a position — record as a completed trade
|
||||
entry = open_positions.pop(instrument, None)
|
||||
trade = {
|
||||
"instrument": instrument,
|
||||
"side": side,
|
||||
"quantity": round(qty, 8),
|
||||
"entry_price": round(entry["price"], 8) if entry else None,
|
||||
"exit_price": round(price, 8),
|
||||
"entry_time": entry["ts_s"] if entry else None,
|
||||
"exit_time": ts_s,
|
||||
"pnl": round(rpnl, 6),
|
||||
}
|
||||
trades.append(trade)
|
||||
else:
|
||||
# Opening fill — store for pairing
|
||||
open_positions[instrument] = {"price": price, "ts_s": ts_s, "side": side}
|
||||
|
||||
return trades[:500] # cap for large backtests
|
||||
|
||||
|
||||
def _compute_metrics(
|
||||
engine,
|
||||
venue_strs: list[str],
|
||||
@@ -100,17 +176,18 @@ def _compute_metrics(
|
||||
all_bars: list,
|
||||
) -> dict[str, Any]:
|
||||
"""
|
||||
Extract performance metrics from a completed BacktestEngine.
|
||||
Extract structured performance metrics from a completed BacktestEngine.
|
||||
|
||||
Returns dict with:
|
||||
total_return float — fractional (0.15 = +15%)
|
||||
sharpe_ratio float — annualized; 0.0 if no trades or constant equity
|
||||
max_drawdown float — max peak-to-trough as fraction (0.10 = 10% drawdown)
|
||||
win_rate float — fraction of trades with positive realized PnL
|
||||
trade_count int
|
||||
equity_curve list[{timestamp: int_unix_s, equity: float}]
|
||||
summary dict — core metrics (total_return, sharpe, drawdown, etc.)
|
||||
statistics dict — extended stats (sortino, calmar, profit_factor, etc.)
|
||||
trades list — individual round-trip trades (capped at 500)
|
||||
equity_curve list[{timestamp: int_unix_s, equity: float}]
|
||||
"""
|
||||
# Reconstruct equity curve from fills
|
||||
fills_df = _extract_fills(engine)
|
||||
trades = _extract_trades(fills_df, initial_capital)
|
||||
|
||||
# --- Equity curve reconstruction ---
|
||||
equity_points: list[dict] = []
|
||||
if all_bars:
|
||||
equity_points.append({
|
||||
@@ -121,51 +198,24 @@ def _compute_metrics(
|
||||
running_equity = initial_capital
|
||||
trade_count = 0
|
||||
winning_trades = 0
|
||||
total_profit = 0.0
|
||||
total_loss = 0.0
|
||||
|
||||
try:
|
||||
fills_df = engine.trader.generate_order_fills_report()
|
||||
except Exception as exc:
|
||||
log.debug("generate_order_fills_report() failed: %s", exc)
|
||||
fills_df = None
|
||||
|
||||
if fills_df is not None and len(fills_df) > 0:
|
||||
# Sort by event time
|
||||
if "ts_event" in fills_df.columns:
|
||||
fills_df = fills_df.sort_values("ts_event")
|
||||
|
||||
if not fills_df.empty:
|
||||
for _, fill in fills_df.iterrows():
|
||||
rpnl = fill.get("realized_pnl") if hasattr(fill, "get") else None
|
||||
if rpnl is None:
|
||||
rpnl = _money_to_float(fill.get("realized_pnl"))
|
||||
if rpnl is None or rpnl == 0.0:
|
||||
continue
|
||||
|
||||
# Nautilus Money objects: str form is "15.32 USDT"
|
||||
rpnl_float: float | None = None
|
||||
try:
|
||||
if hasattr(rpnl, "as_decimal"):
|
||||
rpnl_float = float(rpnl.as_decimal())
|
||||
elif rpnl is not None:
|
||||
rpnl_str = str(rpnl).strip()
|
||||
if rpnl_str and rpnl_str.lower() not in ("none", "nan"):
|
||||
rpnl_float = float(rpnl_str.split()[0])
|
||||
except (ValueError, TypeError, IndexError):
|
||||
pass
|
||||
|
||||
if rpnl_float is not None and rpnl_float != 0.0:
|
||||
ts_s: int | None = None
|
||||
raw_ts = fill.get("ts_event") if hasattr(fill, "get") else None
|
||||
if raw_ts is not None:
|
||||
try:
|
||||
ts_s = int(raw_ts) // 1_000_000_000
|
||||
except (TypeError, ValueError):
|
||||
pass
|
||||
|
||||
running_equity += rpnl_float
|
||||
trade_count += 1
|
||||
if rpnl_float > 0:
|
||||
winning_trades += 1
|
||||
|
||||
if ts_s is not None:
|
||||
equity_points.append({"timestamp": ts_s, "equity": running_equity})
|
||||
ts_s = _ts_to_s(fill.get("ts_event"))
|
||||
running_equity += rpnl
|
||||
trade_count += 1
|
||||
if rpnl > 0:
|
||||
winning_trades += 1
|
||||
total_profit += rpnl
|
||||
else:
|
||||
total_loss += abs(rpnl)
|
||||
if ts_s is not None:
|
||||
equity_points.append({"timestamp": ts_s, "equity": running_equity})
|
||||
|
||||
if all_bars:
|
||||
equity_points.append({
|
||||
@@ -173,19 +223,16 @@ def _compute_metrics(
|
||||
"equity": running_equity,
|
||||
})
|
||||
|
||||
# Try to get actual final balance from the account (more accurate than fill reconstruction)
|
||||
# Prefer definitive final balance from account cache
|
||||
try:
|
||||
from nautilus_trader.model.identifiers import Venue
|
||||
for venue_str in venue_strs:
|
||||
account = engine.cache.account_for_venue(Venue(venue_str))
|
||||
if account is None:
|
||||
continue
|
||||
# Sum all balances (quote currency is what we started with)
|
||||
for bal in account.balances().values():
|
||||
total = getattr(bal, "total", None)
|
||||
if total is not None:
|
||||
final_val = float(str(total).split()[0]) if not hasattr(total, "as_decimal") else float(total.as_decimal())
|
||||
# Use the account balance as the definitive final equity
|
||||
final_val = _money_to_float(getattr(bal, "total", None))
|
||||
if final_val is not None:
|
||||
running_equity = final_val
|
||||
if equity_points:
|
||||
equity_points[-1]["equity"] = running_equity
|
||||
@@ -193,36 +240,71 @@ def _compute_metrics(
|
||||
except Exception as exc:
|
||||
log.debug("Account balance extraction failed: %s", exc)
|
||||
|
||||
# Core metrics
|
||||
# --- Core metrics ---
|
||||
total_return = (running_equity - initial_capital) / initial_capital if initial_capital else 0.0
|
||||
win_rate = winning_trades / trade_count if trade_count > 0 else 0.0
|
||||
profit_factor = (total_profit / total_loss) if total_loss > 0 else (float("inf") if total_profit > 0 else 0.0)
|
||||
|
||||
# Determine bar duration for annualisation
|
||||
bar_duration_ns = 0.0
|
||||
if all_bars and len(all_bars) > 1:
|
||||
bar_duration_ns = (all_bars[-1].ts_event - all_bars[0].ts_event) / max(len(all_bars) - 1, 1)
|
||||
bars_per_year = (365 * 24 * 3600 * 1e9) / bar_duration_ns if bar_duration_ns > 0 else 0.0
|
||||
|
||||
equity_series = pd.Series([p["equity"] for p in equity_points]) if len(equity_points) > 2 else pd.Series([initial_capital, running_equity])
|
||||
returns = equity_series.pct_change().dropna()
|
||||
|
||||
# Sharpe ratio (annualized) from equity curve returns
|
||||
sharpe = 0.0
|
||||
if len(equity_points) > 2 and all_bars and len(all_bars) > 1:
|
||||
equity_series = pd.Series([p["equity"] for p in equity_points])
|
||||
returns = equity_series.pct_change().dropna()
|
||||
if len(returns) > 1 and returns.std() > 0:
|
||||
bar_duration_ns = (all_bars[-1].ts_event - all_bars[0].ts_event) / max(len(all_bars) - 1, 1)
|
||||
if bar_duration_ns > 0:
|
||||
bars_per_year = (365 * 24 * 3600 * 1e9) / bar_duration_ns
|
||||
sharpe = float((returns.mean() / returns.std()) * (bars_per_year ** 0.5))
|
||||
sortino = 0.0
|
||||
if len(returns) > 1 and bars_per_year > 0:
|
||||
mean_r = returns.mean()
|
||||
std_r = returns.std()
|
||||
if std_r > 0:
|
||||
sharpe = float((mean_r / std_r) * (bars_per_year ** 0.5))
|
||||
downside = returns[returns < 0]
|
||||
downside_std = downside.std() if len(downside) > 1 else 0.0
|
||||
if downside_std > 0:
|
||||
sortino = float((mean_r / downside_std) * (bars_per_year ** 0.5))
|
||||
|
||||
# Max drawdown
|
||||
max_drawdown = 0.0
|
||||
if len(equity_points) > 1:
|
||||
equity_arr = pd.Series([p["equity"] for p in equity_points])
|
||||
rolling_max = equity_arr.cummax()
|
||||
drawdowns = (equity_arr - rolling_max) / rolling_max.replace(0, float("nan"))
|
||||
if len(equity_series) > 1:
|
||||
rolling_max = equity_series.cummax()
|
||||
drawdowns = (equity_series - rolling_max) / rolling_max.replace(0, float("nan"))
|
||||
max_drawdown = float(abs(drawdowns.min())) if len(drawdowns) > 0 else 0.0
|
||||
|
||||
# Calmar ratio
|
||||
annualized_return = 0.0
|
||||
if bars_per_year > 0 and len(all_bars) > 1:
|
||||
years = (all_bars[-1].ts_event - all_bars[0].ts_event) / (365 * 24 * 3600 * 1e9)
|
||||
if years > 0:
|
||||
annualized_return = (running_equity / initial_capital) ** (1.0 / years) - 1 if initial_capital else 0.0
|
||||
calmar = annualized_return / max_drawdown if max_drawdown > 0 else 0.0
|
||||
|
||||
# Average win / average loss
|
||||
avg_win = total_profit / winning_trades if winning_trades > 0 else 0.0
|
||||
avg_loss = total_loss / (trade_count - winning_trades) if (trade_count - winning_trades) > 0 else 0.0
|
||||
|
||||
return {
|
||||
"total_return": round(total_return, 6),
|
||||
"sharpe_ratio": round(sharpe, 4),
|
||||
"max_drawdown": round(max_drawdown, 6),
|
||||
"win_rate": round(win_rate, 4),
|
||||
"trade_count": trade_count,
|
||||
"equity_curve": equity_points,
|
||||
"summary": {
|
||||
"total_return": round(total_return, 6),
|
||||
"sharpe_ratio": round(sharpe, 4),
|
||||
"max_drawdown": round(max_drawdown, 6),
|
||||
"win_rate": round(win_rate, 4),
|
||||
"trade_count": trade_count,
|
||||
"total_trades": len(trades),
|
||||
},
|
||||
"statistics": {
|
||||
"sortino_ratio": round(sortino, 4),
|
||||
"calmar_ratio": round(calmar, 4),
|
||||
"profit_factor": round(profit_factor, 4) if profit_factor != float("inf") else None,
|
||||
"avg_win": round(avg_win, 4),
|
||||
"avg_loss": round(avg_loss, 4),
|
||||
"total_profit": round(total_profit, 4),
|
||||
"total_loss": round(total_loss, 4),
|
||||
},
|
||||
"trades": trades,
|
||||
"equity_curve": equity_points,
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -13,6 +13,7 @@ make_instrument_from_metadata — instrument with best-effort precision
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from decimal import Decimal
|
||||
from typing import Optional
|
||||
|
||||
import pandas as pd
|
||||
@@ -71,8 +72,8 @@ def make_instrument(
|
||||
size_precision: int = 8,
|
||||
tick_size: Optional[float] = None,
|
||||
lot_size: Optional[float] = None,
|
||||
maker_fee: float = 0.001,
|
||||
taker_fee: float = 0.001,
|
||||
maker_fee: float = 0.0,
|
||||
taker_fee: float = 0.0,
|
||||
margin_init: float = 0.0,
|
||||
margin_maint: float = 0.0,
|
||||
) -> CurrencyPair:
|
||||
@@ -118,8 +119,8 @@ def make_instrument(
|
||||
min_price=None,
|
||||
margin_init=margin_init,
|
||||
margin_maint=margin_maint,
|
||||
maker_fee=maker_fee,
|
||||
taker_fee=taker_fee,
|
||||
maker_fee=Decimal(str(maker_fee)),
|
||||
taker_fee=Decimal(str(taker_fee)),
|
||||
ts_event=ts_now,
|
||||
ts_init=ts_now,
|
||||
)
|
||||
@@ -154,8 +155,8 @@ def make_instrument_from_metadata(ticker: str) -> tuple[CurrencyPair, int, int]:
|
||||
size_precision=sp,
|
||||
tick_size=meta.tick_size,
|
||||
lot_size=meta.lot_size,
|
||||
maker_fee=meta.maker_fee or 0.001,
|
||||
taker_fee=meta.taker_fee or 0.001,
|
||||
maker_fee=meta.maker_fee or 0.0,
|
||||
taker_fee=meta.taker_fee or 0.0,
|
||||
margin_init=meta.margin_init or 0.0,
|
||||
margin_maint=meta.margin_maint or 0.0,
|
||||
)
|
||||
|
||||
@@ -39,6 +39,7 @@ class OHLCClient:
|
||||
s3_endpoint: str = None,
|
||||
s3_access_key: str = None,
|
||||
s3_secret_key: str = None,
|
||||
s3_region: str = None,
|
||||
):
|
||||
"""
|
||||
Initialize OHLC client.
|
||||
@@ -51,12 +52,14 @@ class OHLCClient:
|
||||
s3_endpoint: S3/MinIO endpoint URL (e.g., "http://localhost:9000")
|
||||
s3_access_key: S3/MinIO access key
|
||||
s3_secret_key: S3/MinIO secret key
|
||||
s3_region: S3/MinIO region (e.g., "us-east-1")
|
||||
"""
|
||||
self.iceberg = IcebergClient(
|
||||
iceberg_catalog_uri, namespace,
|
||||
s3_endpoint=s3_endpoint,
|
||||
s3_access_key=s3_access_key,
|
||||
s3_secret_key=s3_secret_key,
|
||||
s3_region=s3_region,
|
||||
)
|
||||
self.history = HistoryClient(relay_endpoint, notification_endpoint)
|
||||
log.info("OHLCClient initialized")
|
||||
@@ -122,7 +125,7 @@ class OHLCClient:
|
||||
|
||||
if not missing_ranges:
|
||||
# All data exists in Iceberg
|
||||
return self._forward_fill_gaps(df, period_seconds)
|
||||
return df
|
||||
|
||||
# Step 3: Request missing data for each range
|
||||
# For simplicity, request entire range (relay can merge adjacent requests)
|
||||
|
||||
1
sandbox/dexorder/strategy/__init__.py
Normal file
1
sandbox/dexorder/strategy/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
# Strategy runtime package
|
||||
361
sandbox/dexorder/strategy/db.py
Normal file
361
sandbox/dexorder/strategy/db.py
Normal file
@@ -0,0 +1,361 @@
|
||||
"""
|
||||
SQLite database for strategy execution state, trade logs, and backtest history.
|
||||
|
||||
All data is stored under DATA_DIR/dexorder.db.
|
||||
Uses aiosqlite for async compatibility with the MCP server's event loop.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Any, Optional
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
_SCHEMA = """
|
||||
CREATE TABLE IF NOT EXISTS strategies (
|
||||
name TEXT PRIMARY KEY,
|
||||
status TEXT NOT NULL DEFAULT 'stopped',
|
||||
git_rev TEXT,
|
||||
worktree_path TEXT,
|
||||
started_at REAL,
|
||||
stopped_at REAL,
|
||||
allocation REAL NOT NULL DEFAULT 0,
|
||||
paper INTEGER NOT NULL DEFAULT 1,
|
||||
feeds_json TEXT,
|
||||
config_json TEXT
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS strategy_state (
|
||||
name TEXT PRIMARY KEY,
|
||||
realized_pnl REAL NOT NULL DEFAULT 0,
|
||||
unrealized_pnl REAL NOT NULL DEFAULT 0,
|
||||
trade_count INTEGER NOT NULL DEFAULT 0,
|
||||
positions_json TEXT,
|
||||
updated_at REAL NOT NULL DEFAULT 0
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS trades (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
strategy_name TEXT NOT NULL,
|
||||
instrument TEXT NOT NULL,
|
||||
side TEXT NOT NULL,
|
||||
quantity REAL NOT NULL,
|
||||
entry_price REAL,
|
||||
exit_price REAL NOT NULL,
|
||||
entry_time REAL,
|
||||
exit_time REAL NOT NULL,
|
||||
pnl REAL NOT NULL,
|
||||
recorded_at REAL NOT NULL DEFAULT (unixepoch())
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS backtest_runs (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
strategy_name TEXT NOT NULL,
|
||||
ran_at REAL NOT NULL DEFAULT (unixepoch()),
|
||||
from_time REAL,
|
||||
to_time REAL,
|
||||
initial_capital REAL,
|
||||
feeds_json TEXT,
|
||||
summary_json TEXT,
|
||||
statistics_json TEXT,
|
||||
trades_json TEXT,
|
||||
equity_curve_json TEXT
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS strategy_events (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
strategy_name TEXT NOT NULL,
|
||||
event_type TEXT NOT NULL,
|
||||
payload_json TEXT,
|
||||
recorded_at REAL NOT NULL DEFAULT (unixepoch())
|
||||
);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_trades_strategy ON trades(strategy_name);
|
||||
CREATE INDEX IF NOT EXISTS idx_backtest_strategy ON backtest_runs(strategy_name);
|
||||
CREATE INDEX IF NOT EXISTS idx_events_strategy ON strategy_events(strategy_name);
|
||||
"""
|
||||
|
||||
|
||||
class StrategyDB:
|
||||
"""Async SQLite interface for strategy persistence."""
|
||||
|
||||
def __init__(self, db_path: Path):
|
||||
self.db_path = db_path
|
||||
|
||||
async def initialize(self) -> None:
|
||||
"""Create tables if they don't exist."""
|
||||
import aiosqlite
|
||||
self.db_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
async with aiosqlite.connect(self.db_path) as db:
|
||||
await db.executescript(_SCHEMA)
|
||||
await db.commit()
|
||||
log.info("StrategyDB initialized at %s", self.db_path)
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Strategy lifecycle
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
async def upsert_strategy(
|
||||
self,
|
||||
name: str,
|
||||
status: str,
|
||||
allocation: float,
|
||||
paper: bool,
|
||||
feeds: list[dict],
|
||||
git_rev: Optional[str] = None,
|
||||
worktree_path: Optional[str] = None,
|
||||
config: Optional[dict] = None,
|
||||
) -> None:
|
||||
import aiosqlite
|
||||
now = time.time()
|
||||
async with aiosqlite.connect(self.db_path) as db:
|
||||
await db.execute("""
|
||||
INSERT INTO strategies
|
||||
(name, status, git_rev, worktree_path, started_at, allocation, paper, feeds_json, config_json)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||
ON CONFLICT(name) DO UPDATE SET
|
||||
status=excluded.status,
|
||||
git_rev=excluded.git_rev,
|
||||
worktree_path=excluded.worktree_path,
|
||||
started_at=excluded.started_at,
|
||||
allocation=excluded.allocation,
|
||||
paper=excluded.paper,
|
||||
feeds_json=excluded.feeds_json,
|
||||
config_json=excluded.config_json
|
||||
""", (
|
||||
name, status, git_rev, worktree_path, now,
|
||||
allocation, int(paper),
|
||||
json.dumps(feeds),
|
||||
json.dumps(config or {}),
|
||||
))
|
||||
await db.commit()
|
||||
|
||||
async def update_strategy_status(self, name: str, status: str) -> None:
|
||||
import aiosqlite
|
||||
async with aiosqlite.connect(self.db_path) as db:
|
||||
if status == "stopped":
|
||||
await db.execute(
|
||||
"UPDATE strategies SET status=?, stopped_at=? WHERE name=?",
|
||||
(status, time.time(), name)
|
||||
)
|
||||
else:
|
||||
await db.execute("UPDATE strategies SET status=? WHERE name=?", (status, name))
|
||||
await db.commit()
|
||||
|
||||
async def get_strategy(self, name: str) -> Optional[dict]:
|
||||
import aiosqlite
|
||||
async with aiosqlite.connect(self.db_path) as db:
|
||||
db.row_factory = aiosqlite.Row
|
||||
async with db.execute("SELECT * FROM strategies WHERE name=?", (name,)) as cur:
|
||||
row = await cur.fetchone()
|
||||
return dict(row) if row else None
|
||||
|
||||
async def get_all_strategies(self) -> list[dict]:
|
||||
import aiosqlite
|
||||
async with aiosqlite.connect(self.db_path) as db:
|
||||
db.row_factory = aiosqlite.Row
|
||||
async with db.execute("SELECT * FROM strategies ORDER BY started_at DESC") as cur:
|
||||
rows = await cur.fetchall()
|
||||
return [dict(r) for r in rows]
|
||||
|
||||
async def get_running_strategies(self) -> list[dict]:
|
||||
import aiosqlite
|
||||
async with aiosqlite.connect(self.db_path) as db:
|
||||
db.row_factory = aiosqlite.Row
|
||||
async with db.execute(
|
||||
"SELECT * FROM strategies WHERE status='running' OR status='starting'",
|
||||
) as cur:
|
||||
rows = await cur.fetchall()
|
||||
return [dict(r) for r in rows]
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# PnL state
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
async def update_pnl_state(
|
||||
self,
|
||||
name: str,
|
||||
realized_pnl: float,
|
||||
unrealized_pnl: float,
|
||||
trade_count: int,
|
||||
positions: Optional[dict] = None,
|
||||
) -> None:
|
||||
import aiosqlite
|
||||
async with aiosqlite.connect(self.db_path) as db:
|
||||
await db.execute("""
|
||||
INSERT INTO strategy_state
|
||||
(name, realized_pnl, unrealized_pnl, trade_count, positions_json, updated_at)
|
||||
VALUES (?, ?, ?, ?, ?, ?)
|
||||
ON CONFLICT(name) DO UPDATE SET
|
||||
realized_pnl=excluded.realized_pnl,
|
||||
unrealized_pnl=excluded.unrealized_pnl,
|
||||
trade_count=excluded.trade_count,
|
||||
positions_json=excluded.positions_json,
|
||||
updated_at=excluded.updated_at
|
||||
""", (name, realized_pnl, unrealized_pnl, trade_count,
|
||||
json.dumps(positions or {}), time.time()))
|
||||
await db.commit()
|
||||
|
||||
async def get_pnl_state(self, name: str) -> Optional[dict]:
|
||||
import aiosqlite
|
||||
async with aiosqlite.connect(self.db_path) as db:
|
||||
db.row_factory = aiosqlite.Row
|
||||
async with db.execute("SELECT * FROM strategy_state WHERE name=?", (name,)) as cur:
|
||||
row = await cur.fetchone()
|
||||
return dict(row) if row else None
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Trades
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
async def insert_trade(self, strategy_name: str, trade: dict) -> None:
|
||||
import aiosqlite
|
||||
async with aiosqlite.connect(self.db_path) as db:
|
||||
await db.execute("""
|
||||
INSERT INTO trades
|
||||
(strategy_name, instrument, side, quantity, entry_price,
|
||||
exit_price, entry_time, exit_time, pnl)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||
""", (
|
||||
strategy_name,
|
||||
trade.get("instrument", ""),
|
||||
trade.get("side", ""),
|
||||
trade.get("quantity", 0),
|
||||
trade.get("entry_price"),
|
||||
trade.get("exit_price", 0),
|
||||
trade.get("entry_time"),
|
||||
trade.get("exit_time", time.time()),
|
||||
trade.get("pnl", 0),
|
||||
))
|
||||
await db.commit()
|
||||
|
||||
async def get_trades(self, strategy_name: str, limit: int = 200) -> list[dict]:
|
||||
import aiosqlite
|
||||
async with aiosqlite.connect(self.db_path) as db:
|
||||
db.row_factory = aiosqlite.Row
|
||||
async with db.execute(
|
||||
"SELECT * FROM trades WHERE strategy_name=? ORDER BY exit_time DESC LIMIT ?",
|
||||
(strategy_name, limit),
|
||||
) as cur:
|
||||
rows = await cur.fetchall()
|
||||
return [dict(r) for r in rows]
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Backtest runs
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
async def insert_backtest(
|
||||
self,
|
||||
strategy_name: str,
|
||||
from_time: Any,
|
||||
to_time: Any,
|
||||
initial_capital: float,
|
||||
feeds: list[dict],
|
||||
summary: dict,
|
||||
statistics: dict,
|
||||
trades: list[dict],
|
||||
equity_curve: list[dict],
|
||||
) -> int:
|
||||
import aiosqlite
|
||||
async with aiosqlite.connect(self.db_path) as db:
|
||||
cur = await db.execute("""
|
||||
INSERT INTO backtest_runs
|
||||
(strategy_name, from_time, to_time, initial_capital, feeds_json,
|
||||
summary_json, statistics_json, trades_json, equity_curve_json)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||
""", (
|
||||
strategy_name,
|
||||
float(from_time) if from_time else None,
|
||||
float(to_time) if to_time else None,
|
||||
initial_capital,
|
||||
json.dumps(feeds),
|
||||
json.dumps(summary),
|
||||
json.dumps(statistics),
|
||||
json.dumps(trades[:500]), # cap
|
||||
json.dumps(equity_curve),
|
||||
))
|
||||
await db.commit()
|
||||
return cur.lastrowid
|
||||
|
||||
async def get_backtests(self, strategy_name: str, limit: int = 10) -> list[dict]:
|
||||
import aiosqlite
|
||||
async with aiosqlite.connect(self.db_path) as db:
|
||||
db.row_factory = aiosqlite.Row
|
||||
async with db.execute(
|
||||
"SELECT * FROM backtest_runs WHERE strategy_name=? ORDER BY ran_at DESC LIMIT ?",
|
||||
(strategy_name, limit),
|
||||
) as cur:
|
||||
rows = await cur.fetchall()
|
||||
result = []
|
||||
for r in rows:
|
||||
d = dict(r)
|
||||
for key in ("feeds_json", "summary_json", "statistics_json",
|
||||
"trades_json", "equity_curve_json"):
|
||||
if d.get(key):
|
||||
plain = key.replace("_json", "")
|
||||
d[plain] = json.loads(d.pop(key))
|
||||
else:
|
||||
d.pop(key, None)
|
||||
result.append(d)
|
||||
return result
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Events
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
async def insert_event(self, strategy_name: str, event_type: str, payload: dict) -> None:
|
||||
import aiosqlite
|
||||
async with aiosqlite.connect(self.db_path) as db:
|
||||
await db.execute(
|
||||
"INSERT INTO strategy_events (strategy_name, event_type, payload_json) VALUES (?, ?, ?)",
|
||||
(strategy_name, event_type, json.dumps(payload)),
|
||||
)
|
||||
await db.commit()
|
||||
|
||||
async def get_events(
|
||||
self,
|
||||
strategy_name: str,
|
||||
event_type: Optional[str] = None,
|
||||
limit: int = 100,
|
||||
) -> list[dict]:
|
||||
import aiosqlite
|
||||
async with aiosqlite.connect(self.db_path) as db:
|
||||
db.row_factory = aiosqlite.Row
|
||||
if event_type:
|
||||
async with db.execute(
|
||||
"SELECT * FROM strategy_events WHERE strategy_name=? AND event_type=? "
|
||||
"ORDER BY recorded_at DESC LIMIT ?",
|
||||
(strategy_name, event_type, limit),
|
||||
) as cur:
|
||||
rows = await cur.fetchall()
|
||||
else:
|
||||
async with db.execute(
|
||||
"SELECT * FROM strategy_events WHERE strategy_name=? "
|
||||
"ORDER BY recorded_at DESC LIMIT ?",
|
||||
(strategy_name, limit),
|
||||
) as cur:
|
||||
rows = await cur.fetchall()
|
||||
result = []
|
||||
for r in rows:
|
||||
d = dict(r)
|
||||
if d.get("payload_json"):
|
||||
d["payload"] = json.loads(d.pop("payload_json"))
|
||||
result.append(d)
|
||||
return result
|
||||
|
||||
|
||||
# Singleton
|
||||
_db: Optional[StrategyDB] = None
|
||||
|
||||
|
||||
def get_strategy_db(data_dir: Optional[Path] = None) -> StrategyDB:
|
||||
global _db
|
||||
if _db is None:
|
||||
if data_dir is None:
|
||||
import os
|
||||
data_dir = Path(os.environ.get("DATA_DIR", "/app/data"))
|
||||
_db = StrategyDB(data_dir / "dexorder.db")
|
||||
return _db
|
||||
152
sandbox/dexorder/strategy/event_bridge.py
Normal file
152
sandbox/dexorder/strategy/event_bridge.py
Normal file
@@ -0,0 +1,152 @@
|
||||
"""
|
||||
StrategyEventBridge — receives internal strategy events from subprocesses
|
||||
and forwards them to the user-facing EventPublisher.
|
||||
|
||||
Architecture:
|
||||
Strategy subprocess ──PUSH──> [IPC socket] ──PULL──> StrategyEventBridge
|
||||
└─> EventPublisher
|
||||
├── XPUB (informational)
|
||||
└── DEALER (critical)
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import time
|
||||
from typing import Optional
|
||||
|
||||
import zmq
|
||||
import zmq.asyncio
|
||||
|
||||
from .events import StrategyEvent, StrategyEventType, IPC_ENDPOINT
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
# How long without a heartbeat before a strategy is considered dead (seconds)
|
||||
HEARTBEAT_TIMEOUT = 60.0
|
||||
|
||||
|
||||
class StrategyEventBridge:
|
||||
"""
|
||||
Binds a ZMQ PULL socket and relays strategy events to EventPublisher.
|
||||
|
||||
Also monitors heartbeats to detect crashed strategy subprocesses.
|
||||
"""
|
||||
|
||||
def __init__(self, event_publisher, strategy_lifecycle=None):
|
||||
"""
|
||||
Args:
|
||||
event_publisher: dexorder.events.publisher.EventPublisher instance
|
||||
strategy_lifecycle: StrategyLifecycleManager (optional) for marking crashed strategies
|
||||
"""
|
||||
self._publisher = event_publisher
|
||||
self._lifecycle = strategy_lifecycle
|
||||
self._ctx: Optional[zmq.asyncio.Context] = None
|
||||
self._socket: Optional[zmq.asyncio.Socket] = None
|
||||
self._task: Optional[asyncio.Task] = None
|
||||
self._heartbeat_task: Optional[asyncio.Task] = None
|
||||
self._last_heartbeat: dict[str, float] = {} # strategy_name -> timestamp
|
||||
self._running = False
|
||||
|
||||
async def start(self) -> None:
|
||||
"""Bind PULL socket and start receive loop."""
|
||||
self._ctx = zmq.asyncio.Context.instance()
|
||||
self._socket = self._ctx.socket(zmq.PULL)
|
||||
self._socket.bind(IPC_ENDPOINT)
|
||||
self._running = True
|
||||
self._task = asyncio.create_task(self._receive_loop())
|
||||
self._heartbeat_task = asyncio.create_task(self._heartbeat_monitor())
|
||||
log.info("StrategyEventBridge started on %s", IPC_ENDPOINT)
|
||||
|
||||
async def stop(self) -> None:
|
||||
"""Stop receive loop and close socket."""
|
||||
self._running = False
|
||||
for task in [self._task, self._heartbeat_task]:
|
||||
if task:
|
||||
task.cancel()
|
||||
try:
|
||||
await task
|
||||
except asyncio.CancelledError:
|
||||
pass
|
||||
if self._socket:
|
||||
self._socket.close()
|
||||
log.info("StrategyEventBridge stopped")
|
||||
|
||||
def notify_strategy_started(self, strategy_name: str) -> None:
|
||||
"""Called by lifecycle manager when a strategy subprocess starts."""
|
||||
self._last_heartbeat[strategy_name] = time.time()
|
||||
|
||||
def notify_strategy_stopped(self, strategy_name: str) -> None:
|
||||
"""Called by lifecycle manager when a strategy is deactivated."""
|
||||
self._last_heartbeat.pop(strategy_name, None)
|
||||
|
||||
async def _receive_loop(self) -> None:
|
||||
while self._running:
|
||||
try:
|
||||
raw = await asyncio.wait_for(self._socket.recv(), timeout=1.0)
|
||||
event = StrategyEvent.deserialize(raw)
|
||||
await self._handle_event(event)
|
||||
except asyncio.TimeoutError:
|
||||
continue
|
||||
except asyncio.CancelledError:
|
||||
raise
|
||||
except Exception as e:
|
||||
log.error("Error receiving strategy event: %s", e)
|
||||
|
||||
async def _handle_event(self, event: StrategyEvent) -> None:
|
||||
"""Translate internal StrategyEvent to UserEvent and publish."""
|
||||
from dexorder.events.types import EventType, Priority
|
||||
from dexorder.events.publisher import UserEvent, DeliverySpec
|
||||
|
||||
name = event.strategy_name
|
||||
|
||||
if event.event_type == StrategyEventType.HEARTBEAT:
|
||||
self._last_heartbeat[name] = time.time()
|
||||
return # heartbeats are not forwarded to the user
|
||||
|
||||
# Map to UserEvent types
|
||||
type_map = {
|
||||
StrategyEventType.STARTED: (EventType.STRATEGY_STARTED, Priority.INFORMATIONAL),
|
||||
StrategyEventType.STOPPED: (EventType.STRATEGY_STOPPED, Priority.INFORMATIONAL),
|
||||
StrategyEventType.ORDER_SUBMITTED: (EventType.ORDER_PLACED, Priority.NORMAL),
|
||||
StrategyEventType.ORDER_FILLED: (EventType.ORDER_FILLED, Priority.CRITICAL),
|
||||
StrategyEventType.POSITION_UPDATE: (EventType.POSITION_UPDATED, Priority.INFORMATIONAL),
|
||||
StrategyEventType.PNL_UPDATE: (EventType.STRATEGY_LOG, Priority.INFORMATIONAL),
|
||||
StrategyEventType.ERROR: (EventType.STRATEGY_ERROR, Priority.CRITICAL),
|
||||
StrategyEventType.LOG: (EventType.STRATEGY_LOG, Priority.INFORMATIONAL),
|
||||
}
|
||||
et, priority = type_map.get(event.event_type, (EventType.STRATEGY_LOG, Priority.INFORMATIONAL))
|
||||
|
||||
payload = {"strategy_name": name, **event.payload}
|
||||
|
||||
delivery = (
|
||||
DeliverySpec.critical() if priority == Priority.CRITICAL
|
||||
else DeliverySpec.informational()
|
||||
)
|
||||
|
||||
try:
|
||||
from dexorder.events.types import UserEvent as UE
|
||||
await self._publisher.publish(UE(
|
||||
event_type=et,
|
||||
payload=payload,
|
||||
delivery=delivery,
|
||||
))
|
||||
except Exception as e:
|
||||
log.error("Failed to publish strategy event %s: %s", event.event_type, e)
|
||||
|
||||
async def _heartbeat_monitor(self) -> None:
|
||||
"""Periodically check for strategies that stopped sending heartbeats."""
|
||||
while self._running:
|
||||
try:
|
||||
await asyncio.sleep(30)
|
||||
now = time.time()
|
||||
for name, last_seen in list(self._last_heartbeat.items()):
|
||||
if now - last_seen > HEARTBEAT_TIMEOUT:
|
||||
log.warning("Strategy '%s' missed heartbeat, marking as crashed", name)
|
||||
self._last_heartbeat.pop(name, None)
|
||||
if self._lifecycle:
|
||||
await self._lifecycle.mark_crashed(name)
|
||||
except asyncio.CancelledError:
|
||||
raise
|
||||
except Exception as e:
|
||||
log.error("Heartbeat monitor error: %s", e)
|
||||
61
sandbox/dexorder/strategy/events.py
Normal file
61
sandbox/dexorder/strategy/events.py
Normal file
@@ -0,0 +1,61 @@
|
||||
"""
|
||||
Internal strategy event types for subprocess → main-process communication.
|
||||
|
||||
Strategy subprocesses push StrategyEvents via ZMQ PUSH socket.
|
||||
The main process's StrategyEventBridge receives them via PULL and forwards
|
||||
them to the user-facing EventPublisher (dexorder/events/publisher.py).
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import time
|
||||
import uuid
|
||||
from dataclasses import dataclass, field
|
||||
from enum import IntEnum
|
||||
|
||||
|
||||
class StrategyEventType(IntEnum):
|
||||
"""Internal event types produced by strategy subprocesses."""
|
||||
STARTED = 1
|
||||
STOPPED = 2
|
||||
HEARTBEAT = 3
|
||||
ORDER_SUBMITTED = 10
|
||||
ORDER_FILLED = 11
|
||||
POSITION_UPDATE = 20
|
||||
PNL_UPDATE = 21
|
||||
ERROR = 30
|
||||
LOG = 31
|
||||
|
||||
|
||||
@dataclass
|
||||
class StrategyEvent:
|
||||
"""Internal event envelope sent from strategy subprocess to main process."""
|
||||
event_type: StrategyEventType
|
||||
strategy_name: str
|
||||
payload: dict
|
||||
timestamp: float = field(default_factory=time.time)
|
||||
event_id: str = field(default_factory=lambda: str(uuid.uuid4())[:8])
|
||||
|
||||
def serialize(self) -> bytes:
|
||||
return json.dumps({
|
||||
"event_type": int(self.event_type),
|
||||
"strategy_name": self.strategy_name,
|
||||
"payload": self.payload,
|
||||
"timestamp": self.timestamp,
|
||||
"event_id": self.event_id,
|
||||
}).encode()
|
||||
|
||||
@classmethod
|
||||
def deserialize(cls, data: bytes) -> "StrategyEvent":
|
||||
d = json.loads(data.decode())
|
||||
return cls(
|
||||
event_type=StrategyEventType(d["event_type"]),
|
||||
strategy_name=d["strategy_name"],
|
||||
payload=d.get("payload", {}),
|
||||
timestamp=d.get("timestamp", time.time()),
|
||||
event_id=d.get("event_id", ""),
|
||||
)
|
||||
|
||||
|
||||
# IPC endpoint used for strategy subprocess → main process communication
|
||||
IPC_ENDPOINT = "ipc:///tmp/dexorder-strategy-events.sock"
|
||||
322
sandbox/dexorder/strategy/lifecycle.py
Normal file
322
sandbox/dexorder/strategy/lifecycle.py
Normal file
@@ -0,0 +1,322 @@
|
||||
"""
|
||||
StrategyLifecycleManager — manages running strategy subprocesses.
|
||||
|
||||
Responsibilities:
|
||||
- Starting strategy subprocesses from git worktrees
|
||||
- Stopping subprocesses on deactivation
|
||||
- Persisting state to SQLite for crash recovery
|
||||
- Registering strategies as LifecycleManager triggers (prevents idle shutdown)
|
||||
- Enforcing max concurrent strategy limit
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import threading
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
MAX_CONCURRENT_STRATEGIES = 5
|
||||
DEFAULT_POLL_INTERVAL = 60 # seconds between bar checks
|
||||
|
||||
|
||||
class StrategyLifecycleManager:
|
||||
|
||||
def __init__(self, data_dir: Path, event_bridge=None, lifecycle_manager=None):
|
||||
self.data_dir = data_dir
|
||||
self.worktrees_dir = data_dir / "worktrees"
|
||||
self.configs_dir = data_dir / "strategy_configs"
|
||||
self._bridge = event_bridge
|
||||
self._lifecycle = lifecycle_manager # dexorder LifecycleManager
|
||||
self._runners: dict[str, tuple[threading.Thread, threading.Event]] = {} # name -> (thread, stop_event)
|
||||
self._db: Optional["StrategyDB"] = None
|
||||
|
||||
async def initialize(self) -> None:
|
||||
"""Initialize DB and prune stale worktrees."""
|
||||
from dexorder.strategy.db import get_strategy_db
|
||||
from dexorder.tools.python_tools import get_category_manager
|
||||
|
||||
self._db = get_strategy_db(self.data_dir)
|
||||
await self._db.initialize()
|
||||
self.worktrees_dir.mkdir(parents=True, exist_ok=True)
|
||||
self.configs_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Prune any git worktrees that are no longer registered
|
||||
try:
|
||||
mgr = get_category_manager(self.data_dir)
|
||||
mgr.git.prune_worktrees()
|
||||
except Exception as e:
|
||||
log.warning("git worktree prune failed: %s", e)
|
||||
|
||||
async def resume_running(self) -> None:
|
||||
"""On container restart, re-launch strategies that were 'running' at shutdown."""
|
||||
if self._db is None:
|
||||
return
|
||||
try:
|
||||
running = await self._db.get_running_strategies()
|
||||
for row in running:
|
||||
name = row["name"]
|
||||
log.info("Resuming strategy '%s' after container restart", name)
|
||||
feeds = json.loads(row.get("feeds_json") or "[]")
|
||||
await self.activate(
|
||||
strategy_name=name,
|
||||
feeds=feeds,
|
||||
allocation=row.get("allocation", 10_000.0),
|
||||
paper=bool(row.get("paper", 1)),
|
||||
_resume=True,
|
||||
)
|
||||
except Exception as e:
|
||||
log.error("Failed to resume strategies: %s", e)
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Activate / Deactivate
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
    async def activate(
        self,
        strategy_name: str,
        feeds: list[dict],
        allocation: float,
        paper: bool = True,
        git_revision: str = "HEAD",
        _resume: bool = False,
    ) -> dict:
        """
        Activate a strategy.

        Creates a git worktree at the given revision, writes a runner config
        file, and launches the strategy in a daemon thread running
        runner.run_thread (not a separate OS process).

        Args:
            strategy_name: Display name of the strategy to start.
            feeds: Feed dicts, e.g. [{"symbol": ..., "period_seconds": ...}].
            allocation: Capital allocated, in quote currency.
            paper: True for simulated fills (only the flag is persisted here).
            git_revision: Revision to pin the worktree to; "HEAD" uses the
                current short hash.
            _resume: Set by resume_running(); currently not branched on here.

        Returns a dict with status and details, or {"error": ...} on failure.
        """
        # Refuse double-activation of the same name.
        if strategy_name in self._runners:
            return {"error": f"Strategy '{strategy_name}' is already running"}

        # Enforce the concurrency cap before doing any filesystem work.
        if len(self._runners) >= MAX_CONCURRENT_STRATEGIES:
            return {
                "error": f"Maximum concurrent strategies ({MAX_CONCURRENT_STRATEGIES}) reached. "
                "Deactivate a running strategy first."
            }

        # Build worktree
        from dexorder.tools.python_tools import get_category_manager, sanitize_name
        mgr = get_category_manager(self.data_dir)

        safe_name = sanitize_name(strategy_name)
        impl_path = self.data_dir / "src" / "strategy" / safe_name / "implementation.py"
        if not impl_path.exists():
            return {"error": f"Strategy '{strategy_name}' not found at {impl_path}"}

        try:
            # Worktree name embeds the short hash so the same strategy can be
            # pinned at different revisions without collisions.
            short_hash = mgr.git.head_short_hash() if git_revision == "HEAD" else git_revision[:7]
            worktree_name = f"{safe_name}_{short_hash}"
            worktree_path = self.worktrees_dir / worktree_name

            if not worktree_path.exists():
                actual_hash = mgr.git.create_worktree(worktree_path, git_revision)
            else:
                # Reuse an existing worktree; assume it matches short_hash.
                actual_hash = short_hash
        except Exception as e:
            return {"error": f"Failed to create git worktree: {e}"}

        worktree_impl = worktree_path / "src" / "strategy" / safe_name / "implementation.py"
        if not worktree_impl.exists():
            # Fall back to live impl (worktree may not include subdirs on first use)
            worktree_impl = impl_path

        # Feed configs as list of [ticker, period_seconds]
        feed_configs = [[f.get("symbol", ""), int(f.get("period_seconds", 3600))] for f in feeds]

        # Write runner config to a temp file under DATA_DIR
        runner_config = {
            "strategy_name": strategy_name,
            "impl_path": str(worktree_impl),
            "feed_configs": feed_configs,
            "allocation": allocation,
            "ipc_endpoint": "ipc:///tmp/dexorder-strategy-events.sock",
            "data_dir": str(self.data_dir),
            "poll_interval": DEFAULT_POLL_INTERVAL,
        }
        config_file = self.configs_dir / f"{safe_name}.json"
        config_file.write_text(json.dumps(runner_config, indent=2))

        # Launch strategy in a daemon thread
        try:
            from dexorder.strategy.runner import run_thread
            stop_event = threading.Event()
            thread = threading.Thread(
                target=run_thread,
                args=(runner_config, stop_event),
                daemon=True,
                name=f"strategy-{safe_name}",
            )
            thread.start()
        except Exception as e:
            return {"error": f"Failed to start strategy thread: {e}"}

        # Record the (thread, stop_event) pair so deactivate() can stop it.
        self._runners[strategy_name] = (thread, stop_event)

        # Register as lifecycle trigger
        if self._lifecycle:
            self._lifecycle.add_trigger(f"strategy:{strategy_name}")

        # Notify event bridge
        if self._bridge:
            self._bridge.notify_strategy_started(strategy_name)

        # Persist to DB
        if self._db:
            await self._db.upsert_strategy(
                name=strategy_name,
                status="running",
                allocation=allocation,
                paper=paper,
                feeds=feeds,
                git_rev=actual_hash,
                worktree_path=str(worktree_path),
                config=runner_config,
            )

        log.info("Strategy '%s' activated (thread=%d, rev=%s)", strategy_name, thread.ident, actual_hash)
        return {
            "status": "activated",
            "strategy_name": strategy_name,
            "paper": paper,
            "allocation": allocation,
            "git_revision": actual_hash,
            "thread_id": thread.ident,
        }
|
||||
|
||||
async def deactivate(self, strategy_name: str) -> dict:
|
||||
"""Stop a running strategy and clean up its worktree."""
|
||||
entry = self._runners.pop(strategy_name, None)
|
||||
if entry is None:
|
||||
return {"error": f"Strategy '{strategy_name}' is not running"}
|
||||
|
||||
thread, stop_event = entry
|
||||
|
||||
# Signal the runner to stop and wait for the thread to exit
|
||||
stop_event.set()
|
||||
await asyncio.get_event_loop().run_in_executor(
|
||||
None, lambda: thread.join(timeout=15)
|
||||
)
|
||||
if thread.is_alive():
|
||||
log.warning("Strategy '%s' thread did not exit within timeout", strategy_name)
|
||||
|
||||
# Remove lifecycle trigger
|
||||
if self._lifecycle:
|
||||
self._lifecycle.remove_trigger(f"strategy:{strategy_name}")
|
||||
|
||||
# Notify bridge
|
||||
if self._bridge:
|
||||
self._bridge.notify_strategy_stopped(strategy_name)
|
||||
|
||||
# Get final PnL from DB
|
||||
final_pnl = 0.0
|
||||
if self._db:
|
||||
state = await self._db.get_pnl_state(strategy_name)
|
||||
if state:
|
||||
final_pnl = state.get("realized_pnl", 0.0)
|
||||
await self._db.update_strategy_status(strategy_name, "stopped")
|
||||
|
||||
# Clean up worktree
|
||||
await self._cleanup_worktree(strategy_name)
|
||||
|
||||
log.info("Strategy '%s' deactivated, final_pnl=%.4f", strategy_name, final_pnl)
|
||||
return {
|
||||
"status": "deactivated",
|
||||
"strategy_name": strategy_name,
|
||||
"final_pnl": final_pnl,
|
||||
}
|
||||
|
||||
async def mark_crashed(self, strategy_name: str) -> None:
|
||||
"""Mark a strategy as crashed (called by heartbeat monitor)."""
|
||||
self._runners.pop(strategy_name, None)
|
||||
if self._lifecycle:
|
||||
self._lifecycle.remove_trigger(f"strategy:{strategy_name}")
|
||||
if self._db:
|
||||
await self._db.update_strategy_status(strategy_name, "error")
|
||||
log.error("Strategy '%s' marked as crashed (heartbeat timeout)", strategy_name)
|
||||
|
||||
async def update_pnl(self, strategy_name: str, payload: dict) -> None:
|
||||
"""Called by event bridge when a PNL_UPDATE event arrives."""
|
||||
if self._db:
|
||||
await self._db.update_pnl_state(
|
||||
name=strategy_name,
|
||||
realized_pnl=payload.get("realized_pnl", 0.0),
|
||||
unrealized_pnl=payload.get("unrealized_pnl", 0.0),
|
||||
trade_count=payload.get("trade_count", 0),
|
||||
)
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Listing
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
async def list_active(self) -> list[dict]:
|
||||
"""Return currently running strategies with PnL state."""
|
||||
if self._db is None:
|
||||
return []
|
||||
strategies = await self._db.get_running_strategies()
|
||||
result = []
|
||||
for s in strategies:
|
||||
name = s["name"]
|
||||
state = await self._db.get_pnl_state(name)
|
||||
entry = {
|
||||
"strategy_name": name,
|
||||
"status": s.get("status", "unknown"),
|
||||
"paper": bool(s.get("paper", 1)),
|
||||
"allocation": s.get("allocation", 0),
|
||||
"git_revision": s.get("git_rev"),
|
||||
"started_at": s.get("started_at"),
|
||||
"feeds": json.loads(s.get("feeds_json") or "[]"),
|
||||
"realized_pnl": state.get("realized_pnl", 0.0) if state else 0.0,
|
||||
"unrealized_pnl": state.get("unrealized_pnl", 0.0) if state else 0.0,
|
||||
"trade_count": state.get("trade_count", 0) if state else 0,
|
||||
}
|
||||
result.append(entry)
|
||||
return result
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Shutdown
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
async def shutdown(self) -> None:
|
||||
"""Stop all running strategies on container shutdown."""
|
||||
names = list(self._runners.keys())
|
||||
for name in names:
|
||||
await self.deactivate(name)
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Internal
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
async def _cleanup_worktree(self, strategy_name: str) -> None:
|
||||
if self._db is None:
|
||||
return
|
||||
try:
|
||||
row = await self._db.get_strategy(strategy_name)
|
||||
wt = row.get("worktree_path") if row else None
|
||||
if wt:
|
||||
from dexorder.tools.python_tools import get_category_manager
|
||||
mgr = get_category_manager(self.data_dir)
|
||||
mgr.git.remove_worktree(Path(wt))
|
||||
except Exception as e:
|
||||
log.warning("Worktree cleanup failed for '%s': %s", strategy_name, e)
|
||||
|
||||
|
||||
# Singleton
_lifecycle_manager: Optional[StrategyLifecycleManager] = None


def get_strategy_lifecycle(data_dir: Optional[Path] = None) -> StrategyLifecycleManager:
    """Return the process-wide StrategyLifecycleManager, creating it on first use.

    data_dir is only honored on the first call; later calls return the
    already-constructed singleton.
    """
    global _lifecycle_manager
    if _lifecycle_manager is not None:
        return _lifecycle_manager
    if data_dir is None:
        import os
        data_dir = Path(os.environ.get("DATA_DIR", "/app/data"))
    _lifecycle_manager = StrategyLifecycleManager(data_dir)
    return _lifecycle_manager
|
||||
196
sandbox/dexorder/strategy/paper_account.py
Normal file
196
sandbox/dexorder/strategy/paper_account.py
Normal file
@@ -0,0 +1,196 @@
|
||||
"""
|
||||
Lightweight paper trading account for strategy subprocesses.
|
||||
|
||||
Simulates order execution at bar-close prices without requiring Nautilus TradingNode.
|
||||
Tracks positions, PnL, and trade history. All amounts are in the quote currency.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Optional
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class Position:
    """An open position."""
    # Feed key / instrument identifier this position was opened against.
    instrument: str
    side: str  # "long" or "short"
    # Position size in base units; presumably quote-currency sizing is done
    # by the caller — TODO confirm.
    quantity: float
    # Fill price at open, in quote currency.
    entry_price: float
    entry_time: float  # Unix timestamp
|
||||
|
||||
|
||||
@dataclass
class Trade:
    """A completed round-trip trade."""
    # Feed key / instrument identifier the trade was made on.
    instrument: str
    side: str  # direction of the entry
    quantity: float
    entry_price: float
    exit_price: float
    # Unix timestamps of open and close fills.
    entry_time: float
    exit_time: float
    # Realized profit/loss in quote currency for this round trip.
    pnl: float
|
||||
|
||||
|
||||
class PaperAccount:
    """
    Simulates a cash paper account for a single strategy.

    Positions are opened/closed by calling buy(), sell(), and flatten().
    Fills execute at the provided price (e.g. bar close).

    Accounting model (as implemented below):
      * At most one open position per feed key.
      * Opening a position does NOT debit `balance`; `balance` moves only by
        realized PnL when a position closes.
      * Closing ignores the requested quantity — the whole position closes.
    """

    def __init__(self, initial_capital: float, feed_key: Optional[str] = None):
        # Starting capital; `balance` drifts from it by realized PnL only.
        self.initial_capital = initial_capital
        self.balance = initial_capital
        self._positions: dict[str, Position] = {}  # feed_key → Position
        self._trades: list[Trade] = []  # completed round trips, in close order
        # Feed key used when order calls omit one.
        self._default_feed_key = feed_key

    # ------------------------------------------------------------------
    # Order API (mirrors PandasStrategy's order API)
    # ------------------------------------------------------------------

    def buy(self, quantity: float, price: float, feed_key: Optional[str] = None) -> None:
        """Open a long or close a short at price.

        NOTE(review): if a long already exists for this feed key the call is a
        silent no-op (no averaging in); when closing a short, `quantity` is
        ignored and the entire position is closed — confirm both are intended.
        """
        fk = feed_key or self._default_feed_key or "default"
        existing = self._positions.get(fk)

        if existing and existing.side == "short":
            # Close short: profit when price has fallen below entry.
            pnl = (existing.entry_price - price) * existing.quantity
            self._close_position(fk, price, pnl)
        elif not existing:
            # Open long
            cost = price * quantity
            if cost > self.balance:
                quantity = self.balance / price  # size down to available capital
            if quantity > 0:
                self._positions[fk] = Position(
                    instrument=fk, side="long", quantity=quantity,
                    entry_price=price, entry_time=time.time(),
                )
        log.debug("Paper BUY %.6f @ %.2f (%s)", quantity, price, fk)

    def sell(self, quantity: float, price: float, feed_key: Optional[str] = None) -> None:
        """Open a short or close a long at price.

        NOTE(review): mirrors buy() — silent no-op if a short already exists;
        `quantity` ignored when closing a long.
        """
        fk = feed_key or self._default_feed_key or "default"
        existing = self._positions.get(fk)

        if existing and existing.side == "long":
            # Close long: profit when price has risen above entry.
            pnl = (price - existing.entry_price) * existing.quantity
            self._close_position(fk, price, pnl)
        elif not existing:
            # Open short (using margin — simplified: require 2x capital)
            cost = price * quantity * 2
            if cost > self.balance:
                quantity = self.balance / (price * 2)
            if quantity > 0:
                self._positions[fk] = Position(
                    instrument=fk, side="short", quantity=quantity,
                    entry_price=price, entry_time=time.time(),
                )
        log.debug("Paper SELL %.6f @ %.2f (%s)", quantity, price, fk)

    def flatten(self, price: float, feed_key: Optional[str] = None) -> None:
        """Close any open position at price.

        With no feed_key, closes ALL open positions at the same price —
        callers with multiple feeds should pass feed_key per instrument.
        """
        if feed_key:
            keys = [feed_key]
        else:
            keys = list(self._positions.keys())

        for fk in keys:
            pos = self._positions.get(fk)
            if pos is None:
                continue
            if pos.side == "long":
                pnl = (price - pos.entry_price) * pos.quantity
            else:
                pnl = (pos.entry_price - price) * pos.quantity
            self._close_position(fk, price, pnl)

    # ------------------------------------------------------------------
    # Reporting
    # ------------------------------------------------------------------

    def unrealized_pnl(self, current_prices: dict[str, float]) -> float:
        """Compute unrealized PnL using current prices.

        Positions whose feed key is missing from current_prices contribute 0.
        """
        total = 0.0
        for fk, pos in self._positions.items():
            price = current_prices.get(fk)
            if price is None:
                continue
            if pos.side == "long":
                total += (price - pos.entry_price) * pos.quantity
            else:
                total += (pos.entry_price - price) * pos.quantity
        return total

    def realized_pnl(self) -> float:
        """Sum of PnL over all closed round-trip trades."""
        return sum(t.pnl for t in self._trades)

    def total_pnl(self, current_prices: dict[str, float] | None = None) -> float:
        """Realized plus (when prices are supplied) unrealized PnL."""
        rpnl = self.realized_pnl()
        upnl = self.unrealized_pnl(current_prices) if current_prices else 0.0
        return rpnl + upnl

    def trade_count(self) -> int:
        """Number of completed round-trip trades."""
        return len(self._trades)

    def win_rate(self) -> float:
        """Fraction of closed trades with strictly positive PnL (0.0 if none)."""
        if not self._trades:
            return 0.0
        wins = sum(1 for t in self._trades if t.pnl > 0)
        return wins / len(self._trades)

    def positions(self) -> dict[str, dict]:
        """Open positions keyed by feed key, as plain dicts for serialization."""
        return {
            fk: {
                "side": p.side,
                "quantity": p.quantity,
                "entry_price": p.entry_price,
            }
            for fk, p in self._positions.items()
        }

    def recent_trades(self, n: int = 50) -> list[dict]:
        """The most recent n closed trades, oldest first, as plain dicts."""
        return [
            {
                "instrument": t.instrument,
                "side": t.side,
                "quantity": round(t.quantity, 8),
                "entry_price": round(t.entry_price, 8),
                "exit_price": round(t.exit_price, 8),
                "entry_time": t.entry_time,
                "exit_time": t.exit_time,
                "pnl": round(t.pnl, 6),
            }
            for t in self._trades[-n:]
        ]

    # ------------------------------------------------------------------
    # Internal
    # ------------------------------------------------------------------

    def _close_position(self, fk: str, price: float, pnl: float) -> None:
        # Pop-then-check makes a double close a harmless no-op.
        pos = self._positions.pop(fk, None)
        if pos is None:
            return
        self.balance += pnl
        self._trades.append(Trade(
            instrument=fk,
            side=pos.side,
            quantity=pos.quantity,
            entry_price=pos.entry_price,
            exit_price=price,
            entry_time=pos.entry_time,
            exit_time=time.time(),
            pnl=pnl,
        ))
        log.debug("Paper trade closed: pnl=%.4f balance=%.2f (%s)", pnl, self.balance, fk)
|
||||
395
sandbox/dexorder/strategy/runner.py
Normal file
395
sandbox/dexorder/strategy/runner.py
Normal file
@@ -0,0 +1,395 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Strategy subprocess runner.
|
||||
|
||||
Loads a PandasStrategy from a git worktree path, subscribes to live bar data
|
||||
(polling DataAPI), runs the paper trading loop, and pushes events to the main
|
||||
MCP process via ZMQ PUSH.
|
||||
|
||||
Usage:
|
||||
python -m dexorder.strategy.runner --config <json_config_path>
|
||||
|
||||
Config JSON:
|
||||
{
|
||||
"strategy_name": "My RSI Strategy",
|
||||
"impl_path": "/app/data/worktrees/my_rsi_strategy_abc1234/strategy/my_rsi_strategy/implementation.py",
|
||||
"feed_configs": [["BTC/USDT.BINANCE", 3600]],
|
||||
"allocation": 5000.0,
|
||||
"ipc_endpoint": "ipc:///tmp/dexorder-strategy-events.sock",
|
||||
"data_dir": "/app/data",
|
||||
"poll_interval": 60
|
||||
}
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
import threading
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
# Ensure the worktree's parent (which contains dexorder package) is on the path.
|
||||
# Also ensure the original dexorder package is importable.
|
||||
sys.path.insert(0, str(Path(__file__).resolve().parent.parent.parent))
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class StrategyRunner:
|
||||
"""Runs a PandasStrategy in paper trading mode using DataAPI polling."""
|
||||
|
||||
def __init__(self, config: dict, stop_event: threading.Event | None = None):
|
||||
self.strategy_name = config["strategy_name"]
|
||||
self.impl_path = Path(config["impl_path"])
|
||||
self.feed_configs: list[tuple[str, int]] = [
|
||||
(f[0], int(f[1])) for f in config["feed_configs"]
|
||||
]
|
||||
self.allocation = float(config.get("allocation", 10_000.0))
|
||||
self.ipc_endpoint = config.get("ipc_endpoint", "ipc:///tmp/dexorder-strategy-events.sock")
|
||||
self.data_dir = Path(config.get("data_dir", "/app/data"))
|
||||
self.poll_interval = int(config.get("poll_interval", 60)) # seconds
|
||||
|
||||
self._stop_event = stop_event or threading.Event()
|
||||
self._running = False
|
||||
self._push_socket = None
|
||||
self._strategy = None
|
||||
self._paper: "PaperAccount | None" = None
|
||||
self._last_timestamps: dict[str, int] = {} # feed_key -> last seen timestamp_ns
|
||||
|
||||
async def run(self) -> None:
|
||||
"""Main async entry point."""
|
||||
self._setup_zmq()
|
||||
await self._push_event("STARTED", {})
|
||||
|
||||
try:
|
||||
await self._setup_strategy()
|
||||
await self._trading_loop()
|
||||
except asyncio.CancelledError:
|
||||
pass
|
||||
except Exception as e:
|
||||
log.exception("Strategy runner fatal error")
|
||||
await self._push_event("ERROR", {"message": str(e)})
|
||||
finally:
|
||||
await self._push_event("STOPPED", {
|
||||
"pnl": self._paper.realized_pnl() if self._paper else 0.0,
|
||||
"trade_count": self._paper.trade_count() if self._paper else 0,
|
||||
})
|
||||
self._cleanup_zmq()
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Setup
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def _setup_zmq(self) -> None:
|
||||
import zmq
|
||||
ctx = zmq.Context.instance()
|
||||
self._push_socket = ctx.socket(zmq.PUSH)
|
||||
self._push_socket.connect(self.ipc_endpoint)
|
||||
log.info("Connected PUSH socket to %s", self.ipc_endpoint)
|
||||
|
||||
def _cleanup_zmq(self) -> None:
|
||||
if self._push_socket:
|
||||
self._push_socket.close()
|
||||
|
||||
async def _setup_strategy(self) -> None:
|
||||
from dexorder.nautilus.backtest_runner import _load_strategy_class, _setup_custom_indicators
|
||||
from dexorder.nautilus.pandas_strategy import PandasStrategyConfig, make_feed_key
|
||||
from dexorder.strategy.paper_account import PaperAccount
|
||||
|
||||
# Register custom indicators
|
||||
try:
|
||||
_setup_custom_indicators(self.data_dir)
|
||||
except Exception as e:
|
||||
log.warning("Custom indicator setup failed: %s", e)
|
||||
|
||||
# Load strategy class from worktree impl path
|
||||
strategy_class = _load_strategy_class(self.impl_path)
|
||||
log.info("Loaded strategy class: %s", strategy_class.__name__)
|
||||
|
||||
feed_keys = tuple(make_feed_key(t, p) for t, p in self.feed_configs)
|
||||
config = PandasStrategyConfig(
|
||||
strategy_id=f"{strategy_class.__name__}-PAPER",
|
||||
feed_keys=feed_keys,
|
||||
initial_capital=self.allocation,
|
||||
)
|
||||
self._strategy = strategy_class(config=config)
|
||||
self._paper = PaperAccount(self.allocation, feed_keys[0] if feed_keys else None)
|
||||
|
||||
# Wire paper account into strategy's order methods
|
||||
self._wire_paper_account(feed_keys)
|
||||
log.info("Strategy '%s' initialized with %d feed(s)", self.strategy_name, len(feed_keys))
|
||||
|
||||
def _wire_paper_account(self, feed_keys: tuple) -> None:
|
||||
"""Replace strategy's order methods with paper account calls."""
|
||||
paper = self._paper
|
||||
from dexorder.nautilus.pandas_strategy import make_feed_key
|
||||
|
||||
def paper_buy(quantity, feed_key=None):
|
||||
fk = feed_key or (feed_keys[0] if feed_keys else "default")
|
||||
# Get current close price from last seen bars
|
||||
price = self._current_price(fk)
|
||||
if price:
|
||||
paper.buy(quantity, price, fk)
|
||||
asyncio.create_task(self._push_event("ORDER_FILLED", {
|
||||
"side": "buy", "quantity": quantity,
|
||||
"price": price, "feed_key": fk,
|
||||
"pnl": paper.realized_pnl(),
|
||||
}))
|
||||
|
||||
def paper_sell(quantity, feed_key=None):
|
||||
fk = feed_key or (feed_keys[0] if feed_keys else "default")
|
||||
price = self._current_price(fk)
|
||||
if price:
|
||||
paper.sell(quantity, price, fk)
|
||||
asyncio.create_task(self._push_event("ORDER_FILLED", {
|
||||
"side": "sell", "quantity": quantity,
|
||||
"price": price, "feed_key": fk,
|
||||
"pnl": paper.realized_pnl(),
|
||||
}))
|
||||
|
||||
def paper_flatten(feed_key=None):
|
||||
if feed_key:
|
||||
fk_list = [feed_key]
|
||||
else:
|
||||
fk_list = list(feed_keys)
|
||||
for fk in fk_list:
|
||||
price = self._current_price(fk)
|
||||
if price:
|
||||
paper.flatten(price, fk)
|
||||
|
||||
self._strategy.buy = paper_buy
|
||||
self._strategy.sell = paper_sell
|
||||
self._strategy.flatten = paper_flatten
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Trading loop
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
async def _trading_loop(self) -> None:
|
||||
"""Poll DataAPI for new bars and call strategy.evaluate() on each update."""
|
||||
import pandas as pd
|
||||
from dexorder.api import get_api
|
||||
from dexorder.nautilus.pandas_strategy import make_feed_key
|
||||
|
||||
api = get_api()
|
||||
accumulated: dict[str, list[dict]] = {
|
||||
make_feed_key(t, p): [] for t, p in self.feed_configs
|
||||
}
|
||||
self._current_prices: dict[str, float] = {}
|
||||
|
||||
heartbeat_task = asyncio.create_task(self._heartbeat_loop())
|
||||
self._running = True
|
||||
|
||||
try:
|
||||
while self._running and not self._stop_event.is_set():
|
||||
now = int(time.time())
|
||||
updated_any = False
|
||||
|
||||
for ticker, period_seconds in self.feed_configs:
|
||||
fk = make_feed_key(ticker, period_seconds)
|
||||
last_ts_ns = self._last_timestamps.get(fk, 0)
|
||||
# Request last N bars to catch up
|
||||
lookback = now - max(last_ts_ns // 1_000_000_000, now - 7 * 24 * 3600)
|
||||
from_time = lookback if last_ts_ns == 0 else (last_ts_ns // 1_000_000_000)
|
||||
|
||||
try:
|
||||
df = await api.data.historical_ohlc(
|
||||
ticker=ticker,
|
||||
period_seconds=period_seconds,
|
||||
start_time=from_time,
|
||||
end_time=now,
|
||||
extra_columns=["volume", "buy_vol", "sell_vol",
|
||||
"open_time", "high_time", "low_time", "close_time",
|
||||
"open_interest"],
|
||||
)
|
||||
except Exception as e:
|
||||
log.warning("OHLC fetch failed for %s: %s", fk, e)
|
||||
continue
|
||||
|
||||
if df.empty:
|
||||
continue
|
||||
|
||||
# Find new bars
|
||||
ts_col = "timestamp" if "timestamp" in df.columns else df.columns[0]
|
||||
new_bars = df[df[ts_col] > last_ts_ns] if last_ts_ns else df
|
||||
|
||||
for _, row in new_bars.iterrows():
|
||||
ts_ns = int(row.get(ts_col, 0))
|
||||
entry = {
|
||||
"timestamp": ts_ns,
|
||||
"open": float(row.get("open", 0)),
|
||||
"high": float(row.get("high", 0)),
|
||||
"low": float(row.get("low", 0)),
|
||||
"close": float(row.get("close", 0)),
|
||||
"volume": float(row.get("volume", 0)),
|
||||
"buy_vol": float(row.get("buy_vol", 0)) if "buy_vol" in row else None,
|
||||
"sell_vol": float(row.get("sell_vol", 0)) if "sell_vol" in row else None,
|
||||
"open_interest": float(row.get("open_interest", 0)) if "open_interest" in row else None,
|
||||
}
|
||||
accumulated[fk].append(entry)
|
||||
self._last_timestamps[fk] = max(self._last_timestamps.get(fk, 0), ts_ns)
|
||||
self._current_prices[fk] = entry["close"]
|
||||
updated_any = True
|
||||
|
||||
if updated_any:
|
||||
# Build DataFrames and call evaluate
|
||||
dfs = {fk: pd.DataFrame(rows) for fk, rows in accumulated.items() if rows}
|
||||
try:
|
||||
self._strategy.evaluate(dfs)
|
||||
except Exception as e:
|
||||
log.error("evaluate() error: %s", e)
|
||||
await self._push_event("ERROR", {"message": f"evaluate() error: {e}"})
|
||||
|
||||
# Push PnL update
|
||||
rpnl = self._paper.realized_pnl() if self._paper else 0.0
|
||||
upnl = self._paper.unrealized_pnl(self._current_prices) if self._paper else 0.0
|
||||
await self._push_event("PNL_UPDATE", {
|
||||
"realized_pnl": rpnl,
|
||||
"unrealized_pnl": upnl,
|
||||
"total_pnl": rpnl + upnl,
|
||||
"trade_count": self._paper.trade_count() if self._paper else 0,
|
||||
})
|
||||
|
||||
# Sleep in 1s increments so stop_event is checked promptly
|
||||
for _ in range(self.poll_interval):
|
||||
if self._stop_event.is_set():
|
||||
self._running = False
|
||||
break
|
||||
await asyncio.sleep(1)
|
||||
finally:
|
||||
heartbeat_task.cancel()
|
||||
try:
|
||||
await heartbeat_task
|
||||
except asyncio.CancelledError:
|
||||
pass
|
||||
|
||||
async def _heartbeat_loop(self) -> None:
|
||||
while True:
|
||||
await asyncio.sleep(10)
|
||||
await self._push_event("HEARTBEAT", {})
|
||||
|
||||
def _current_price(self, feed_key: str) -> float | None:
|
||||
return getattr(self, "_current_prices", {}).get(feed_key)
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Event publishing
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
async def _push_event(self, event_type: str, payload: dict) -> None:
|
||||
from dexorder.strategy.events import StrategyEvent, StrategyEventType
|
||||
type_map = {
|
||||
"STARTED": StrategyEventType.STARTED,
|
||||
"STOPPED": StrategyEventType.STOPPED,
|
||||
"HEARTBEAT": StrategyEventType.HEARTBEAT,
|
||||
"ORDER_FILLED": StrategyEventType.ORDER_FILLED,
|
||||
"POSITION_UPDATE": StrategyEventType.POSITION_UPDATE,
|
||||
"PNL_UPDATE": StrategyEventType.PNL_UPDATE,
|
||||
"ERROR": StrategyEventType.ERROR,
|
||||
"LOG": StrategyEventType.LOG,
|
||||
}
|
||||
et = type_map.get(event_type, StrategyEventType.LOG)
|
||||
event = StrategyEvent(
|
||||
event_type=et,
|
||||
strategy_name=self.strategy_name,
|
||||
payload=payload,
|
||||
)
|
||||
try:
|
||||
if self._push_socket:
|
||||
self._push_socket.send(event.serialize(), flags=1) # NOBLOCK
|
||||
except Exception as e:
|
||||
log.debug("Failed to push event %s: %s", event_type, e)
|
||||
|
||||
|
||||
def _init_api() -> None:
    """Initialize thread-local API from environment config. Non-fatal on error.

    Reads the CONFIG_PATH / SECRETS_PATH YAML files (both optional), builds a
    DataAPIImpl from the `data.iceberg` and `data.relay` config sections, and
    installs it with set_api(). Any failure is logged and swallowed so the
    strategy thread can still start; API calls will then fail at use time.
    """
    try:
        import yaml
        config_path = os.environ.get("CONFIG_PATH", "/app/config/config.yaml")
        secrets_path = os.environ.get("SECRETS_PATH", "/app/config/secrets.yaml")
        config_data, secrets_data = {}, {}
        if Path(config_path).exists():
            with open(config_path) as f:
                config_data = yaml.safe_load(f) or {}
        if Path(secrets_path).exists():
            with open(secrets_path) as f:
                secrets_data = yaml.safe_load(f) or {}

        data_cfg = config_data.get("data", {})
        iceberg_cfg = data_cfg.get("iceberg", {})
        relay_cfg = data_cfg.get("relay", {})

        from dexorder.api import set_api, API
        from dexorder.impl.charting_api_impl import ChartingAPIImpl
        from dexorder.impl.data_api_impl import DataAPIImpl

        # S3 credentials: config values win, secrets file is the fallback.
        data_api = DataAPIImpl(
            iceberg_catalog_uri=iceberg_cfg.get("catalog_uri", "http://iceberg-catalog:8181"),
            relay_endpoint=relay_cfg.get("endpoint", "tcp://relay:5559"),
            notification_endpoint=relay_cfg.get("notification_endpoint", "tcp://relay:5558"),
            namespace=iceberg_cfg.get("namespace", "trading"),
            s3_endpoint=iceberg_cfg.get("s3_endpoint") or secrets_data.get("s3_endpoint"),
            s3_access_key=iceberg_cfg.get("s3_access_key") or secrets_data.get("s3_access_key"),
            s3_secret_key=iceberg_cfg.get("s3_secret_key") or secrets_data.get("s3_secret_key"),
        )
        set_api(API(charting=ChartingAPIImpl(), data=data_api))
    except Exception as e:
        log.warning("API initialization failed: %s", e)
|
||||
|
||||
|
||||
def run_thread(config: dict, stop_event: threading.Event) -> None:
    """
    Entry point for running a strategy in a daemon thread.

    Initializes a thread-local API, creates a StrategyRunner with the given
    stop_event, and runs the async trading loop until stop_event is set.
    """
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)s [%(name)s] %(message)s",
    )
    _init_api()

    # Each strategy thread owns a private event loop for its async runner.
    event_loop = asyncio.new_event_loop()
    asyncio.set_event_loop(event_loop)
    runner = StrategyRunner(config, stop_event=stop_event)
    try:
        event_loop.run_until_complete(runner.run())
    finally:
        event_loop.close()
|
||||
|
||||
|
||||
def main():
    """Subprocess entry point (backward compatibility)."""
    import signal

    parser = argparse.ArgumentParser(description="Dexorder strategy subprocess runner")
    parser.add_argument("--config", required=True, help="Path to JSON config file")
    cli = parser.parse_args()

    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)s [%(name)s] %(message)s",
    )

    with open(cli.config) as fh:
        runner_config = json.load(fh)

    stop_event = threading.Event()

    # SIGTERM/SIGINT translate into a stop-event set so the loop exits cleanly.
    def _shutdown(signum, frame):
        log.info("Received signal %d, stopping runner", signum)
        stop_event.set()

    for sig in (signal.SIGTERM, signal.SIGINT):
        signal.signal(sig, _shutdown)

    run_thread(runner_config, stop_event)
|
||||
@@ -1,15 +1,14 @@
|
||||
"""
|
||||
activate_strategy / deactivate_strategy — start and stop live or paper trading.
|
||||
activate_strategy / deactivate_strategy / list_active_strategies
|
||||
|
||||
paper=True (default): forward paper trading — strategy runs on live data with
|
||||
simulated fills. No API keys required.
|
||||
simulated fills via PaperAccount.
|
||||
|
||||
paper=False: live trading — real order execution via user's exchange API keys,
|
||||
retrieved from the user secrets vault. Currently raises
|
||||
NotImplementedError until the vault is implemented.
|
||||
paper=False: live trading — not yet implemented (requires secrets vault).
|
||||
|
||||
Full live-data feed streaming for forward testing is TBD (requires a live bar
|
||||
source). This module establishes the interface and stubs the runtime loop.
|
||||
Each activated strategy runs in its own subprocess from a git worktree,
|
||||
ensuring the production version is isolated from edits in the working tree.
|
||||
Events (fills, PnL updates, errors) flow via ZMQ PUSH/PULL to EventPublisher.
|
||||
"""
|
||||
|
||||
import json
|
||||
@@ -18,10 +17,6 @@ from typing import Any
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
# Registry of active strategies: {strategy_name → runtime state dict}
|
||||
# In a future implementation this will hold live strategy runners.
|
||||
_active_strategies: dict[str, dict] = {}
|
||||
|
||||
|
||||
async def activate_strategy(
|
||||
strategy_name: str,
|
||||
@@ -34,16 +29,14 @@ async def activate_strategy(
|
||||
|
||||
Args:
|
||||
strategy_name: Display name as saved via python_write("strategy", ...)
|
||||
feeds: List of feed dicts, e.g. [{"symbol": "BTC/USDT.BINANCE", "period_seconds": 3600}]
|
||||
feeds: List of feed dicts: [{"symbol": "BTC/USDT.BINANCE", "period_seconds": 3600}]
|
||||
allocation: Capital allocated in quote currency (e.g. 5000.0 USDT)
|
||||
paper: True = paper/simulated fills (default); False = live execution
|
||||
paper: True = paper/simulated fills (default); False = live (not yet implemented)
|
||||
|
||||
Returns:
|
||||
list[TextContent] with JSON:
|
||||
{"status": "activated", "strategy_name": str, "paper": bool, "allocation": float}
|
||||
|
||||
On error:
|
||||
{"error": str}
|
||||
{"status": "activated", "strategy_name": str, "paper": bool, "allocation": float,
|
||||
"git_revision": str, "pid": int}
|
||||
"""
|
||||
from mcp.types import TextContent
|
||||
|
||||
@@ -51,87 +44,45 @@ async def activate_strategy(
|
||||
log.error("activate_strategy '%s': %s", strategy_name, msg)
|
||||
return [TextContent(type="text", text=json.dumps({"error": msg}))]
|
||||
|
||||
if strategy_name in _active_strategies:
|
||||
if not paper:
|
||||
return _err(
|
||||
f"Strategy '{strategy_name}' is already active. "
|
||||
"Call deactivate_strategy first."
|
||||
"Live trading (paper=False) requires the user secrets vault, "
|
||||
"which is not yet implemented. Use paper=True for paper forward testing."
|
||||
)
|
||||
|
||||
if not paper:
|
||||
# Live execution requires the user secrets vault for API keys.
|
||||
# The vault is not yet implemented.
|
||||
try:
|
||||
from dexorder.secrets_vault import SecretsVault
|
||||
_vault = SecretsVault()
|
||||
_vault.get_secret("__probe__") # will raise NotImplementedError
|
||||
except NotImplementedError:
|
||||
return _err(
|
||||
"Live trading (paper=False) requires the user secrets vault, "
|
||||
"which is not yet implemented. Use paper=True for paper forward testing."
|
||||
)
|
||||
|
||||
# Validate feeds
|
||||
if not feeds:
|
||||
return _err("feeds list is empty")
|
||||
|
||||
parsed_feeds: list[tuple[str, int]] = []
|
||||
for f in feeds:
|
||||
sym = f.get("symbol", "")
|
||||
ps = f.get("period_seconds", 3600)
|
||||
if not sym:
|
||||
if not f.get("symbol"):
|
||||
return _err(f"Feed entry missing 'symbol': {f}")
|
||||
parsed_feeds.append((sym, int(ps)))
|
||||
|
||||
# TODO: Full implementation — start a live/paper trading loop:
|
||||
# 1. Load strategy class from category files
|
||||
# 2. Set up custom indicators via _setup_custom_indicators()
|
||||
# 3. Subscribe to live bar stream for each feed
|
||||
# 4. Initialize paper account (Nautilus SimulatedExchange) or live account
|
||||
# 5. Run strategy event loop (on_bar → evaluate → submit orders)
|
||||
# This requires a live data feed adapter (TBD).
|
||||
try:
|
||||
from dexorder.strategy.lifecycle import get_strategy_lifecycle
|
||||
lifecycle = get_strategy_lifecycle()
|
||||
result = await lifecycle.activate(
|
||||
strategy_name=strategy_name,
|
||||
feeds=feeds,
|
||||
allocation=allocation,
|
||||
paper=paper,
|
||||
)
|
||||
except Exception as exc:
|
||||
log.exception("activate_strategy: lifecycle activation failed")
|
||||
return _err(f"Activation failed: {exc}")
|
||||
|
||||
log.info(
|
||||
"activate_strategy: registering '%s' (paper=%s, allocation=%.2f) — "
|
||||
"live feed loop is TBD",
|
||||
strategy_name, paper, allocation,
|
||||
)
|
||||
if "error" in result:
|
||||
return _err(result["error"])
|
||||
|
||||
_active_strategies[strategy_name] = {
|
||||
"strategy_name": strategy_name,
|
||||
"feeds": [{"symbol": t, "period_seconds": p} for t, p in parsed_feeds],
|
||||
"allocation": allocation,
|
||||
"paper": paper,
|
||||
"status": "registered",
|
||||
"pnl": 0.0,
|
||||
}
|
||||
|
||||
payload = {
|
||||
"status": "activated",
|
||||
"strategy_name": strategy_name,
|
||||
"paper": paper,
|
||||
"allocation": allocation,
|
||||
"feeds": [{"symbol": t, "period_seconds": p} for t, p in parsed_feeds],
|
||||
"note": (
|
||||
"Strategy registered. Live data feed streaming is not yet implemented — "
|
||||
"forward trading will begin when the live feed adapter is available."
|
||||
),
|
||||
}
|
||||
return [TextContent(type="text", text=json.dumps(payload))]
|
||||
return [TextContent(type="text", text=json.dumps(result))]
|
||||
|
||||
|
||||
async def deactivate_strategy(strategy_name: str) -> list:
|
||||
"""
|
||||
Deactivate a running strategy and return its final P&L summary.
|
||||
|
||||
Args:
|
||||
strategy_name: Display name of the active strategy
|
||||
|
||||
Returns:
|
||||
list[TextContent] with JSON:
|
||||
{"status": "deactivated", "strategy_name": str, "final_pnl": float}
|
||||
|
||||
On error:
|
||||
{"error": str}
|
||||
"""
|
||||
from mcp.types import TextContent
|
||||
|
||||
@@ -139,35 +90,36 @@ async def deactivate_strategy(strategy_name: str) -> list:
|
||||
log.error("deactivate_strategy '%s': %s", strategy_name, msg)
|
||||
return [TextContent(type="text", text=json.dumps({"error": msg}))]
|
||||
|
||||
if strategy_name not in _active_strategies:
|
||||
return _err(f"Strategy '{strategy_name}' is not active")
|
||||
try:
|
||||
from dexorder.strategy.lifecycle import get_strategy_lifecycle
|
||||
lifecycle = get_strategy_lifecycle()
|
||||
result = await lifecycle.deactivate(strategy_name)
|
||||
except Exception as exc:
|
||||
log.exception("deactivate_strategy: failed")
|
||||
return _err(f"Deactivation failed: {exc}")
|
||||
|
||||
state = _active_strategies.pop(strategy_name)
|
||||
if "error" in result:
|
||||
return _err(result["error"])
|
||||
|
||||
# TODO: Stop the live feed loop and collect final P&L from the running engine.
|
||||
final_pnl = state.get("pnl", 0.0)
|
||||
|
||||
log.info("deactivate_strategy: stopped '%s', final_pnl=%.4f", strategy_name, final_pnl)
|
||||
|
||||
payload = {
|
||||
"status": "deactivated",
|
||||
"strategy_name": strategy_name,
|
||||
"final_pnl": final_pnl,
|
||||
}
|
||||
return [TextContent(type="text", text=json.dumps(payload))]
|
||||
return [TextContent(type="text", text=json.dumps(result))]
|
||||
|
||||
|
||||
async def list_active_strategies() -> list:
|
||||
"""
|
||||
Return a list of currently active strategies and their status.
|
||||
Return a list of currently active strategies with PnL state.
|
||||
|
||||
Returns:
|
||||
list[TextContent] with JSON:
|
||||
{"active_strategies": [{strategy_name, paper, allocation, feeds, pnl}, ...]}
|
||||
{"active_strategies": [{strategy_name, paper, allocation, feeds, realized_pnl, ...}]}
|
||||
"""
|
||||
from mcp.types import TextContent
|
||||
|
||||
payload = {
|
||||
"active_strategies": list(_active_strategies.values()),
|
||||
}
|
||||
return [TextContent(type="text", text=json.dumps(payload))]
|
||||
try:
|
||||
from dexorder.strategy.lifecycle import get_strategy_lifecycle
|
||||
lifecycle = get_strategy_lifecycle()
|
||||
active = await lifecycle.list_active()
|
||||
except Exception as exc:
|
||||
log.exception("list_active_strategies: failed")
|
||||
active = []
|
||||
|
||||
return [TextContent(type="text", text=json.dumps({"active_strategies": active}))]
|
||||
|
||||
@@ -15,7 +15,11 @@ from typing import Any
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
# All OHLC+ columns to request from the DataAPI
|
||||
_OHLC_EXTRA_COLUMNS = ["volume", "buy_vol", "sell_vol", "open_interest"]
|
||||
_OHLC_EXTRA_COLUMNS = [
|
||||
"volume", "buy_vol", "sell_vol",
|
||||
"open_time", "high_time", "low_time", "close_time",
|
||||
"open_interest",
|
||||
]
|
||||
|
||||
|
||||
async def backtest_strategy(
|
||||
@@ -153,11 +157,11 @@ async def backtest_strategy(
|
||||
|
||||
# --- 7. Return results ---
|
||||
payload = {
|
||||
"strategy_name": strategy_name,
|
||||
"feeds": [{"symbol": t, "period_seconds": p} for t, p in parsed_feeds],
|
||||
"strategy_name": strategy_name,
|
||||
"feeds": [{"symbol": t, "period_seconds": p} for t, p in parsed_feeds],
|
||||
"initial_capital": initial_capital,
|
||||
"paper": paper,
|
||||
"total_candles": total_candles,
|
||||
**metrics,
|
||||
**metrics, # keys: summary, statistics, trades, equity_curve
|
||||
}
|
||||
return [TextContent(type="text", text=json.dumps(payload))]
|
||||
|
||||
@@ -2,9 +2,10 @@
|
||||
"""
|
||||
Indicator harness — tests a custom indicator against synthetic OHLC data.
|
||||
|
||||
Runs in a subprocess so the indicator code is isolated from the MCP server process.
|
||||
Can be called in-process (preferred) via run() or as a subprocess for backward
|
||||
compatibility.
|
||||
|
||||
Usage: python indicator_harness.py <impl_path> <metadata_path>
|
||||
Usage (subprocess): python indicator_harness.py <impl_path> <metadata_path>
|
||||
|
||||
Outputs JSON to stdout:
|
||||
{
|
||||
@@ -21,7 +22,7 @@ import traceback
|
||||
import types
|
||||
from pathlib import Path
|
||||
|
||||
# Ensure dexorder package is importable (same as research_harness.py)
|
||||
# Ensure dexorder package is importable when run as a subprocess
|
||||
sys.path.insert(0, str(Path(__file__).resolve().parent.parent.parent))
|
||||
|
||||
|
||||
@@ -84,13 +85,15 @@ def summarize(result, n: int) -> str:
|
||||
return f"Unexpected return type: {type(result).__name__}"
|
||||
|
||||
|
||||
def main():
|
||||
if len(sys.argv) < 3:
|
||||
print(json.dumps({"success": False, "error": "Usage: indicator_harness.py <impl_path> <metadata_path>"}))
|
||||
sys.exit(1)
|
||||
def run(impl_path: Path, metadata_path: Path) -> dict:
|
||||
"""
|
||||
Run an indicator against synthetic OHLC data and return results.
|
||||
|
||||
impl_path = sys.argv[1]
|
||||
metadata_path = sys.argv[2]
|
||||
Returns:
|
||||
dict with success, output, error fields
|
||||
"""
|
||||
impl_path = Path(impl_path)
|
||||
metadata_path = Path(metadata_path)
|
||||
|
||||
# --- Load metadata ---
|
||||
input_series = ["close"]
|
||||
@@ -107,34 +110,32 @@ def main():
|
||||
# bare value (legacy)
|
||||
parameters[pname] = pinfo
|
||||
except Exception as e:
|
||||
print(json.dumps({"success": False, "error": f"Failed to read metadata: {e}"}))
|
||||
sys.exit(0)
|
||||
return {"success": False, "error": f"Failed to read metadata: {e}"}
|
||||
|
||||
# --- Generate synthetic data ---
|
||||
try:
|
||||
import numpy # noqa: F401 — verify numpy available
|
||||
import pandas as pd
|
||||
except ImportError as e:
|
||||
print(json.dumps({"success": False, "error": f"Missing required package: {e}"}))
|
||||
sys.exit(0)
|
||||
return {"success": False, "error": f"Missing required package: {e}"}
|
||||
|
||||
df = make_synthetic_ohlcv(n=200)
|
||||
n = len(df)
|
||||
|
||||
# --- Load implementation ---
|
||||
# Clear from sys.modules first so edits are picked up
|
||||
module_name = f"_dexorder_indicator_{impl_path.parent.name}"
|
||||
sys.modules.pop(module_name, None)
|
||||
try:
|
||||
spec = importlib.util.spec_from_file_location("_indicator_impl", impl_path)
|
||||
spec = importlib.util.spec_from_file_location(module_name, impl_path)
|
||||
module = importlib.util.module_from_spec(spec)
|
||||
spec.loader.exec_module(module) # type: ignore[union-attr]
|
||||
except Exception:
|
||||
tb = traceback.format_exc()
|
||||
print(json.dumps({"success": False, "error": f"Import failed:\n{tb}"}))
|
||||
sys.exit(0)
|
||||
return {"success": False, "error": f"Import failed:\n{tb}"}
|
||||
|
||||
# --- Find the indicator function ---
|
||||
# Prefer a function whose name matches the sanitized directory name,
|
||||
# fall back to the first public function in the module.
|
||||
fn_name = os.path.basename(os.path.dirname(impl_path)).lower()
|
||||
fn_name = impl_path.parent.name.lower()
|
||||
fn = getattr(module, fn_name, None)
|
||||
if fn is None:
|
||||
candidates = [
|
||||
@@ -144,15 +145,13 @@ def main():
|
||||
fn = candidates[0] if candidates else None
|
||||
|
||||
if fn is None:
|
||||
print(json.dumps({"success": False, "error": "No callable function found in implementation.py"}))
|
||||
sys.exit(0)
|
||||
return {"success": False, "error": "No callable function found in implementation.py"}
|
||||
|
||||
# --- Build positional args from input_series ---
|
||||
args = []
|
||||
for col in input_series:
|
||||
if col not in df.columns:
|
||||
print(json.dumps({"success": False, "error": f"input_series '{col}' not in synthetic df columns {list(df.columns)}"}))
|
||||
sys.exit(0)
|
||||
return {"success": False, "error": f"input_series '{col}' not in synthetic df columns {list(df.columns)}"}
|
||||
args.append(df[col])
|
||||
|
||||
# --- Execute ---
|
||||
@@ -160,22 +159,29 @@ def main():
|
||||
result = fn(*args, **parameters)
|
||||
except Exception:
|
||||
tb = traceback.format_exc()
|
||||
print(json.dumps({"success": False, "error": f"Execution failed:\n{tb}"}))
|
||||
sys.exit(0)
|
||||
return {"success": False, "error": f"Execution failed:\n{tb}"}
|
||||
|
||||
# --- Validate output type ---
|
||||
if not isinstance(result, (pd.Series, pd.DataFrame)):
|
||||
print(json.dumps({
|
||||
return {
|
||||
"success": False,
|
||||
"error": (
|
||||
f"Indicator must return pd.Series or pd.DataFrame, "
|
||||
f"got {type(result).__name__}. "
|
||||
"Wrap the output if using pandas-ta internally."
|
||||
),
|
||||
}))
|
||||
sys.exit(0)
|
||||
}
|
||||
|
||||
print(json.dumps({"success": True, "output": summarize(result, n)}))
|
||||
return {"success": True, "output": summarize(result, n)}
|
||||
|
||||
|
||||
def main():
|
||||
if len(sys.argv) < 3:
|
||||
print(json.dumps({"success": False, "error": "Usage: indicator_harness.py <impl_path> <metadata_path>"}))
|
||||
sys.exit(1)
|
||||
|
||||
result = run(Path(sys.argv[1]), Path(sys.argv[2]))
|
||||
print(json.dumps(result))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -18,11 +18,13 @@ After write/edit operations, a category-specific test harness runs to validate
|
||||
the code and capture errors/output for agent feedback.
|
||||
"""
|
||||
|
||||
import concurrent.futures
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
import subprocess
|
||||
import sys
|
||||
import traceback
|
||||
from dataclasses import dataclass, asdict
|
||||
from enum import Enum
|
||||
from pathlib import Path
|
||||
@@ -30,16 +32,37 @@ from typing import Any, Optional
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
# Path to the harness scripts (written to disk, not inline)
|
||||
_RESEARCH_HARNESS = Path(__file__).parent / "research_harness.py"
|
||||
_INDICATOR_HARNESS = Path(__file__).parent / "indicator_harness.py"
|
||||
|
||||
# Import conda manager for package installation
|
||||
def _run_inprocess(fn, *args, timeout: int) -> dict:
|
||||
"""
|
||||
Run fn(*args) in a one-shot thread and return its result dict.
|
||||
|
||||
Uses a thread so the calling coroutine is not blocked and the calling
|
||||
process does not fork a new Python interpreter. All already-loaded
|
||||
libraries (numpy, pandas, matplotlib, etc.) are shared with the thread.
|
||||
|
||||
On timeout returns a dict with _timeout=True. On unexpected exception
|
||||
returns a dict with error=True and the traceback in stderr.
|
||||
"""
|
||||
with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
|
||||
future = executor.submit(fn, *args)
|
||||
try:
|
||||
return future.result(timeout=timeout)
|
||||
except concurrent.futures.TimeoutError:
|
||||
return {"_timeout": True, "error": True,
|
||||
"stdout": "", "stderr": "", "images": []}
|
||||
except Exception:
|
||||
return {"error": True, "stdout": "",
|
||||
"stderr": traceback.format_exc(), "images": []}
|
||||
|
||||
|
||||
# Import conda manager for package installation and tracking
|
||||
try:
|
||||
from dexorder.conda_manager import install_packages
|
||||
from dexorder.conda_manager import install_packages, cleanup_extra_packages
|
||||
except ImportError:
|
||||
log.warning("conda_manager not available - package installation disabled")
|
||||
install_packages = None
|
||||
cleanup_extra_packages = None
|
||||
|
||||
|
||||
# =============================================================================
|
||||
@@ -289,6 +312,49 @@ class GitManager:
|
||||
except subprocess.CalledProcessError as e:
|
||||
raise RuntimeError(e.stderr.strip()) from e
|
||||
|
||||
def head_short_hash(self) -> str:
|
||||
"""Return the short hash of HEAD, or 'unknown' on error."""
|
||||
try:
|
||||
result = self._run("rev-parse", "--short", "HEAD")
|
||||
return result.stdout.strip()
|
||||
except Exception:
|
||||
return "unknown"
|
||||
|
||||
def create_worktree(self, worktree_path: Path, revision: str = "HEAD") -> str:
|
||||
"""
|
||||
Create a git worktree at worktree_path pinned to revision.
|
||||
|
||||
Returns the short hash of the checked-out commit.
|
||||
"""
|
||||
worktree_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
try:
|
||||
self._run("worktree", "add", "--detach", str(worktree_path), revision)
|
||||
# Get short hash of the worktree's HEAD
|
||||
result = subprocess.run(
|
||||
["git", "rev-parse", "--short", "HEAD"],
|
||||
cwd=str(worktree_path),
|
||||
capture_output=True,
|
||||
text=True,
|
||||
check=True,
|
||||
)
|
||||
return result.stdout.strip()
|
||||
except subprocess.CalledProcessError as e:
|
||||
raise RuntimeError(f"git worktree add failed: {e.stderr.strip()}") from e
|
||||
|
||||
def remove_worktree(self, worktree_path: Path) -> None:
|
||||
"""Remove a git worktree, silently ignoring errors if it no longer exists."""
|
||||
try:
|
||||
self._run("worktree", "remove", "--force", str(worktree_path), check=False)
|
||||
except Exception as e:
|
||||
log.warning("git worktree remove failed (non-fatal): %s", e)
|
||||
|
||||
def prune_worktrees(self) -> None:
|
||||
"""Prune stale worktree references."""
|
||||
try:
|
||||
self._run("worktree", "prune", check=False)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Custom Indicator Setup
|
||||
@@ -733,7 +799,7 @@ class CategoryFileManager:
|
||||
conda_packages = metadata.get("conda_packages", [])
|
||||
if conda_packages:
|
||||
log.info(f"Installing packages for validation: {conda_packages}")
|
||||
install_result = install_packages(conda_packages)
|
||||
install_result = install_packages(conda_packages, data_dir=self.data_dir)
|
||||
if install_result.get("success"):
|
||||
packages_installed = install_result.get("installed", [])
|
||||
if packages_installed:
|
||||
@@ -761,48 +827,49 @@ class CategoryFileManager:
|
||||
|
||||
def _validate_strategy(self, impl_path: Path) -> dict[str, Any]:
|
||||
"""
|
||||
Validate a strategy implementation.
|
||||
Validate a strategy by running it against synthetic OHLC data.
|
||||
|
||||
Runs basic syntax check and imports.
|
||||
Runs strategy_harness.py in-process via a thread. Catches import errors,
|
||||
runtime errors in evaluate(), and wrong class hierarchy — not just syntax.
|
||||
"""
|
||||
try:
|
||||
result = subprocess.run(
|
||||
[sys.executable, "-m", "py_compile", str(impl_path)],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=10,
|
||||
)
|
||||
meta_path = impl_path.parent / "metadata.json"
|
||||
return self._execute_strategy(impl_path.parent, timeout=45)
|
||||
|
||||
if result.returncode == 0:
|
||||
return {
|
||||
"success": True,
|
||||
"output": "Strategy syntax valid",
|
||||
}
|
||||
else:
|
||||
return {
|
||||
"success": False,
|
||||
"output": result.stderr,
|
||||
"error": "Syntax error in strategy",
|
||||
}
|
||||
except subprocess.TimeoutExpired:
|
||||
return {"success": False, "error": "Validation timeout"}
|
||||
except Exception as e:
|
||||
return {"success": False, "error": f"Validation failed: {e}"}
|
||||
def _execute_strategy(self, item_dir: Path, timeout: int = 45) -> dict[str, Any]:
|
||||
"""
|
||||
Run a strategy against synthetic OHLC data in-process via a thread.
|
||||
|
||||
Returns:
|
||||
dict with success, output (human-readable summary), trade_count, error
|
||||
"""
|
||||
impl_path = item_dir / "implementation.py"
|
||||
meta_path = item_dir / "metadata.json"
|
||||
|
||||
if not impl_path.exists():
|
||||
return {"success": False, "error": "implementation.py not found"}
|
||||
if not meta_path.exists():
|
||||
return {"success": False, "error": "metadata.json not found"}
|
||||
|
||||
from dexorder.tools.strategy_harness import run as _strategy_run
|
||||
result = _run_inprocess(_strategy_run, impl_path, meta_path, timeout=timeout)
|
||||
|
||||
if result.get("_timeout"):
|
||||
return {"success": False, "error": f"Strategy test timed out after {timeout}s"}
|
||||
return result
|
||||
|
||||
def _validate_indicator(self, impl_path: Path) -> dict[str, Any]:
|
||||
"""
|
||||
Validate an indicator by running it against synthetic OHLC data.
|
||||
|
||||
Uses indicator_harness.py in a subprocess so the indicator code is
|
||||
isolated from the MCP server process. Catches import errors, runtime
|
||||
errors, and wrong return types — not just syntax.
|
||||
Runs indicator_harness.py in-process via a thread. Catches import errors,
|
||||
runtime errors, and wrong return types — not just syntax.
|
||||
"""
|
||||
meta_path = impl_path.parent / "metadata.json"
|
||||
return self._execute_indicator(impl_path.parent, timeout=30)
|
||||
|
||||
def _execute_indicator(self, item_dir: Path, timeout: int = 30) -> dict[str, Any]:
|
||||
"""
|
||||
Run an indicator against synthetic OHLC data via indicator_harness.py.
|
||||
Run an indicator against synthetic OHLC data in-process via a thread.
|
||||
|
||||
Returns:
|
||||
dict with success, output (human-readable summary), error
|
||||
@@ -815,77 +882,22 @@ class CategoryFileManager:
|
||||
if not meta_path.exists():
|
||||
return {"success": False, "error": "metadata.json not found"}
|
||||
|
||||
try:
|
||||
result = subprocess.run(
|
||||
[sys.executable, str(_INDICATOR_HARNESS), str(impl_path), str(meta_path)],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=timeout,
|
||||
cwd=str(item_dir),
|
||||
)
|
||||
except subprocess.TimeoutExpired:
|
||||
from dexorder.tools.indicator_harness import run as _indicator_run
|
||||
result = _run_inprocess(_indicator_run, impl_path, meta_path, timeout=timeout)
|
||||
|
||||
if result.get("_timeout"):
|
||||
return {"success": False, "error": f"Indicator test timed out after {timeout}s"}
|
||||
except Exception as e:
|
||||
return {"success": False, "error": f"Harness launch failed: {e}"}
|
||||
return result
|
||||
|
||||
if result.returncode != 0:
|
||||
return {
|
||||
"success": False,
|
||||
"error": f"Harness process failed:\n{result.stderr}",
|
||||
}
|
||||
|
||||
try:
|
||||
data = json.loads(result.stdout)
|
||||
except json.JSONDecodeError:
|
||||
return {
|
||||
"success": False,
|
||||
"error": f"Harness produced invalid JSON:\n{result.stdout[:500]}",
|
||||
}
|
||||
|
||||
return data
|
||||
|
||||
def _run_research_harness(self, impl_path: Path, item_dir: Path, timeout: int = 30) -> dict[str, Any]:
|
||||
def _run_research_harness(self, impl_path: Path, item_dir: Path, timeout: int = 300) -> dict[str, Any]:
|
||||
"""
|
||||
Run a research script via the on-disk harness and return parsed results.
|
||||
|
||||
The harness (research_harness.py) handles API initialization, stdout/stderr
|
||||
capture, matplotlib figure capture, and outputs JSON to stdout.
|
||||
Run a research script in-process via a thread and return captured results.
|
||||
|
||||
Returns:
|
||||
dict with stdout, stderr, images, error fields — or an error dict.
|
||||
"""
|
||||
try:
|
||||
result = subprocess.run(
|
||||
[sys.executable, str(_RESEARCH_HARNESS), str(impl_path)],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=timeout,
|
||||
cwd=str(item_dir),
|
||||
)
|
||||
|
||||
if result.returncode == 0:
|
||||
try:
|
||||
return json.loads(result.stdout)
|
||||
except json.JSONDecodeError:
|
||||
return {
|
||||
"stdout": result.stdout,
|
||||
"stderr": result.stderr,
|
||||
"images": [],
|
||||
"error": True,
|
||||
}
|
||||
else:
|
||||
# Harness itself failed (import error, bad args, etc.)
|
||||
return {
|
||||
"stdout": "",
|
||||
"stderr": result.stderr,
|
||||
"images": [],
|
||||
"error": True,
|
||||
}
|
||||
except subprocess.TimeoutExpired:
|
||||
return {"stdout": "", "stderr": "", "images": [], "error": True,
|
||||
"_timeout": True}
|
||||
except Exception as e:
|
||||
return {"stdout": "", "stderr": str(e), "images": [], "error": True}
|
||||
from dexorder.tools.research_harness import run as _research_run
|
||||
return _run_inprocess(_research_run, impl_path, item_dir, timeout=timeout)
|
||||
|
||||
def _validate_research(self, impl_path: Path, item_dir: Path) -> dict[str, Any]:
|
||||
"""
|
||||
@@ -893,7 +905,7 @@ class CategoryFileManager:
|
||||
|
||||
Runs the script via the harness and captures output + pyplot images.
|
||||
"""
|
||||
data = self._run_research_harness(impl_path, item_dir, timeout=30)
|
||||
data = self._run_research_harness(impl_path, item_dir, timeout=300)
|
||||
|
||||
if data.get("_timeout"):
|
||||
return {"success": False, "error": "Research script timeout"}
|
||||
@@ -983,6 +995,48 @@ class CategoryFileManager:
|
||||
return {"content": content}
|
||||
|
||||
|
||||
def delete(self, category: str, name: str) -> dict[str, Any]:
|
||||
"""
|
||||
Delete a category script directory and commit the removal to git.
|
||||
|
||||
Args:
|
||||
category: Category name (strategy, indicator, research)
|
||||
name: Display name of the item to delete
|
||||
|
||||
Returns:
|
||||
dict with:
|
||||
- success: bool
|
||||
- category: str
|
||||
- name: str
|
||||
- revision: str - git commit hash of the deletion commit
|
||||
- error: str (if any)
|
||||
"""
|
||||
import shutil
|
||||
|
||||
try:
|
||||
cat = Category(category)
|
||||
except ValueError:
|
||||
return {
|
||||
"success": False,
|
||||
"error": f"Invalid category '{category}'. Must be one of: {', '.join(c.value for c in Category)}"
|
||||
}
|
||||
|
||||
item_dir = get_category_path(self.src_dir, cat, name)
|
||||
if not item_dir.exists():
|
||||
return {"success": False, "error": f"{category} '{name}' not found"}
|
||||
|
||||
try:
|
||||
shutil.rmtree(item_dir)
|
||||
log.info(f"Deleted {cat.value}: {item_dir}")
|
||||
except Exception as e:
|
||||
return {"success": False, "error": f"Failed to delete: {e}"}
|
||||
|
||||
commit_hash = self.git.commit(f"delete({category}): {name}")
|
||||
result: dict[str, Any] = {"success": True, "category": category, "name": name}
|
||||
if commit_hash:
|
||||
result["revision"] = commit_hash
|
||||
return result
|
||||
|
||||
def git_log(
|
||||
self,
|
||||
category: Optional[str] = None,
|
||||
|
||||
@@ -1,13 +1,12 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Research script harness - runs implementation.py in a subprocess with API
|
||||
initialization, stdout/stderr capture, and matplotlib figure capture.
|
||||
Research script harness - runs implementation.py with API initialization,
|
||||
stdout/stderr capture, and matplotlib figure capture.
|
||||
|
||||
This file is written to disk and invoked by python_tools.py rather than
|
||||
being passed inline via `python -c`, so the harness code is inspectable and
|
||||
not regenerated on every call.
|
||||
Can be called in-process (preferred) via run() or as a subprocess for backward
|
||||
compatibility.
|
||||
|
||||
Usage:
|
||||
Usage (subprocess):
|
||||
python -m dexorder.tools.research_harness <implementation_path>
|
||||
|
||||
Output (JSON to stdout):
|
||||
@@ -19,73 +18,148 @@ Output (JSON to stdout):
|
||||
}
|
||||
"""
|
||||
|
||||
import sys
|
||||
import io
|
||||
import os
|
||||
import base64
|
||||
import json
|
||||
import sys
|
||||
import traceback
|
||||
from pathlib import Path
|
||||
|
||||
# Non-interactive matplotlib backend (must be set before importing pyplot)
|
||||
# Non-interactive matplotlib backend (must be set before importing pyplot).
|
||||
# Idempotent — safe to call multiple times.
|
||||
import matplotlib
|
||||
matplotlib.use('Agg')
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
# Ensure dexorder package is importable
|
||||
# Ensure dexorder package is importable when run as a subprocess
|
||||
sys.path.insert(0, str(Path(__file__).resolve().parent.parent.parent))
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Initialize API from config files so research scripts can call get_api()
|
||||
# ---------------------------------------------------------------------------
|
||||
try:
|
||||
import yaml
|
||||
|
||||
config_path = os.environ.get("CONFIG_PATH", "/app/config/config.yaml")
|
||||
secrets_path = os.environ.get("SECRETS_PATH", "/app/config/secrets.yaml")
|
||||
def run(impl_path: Path, item_dir: Path) -> dict:
|
||||
"""
|
||||
Run a research script in-process and return captured results.
|
||||
|
||||
config_data = {}
|
||||
secrets_data = {}
|
||||
if Path(config_path).exists():
|
||||
with open(config_path) as f:
|
||||
config_data = yaml.safe_load(f) or {}
|
||||
if Path(secrets_path).exists():
|
||||
with open(secrets_path) as f:
|
||||
secrets_data = yaml.safe_load(f) or {}
|
||||
Creates a fresh DataAPIImpl per call (thread-safe: API stored in thread-local
|
||||
via set_api() so the global API is not overwritten).
|
||||
|
||||
data_cfg = config_data.get("data", {})
|
||||
iceberg_cfg = data_cfg.get("iceberg", {})
|
||||
relay_cfg = data_cfg.get("relay", {})
|
||||
Returns:
|
||||
dict with stdout, stderr, images, error fields
|
||||
"""
|
||||
impl_path = Path(impl_path)
|
||||
|
||||
from dexorder.api import set_api, API
|
||||
from dexorder.impl.charting_api_impl import ChartingAPIImpl
|
||||
from dexorder.impl.data_api_impl import DataAPIImpl
|
||||
if not impl_path.exists():
|
||||
return {
|
||||
"stdout": "",
|
||||
"stderr": f"Implementation file not found: {impl_path}",
|
||||
"images": [],
|
||||
"error": True,
|
||||
}
|
||||
|
||||
_data_api = DataAPIImpl(
|
||||
iceberg_catalog_uri=iceberg_cfg.get("catalog_uri", "http://iceberg-catalog:8181"),
|
||||
relay_endpoint=relay_cfg.get("endpoint", "tcp://relay:5559"),
|
||||
notification_endpoint=relay_cfg.get("notification_endpoint", "tcp://relay:5558"),
|
||||
namespace=iceberg_cfg.get("namespace", "trading"),
|
||||
s3_endpoint=iceberg_cfg.get("s3_endpoint") or secrets_data.get("s3_endpoint"),
|
||||
s3_access_key=iceberg_cfg.get("s3_access_key") or secrets_data.get("s3_access_key"),
|
||||
s3_secret_key=iceberg_cfg.get("s3_secret_key") or secrets_data.get("s3_secret_key"),
|
||||
)
|
||||
# NOTE: We intentionally do NOT call asyncio.run(_data_api.start()) here.
|
||||
# DataAPIImpl.historical_ohlc() auto-starts on first use, which ensures the
|
||||
# ZMQ context and notification listener are created inside the user's own
|
||||
# asyncio.run() event loop — avoiding cross-loop lifecycle issues.
|
||||
set_api(API(charting=ChartingAPIImpl(), data=_data_api))
|
||||
except Exception as e:
|
||||
print(f"WARNING: API initialization failed: {e}", file=sys.stderr)
|
||||
# ---------------------------------------------------------------------------
|
||||
# Initialize a fresh API instance for this execution (thread-local)
|
||||
# ---------------------------------------------------------------------------
|
||||
try:
|
||||
import yaml
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Register custom indicators so research scripts can use df.ta.my_indicator()
|
||||
# ---------------------------------------------------------------------------
|
||||
try:
|
||||
from dexorder.tools.python_tools import setup_custom_indicators
|
||||
_data_dir = Path(os.environ.get("DATA_DIR", "/app/data"))
|
||||
setup_custom_indicators(_data_dir)
|
||||
except Exception as e:
|
||||
print(f"WARNING: Custom indicator registration failed: {e}", file=sys.stderr)
|
||||
config_path = os.environ.get("CONFIG_PATH", "/app/config/config.yaml")
|
||||
secrets_path = os.environ.get("SECRETS_PATH", "/app/config/secrets.yaml")
|
||||
|
||||
config_data = {}
|
||||
secrets_data = {}
|
||||
if Path(config_path).exists():
|
||||
with open(config_path) as f:
|
||||
config_data = yaml.safe_load(f) or {}
|
||||
if Path(secrets_path).exists():
|
||||
with open(secrets_path) as f:
|
||||
secrets_data = yaml.safe_load(f) or {}
|
||||
|
||||
data_cfg = config_data.get("data", {})
|
||||
iceberg_cfg = data_cfg.get("iceberg", {})
|
||||
relay_cfg = data_cfg.get("relay", {})
|
||||
|
||||
from dexorder.api import set_api, API
|
||||
from dexorder.impl.charting_api_impl import ChartingAPIImpl
|
||||
from dexorder.impl.data_api_impl import DataAPIImpl
|
||||
|
||||
_data_api = DataAPIImpl(
|
||||
iceberg_catalog_uri=iceberg_cfg.get("catalog_uri", "http://iceberg-catalog:8181"),
|
||||
relay_endpoint=relay_cfg.get("endpoint", "tcp://relay:5559"),
|
||||
notification_endpoint=relay_cfg.get("notification_endpoint", "tcp://relay:5558"),
|
||||
namespace=iceberg_cfg.get("namespace", "trading"),
|
||||
s3_endpoint=iceberg_cfg.get("s3_endpoint") or secrets_data.get("s3_endpoint"),
|
||||
s3_access_key=iceberg_cfg.get("s3_access_key") or secrets_data.get("s3_access_key"),
|
||||
s3_secret_key=iceberg_cfg.get("s3_secret_key") or secrets_data.get("s3_secret_key"),
|
||||
s3_region=iceberg_cfg.get("s3_region") or secrets_data.get("s3_region"),
|
||||
request_timeout=240.0,
|
||||
)
|
||||
# NOTE: We intentionally do NOT call asyncio.run(_data_api.start()) here.
|
||||
# DataAPIImpl.historical_ohlc() auto-starts on first use, which ensures the
|
||||
# ZMQ context and notification listener are created inside the user's own
|
||||
# asyncio.run() event loop — avoiding cross-loop lifecycle issues.
|
||||
# In a harness thread, set_api() stores to thread-local (not the global).
|
||||
set_api(API(charting=ChartingAPIImpl(), data=_data_api))
|
||||
except Exception as e:
|
||||
# Non-fatal — script may not use the API
|
||||
sys.stderr.write(f"WARNING: API initialization failed: {e}\n")
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Register custom indicators
|
||||
# ---------------------------------------------------------------------------
|
||||
try:
|
||||
from dexorder.tools.python_tools import setup_custom_indicators
|
||||
_data_dir = Path(os.environ.get("DATA_DIR", "/app/data"))
|
||||
setup_custom_indicators(_data_dir)
|
||||
except Exception as e:
|
||||
sys.stderr.write(f"WARNING: Custom indicator registration failed: {e}\n")
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Execute user script with captured stdout/stderr
|
||||
# ---------------------------------------------------------------------------
|
||||
stdout_buf = io.StringIO()
|
||||
stderr_buf = io.StringIO()
|
||||
|
||||
error_occurred = False
|
||||
old_stdout, old_stderr = sys.stdout, sys.stderr
|
||||
old_cwd = os.getcwd()
|
||||
sys.stdout = stdout_buf
|
||||
sys.stderr = stderr_buf
|
||||
|
||||
try:
|
||||
os.chdir(impl_path.parent)
|
||||
exec(compile(impl_path.read_text(), str(impl_path), 'exec'), {})
|
||||
except Exception as e:
|
||||
print(f"ERROR: {e}", file=sys.stderr)
|
||||
traceback.print_exc(file=sys.stderr)
|
||||
error_occurred = True
|
||||
finally:
|
||||
sys.stdout = old_stdout
|
||||
sys.stderr = old_stderr
|
||||
os.chdir(old_cwd)
|
||||
|
||||
stdout_output = stdout_buf.getvalue()
|
||||
stderr_output = stderr_buf.getvalue()
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Capture matplotlib figures
|
||||
# ---------------------------------------------------------------------------
|
||||
images = []
|
||||
if not error_occurred:
|
||||
for fig_num in plt.get_fignums():
|
||||
fig = plt.figure(fig_num)
|
||||
buf = io.BytesIO()
|
||||
fig.savefig(buf, format='png', dpi=100, bbox_inches='tight')
|
||||
buf.seek(0)
|
||||
images.append({"format": "png", "data": base64.b64encode(buf.read()).decode('utf-8')})
|
||||
buf.close()
|
||||
plt.close('all')
|
||||
|
||||
return {
|
||||
"stdout": stdout_output,
|
||||
"stderr": stderr_output,
|
||||
"images": images,
|
||||
"error": error_occurred,
|
||||
}
|
||||
|
||||
|
||||
def main():
|
||||
@@ -94,55 +168,8 @@ def main():
|
||||
sys.exit(2)
|
||||
|
||||
impl_path = Path(sys.argv[1])
|
||||
if not impl_path.exists():
|
||||
print(json.dumps({
|
||||
"stdout": "",
|
||||
"stderr": f"Implementation file not found: {impl_path}",
|
||||
"images": [],
|
||||
"error": True,
|
||||
}))
|
||||
sys.exit(0)
|
||||
|
||||
# Capture stdout and stderr
|
||||
old_stdout = sys.stdout
|
||||
old_stderr = sys.stderr
|
||||
sys.stdout = io.StringIO()
|
||||
sys.stderr = io.StringIO()
|
||||
|
||||
error_occurred = False
|
||||
try:
|
||||
exec(compile(impl_path.read_text(), str(impl_path), 'exec'), {})
|
||||
except Exception as e:
|
||||
print(f"ERROR: {e}", file=sys.stderr)
|
||||
import traceback
|
||||
traceback.print_exc(file=sys.stderr)
|
||||
error_occurred = True
|
||||
|
||||
# Restore stdout/stderr
|
||||
stdout_output = sys.stdout.getvalue()
|
||||
stderr_output = sys.stderr.getvalue()
|
||||
sys.stdout = old_stdout
|
||||
sys.stderr = old_stderr
|
||||
|
||||
# Capture all matplotlib figures as base64 PNGs
|
||||
images = []
|
||||
for fig_num in plt.get_fignums():
|
||||
fig = plt.figure(fig_num)
|
||||
buf = io.BytesIO()
|
||||
fig.savefig(buf, format='png', dpi=100, bbox_inches='tight')
|
||||
buf.seek(0)
|
||||
img_b64 = base64.b64encode(buf.read()).decode('utf-8')
|
||||
images.append({"format": "png", "data": img_b64})
|
||||
buf.close()
|
||||
plt.close('all')
|
||||
|
||||
# Output results as JSON to real stdout
|
||||
result = {
|
||||
"stdout": stdout_output,
|
||||
"stderr": stderr_output,
|
||||
"images": images,
|
||||
"error": error_occurred,
|
||||
}
|
||||
item_dir = impl_path.parent
|
||||
result = run(impl_path, item_dir)
|
||||
print(json.dumps(result))
|
||||
|
||||
|
||||
|
||||
228
sandbox/dexorder/tools/strategy_harness.py
Normal file
228
sandbox/dexorder/tools/strategy_harness.py
Normal file
@@ -0,0 +1,228 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Strategy harness — validates a PandasStrategy against synthetic OHLC data.
|
||||
|
||||
Can be called in-process (preferred) via run() or as a subprocess for backward
|
||||
compatibility.
|
||||
|
||||
Usage (subprocess): python strategy_harness.py <impl_path> <metadata_path>
|
||||
|
||||
Outputs JSON to stdout:
|
||||
{
|
||||
"success": bool,
|
||||
"output": str, # human-readable summary on success
|
||||
"trade_count": int, # number of trades executed in the mini-backtest
|
||||
"error": str | null # error message / traceback if failed
|
||||
}
|
||||
"""
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import traceback
|
||||
from pathlib import Path
|
||||
|
||||
# Ensure dexorder package is importable when run as a subprocess
|
||||
sys.path.insert(0, str(Path(__file__).resolve().parent.parent.parent))
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Synthetic OHLCV data — 100 deterministic bars, no network required
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def make_synthetic_ohlcv(n: int = 100):
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
rng = np.random.default_rng(42)
|
||||
returns = rng.normal(0, 0.015, n)
|
||||
closes = 40_000.0 * np.cumprod(1.0 + returns)
|
||||
|
||||
opens = np.empty(n)
|
||||
opens[0] = closes[0]
|
||||
opens[1:] = closes[:-1]
|
||||
|
||||
noise = np.abs(rng.normal(0, 0.005, n))
|
||||
highs = np.maximum(opens, closes) * (1.0 + noise)
|
||||
lows = np.minimum(opens, closes) * (1.0 - noise)
|
||||
volumes = rng.uniform(1e6, 1e8, n)
|
||||
buy_vols = volumes * rng.uniform(0.4, 0.6, n)
|
||||
|
||||
now_ns = 1_700_000_000_000_000_000 # arbitrary epoch in nanoseconds
|
||||
step_ns = 3_600_000_000_000 # 1 hour in nanoseconds
|
||||
timestamps = [now_ns + i * step_ns for i in range(n)]
|
||||
|
||||
return pd.DataFrame({
|
||||
"timestamp": timestamps,
|
||||
"open": opens,
|
||||
"high": highs,
|
||||
"low": lows,
|
||||
"close": closes,
|
||||
"volume": volumes,
|
||||
"buy_vol": buy_vols,
|
||||
"sell_vol": volumes - buy_vols,
|
||||
"open_interest": rng.uniform(1e8, 1e9, n),
|
||||
})
|
||||
|
||||
|
||||
def run(impl_path: Path, metadata_path: Path) -> dict:
|
||||
"""
|
||||
Validate a strategy against synthetic OHLC data and return results.
|
||||
|
||||
Returns:
|
||||
dict with success, output, trade_count, error fields
|
||||
"""
|
||||
impl_path = Path(impl_path)
|
||||
metadata_path = Path(metadata_path)
|
||||
|
||||
# --- Load metadata (feeds, parameters) ---
|
||||
data_feeds: list[dict] = []
|
||||
parameters: dict = {}
|
||||
try:
|
||||
with open(metadata_path) as f:
|
||||
meta = json.load(f)
|
||||
data_feeds = meta.get("data_feeds") or []
|
||||
param_schema = meta.get("parameters") or {}
|
||||
for pname, pinfo in param_schema.items():
|
||||
if isinstance(pinfo, dict) and "default" in pinfo:
|
||||
parameters[pname] = pinfo["default"]
|
||||
elif not isinstance(pinfo, dict):
|
||||
parameters[pname] = pinfo
|
||||
except Exception as e:
|
||||
return {"success": False, "output": "", "trade_count": 0, "error": f"Failed to read metadata: {e}"}
|
||||
|
||||
# --- Build synthetic feed keys ---
|
||||
if data_feeds:
|
||||
feed_configs = [(f.get("symbol", "BTC/USDT.SYNTH"), int(f.get("period_seconds", 3600)))
|
||||
for f in data_feeds]
|
||||
else:
|
||||
feed_configs = [("BTC/USDT.SYNTH", 3600)]
|
||||
|
||||
# --- Register custom indicators ---
|
||||
try:
|
||||
from dexorder.tools.python_tools import setup_custom_indicators
|
||||
data_dir = Path(os.environ.get("DATA_DIR", "/app/data"))
|
||||
setup_custom_indicators(data_dir)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# --- Load strategy class ---
|
||||
try:
|
||||
from dexorder.nautilus.backtest_runner import _load_strategy_class
|
||||
strategy_class = _load_strategy_class(impl_path)
|
||||
except Exception:
|
||||
tb = traceback.format_exc()
|
||||
return {"success": False, "output": "", "trade_count": 0, "error": f"Strategy load failed:\n{tb}"}
|
||||
|
||||
# --- Run a minimal backtest with synthetic data ---
|
||||
try:
|
||||
import pandas as pd
|
||||
from dexorder.nautilus.pandas_strategy import PandasStrategyConfig, make_feed_key
|
||||
from dexorder.nautilus.backtest_runner import _setup_custom_indicators
|
||||
|
||||
try:
|
||||
data_dir = Path(os.environ.get("DATA_DIR", "/app/data"))
|
||||
_setup_custom_indicators(data_dir)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Build one synthetic DataFrame per feed
|
||||
feed_dfs: dict[str, pd.DataFrame] = {}
|
||||
for ticker, period_seconds in feed_configs:
|
||||
fk = make_feed_key(ticker, period_seconds)
|
||||
feed_dfs[fk] = make_synthetic_ohlcv(100)
|
||||
|
||||
feed_keys = tuple(make_feed_key(t, p) for t, p in feed_configs)
|
||||
config = PandasStrategyConfig(
|
||||
strategy_id=f"{strategy_class.__name__}-HARNESS",
|
||||
feed_keys=feed_keys,
|
||||
initial_capital=10_000.0,
|
||||
)
|
||||
|
||||
strat = strategy_class(config=config)
|
||||
|
||||
for pname, pval in parameters.items():
|
||||
if hasattr(strat, pname):
|
||||
setattr(strat, pname, pval)
|
||||
|
||||
# Replay bars: accumulate rows and call evaluate()
|
||||
buy_count = 0
|
||||
sell_count = 0
|
||||
evaluate_errors: list[str] = []
|
||||
rows_by_feed: dict[str, list] = {fk: [] for fk in feed_keys}
|
||||
|
||||
for i in range(len(next(iter(feed_dfs.values())))):
|
||||
for fk, df in feed_dfs.items():
|
||||
row = df.iloc[i].to_dict()
|
||||
rows_by_feed[fk].append(row)
|
||||
current_dfs = {k: pd.DataFrame(v) for k, v in rows_by_feed.items()}
|
||||
|
||||
_orig_buy = strat.buy
|
||||
_orig_sell = strat.sell
|
||||
_orig_flatten = strat.flatten
|
||||
|
||||
class _BuyCounter:
|
||||
def __call__(inner_self, *a, **kw):
|
||||
nonlocal buy_count
|
||||
buy_count += 1
|
||||
|
||||
class _SellCounter:
|
||||
def __call__(inner_self, *a, **kw):
|
||||
nonlocal sell_count
|
||||
sell_count += 1
|
||||
|
||||
strat.buy = _BuyCounter()
|
||||
strat.sell = _SellCounter()
|
||||
strat.flatten = lambda *a, **kw: None
|
||||
|
||||
try:
|
||||
strat.evaluate(current_dfs)
|
||||
except Exception as e:
|
||||
evaluate_errors.append(f"Bar {i}: {e}")
|
||||
if len(evaluate_errors) > 3:
|
||||
break
|
||||
finally:
|
||||
strat.buy = _orig_buy
|
||||
strat.sell = _orig_sell
|
||||
strat.flatten = _orig_flatten
|
||||
|
||||
if evaluate_errors and len(evaluate_errors) > 3:
|
||||
break
|
||||
|
||||
if evaluate_errors:
|
||||
return {
|
||||
"success": False,
|
||||
"output": "",
|
||||
"trade_count": 0,
|
||||
"error": "evaluate() raised errors:\n" + "\n".join(evaluate_errors[:3]),
|
||||
}
|
||||
|
||||
trade_count = buy_count + sell_count
|
||||
n_bars = len(next(iter(feed_dfs.values())))
|
||||
n_feeds = len(feed_dfs)
|
||||
output = (
|
||||
f"Strategy validated OK: {n_bars} bars × {n_feeds} feed(s), "
|
||||
f"buy_signals={buy_count}, sell_signals={sell_count}"
|
||||
)
|
||||
return {"success": True, "output": output, "trade_count": trade_count, "error": None}
|
||||
|
||||
except Exception:
|
||||
tb = traceback.format_exc()
|
||||
return {"success": False, "output": "", "trade_count": 0, "error": f"Harness execution failed:\n{tb}"}
|
||||
|
||||
|
||||
def main():
|
||||
if len(sys.argv) < 3:
|
||||
print(json.dumps({
|
||||
"success": False,
|
||||
"output": "",
|
||||
"trade_count": 0,
|
||||
"error": "Usage: strategy_harness.py <impl_path> <metadata_path>",
|
||||
}))
|
||||
sys.exit(1)
|
||||
|
||||
result = run(Path(sys.argv[1]), Path(sys.argv[2]))
|
||||
print(json.dumps(result))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -51,3 +51,4 @@ dependencies:
|
||||
- uvicorn>=0.27.0
|
||||
- sse-starlette>=1.6.0
|
||||
- nautilus_trader>=1.200.0
|
||||
- aiosqlite>=0.19.0
|
||||
|
||||
344
sandbox/main.py
344
sandbox/main.py
@@ -11,6 +11,7 @@ Brings together:
|
||||
|
||||
import asyncio
|
||||
import contextlib
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import signal
|
||||
@@ -32,7 +33,7 @@ from starlette.routing import Route, Mount
|
||||
|
||||
from dexorder import EventPublisher, start_lifecycle_manager, get_lifecycle_manager
|
||||
from dexorder.api import set_api, API
|
||||
from dexorder.conda_manager import sync_packages, install_packages
|
||||
from dexorder.conda_manager import sync_packages, install_packages, cleanup_extra_packages
|
||||
from dexorder.events import EventType, UserEvent, DeliverySpec
|
||||
from dexorder.impl.charting_api_impl import ChartingAPIImpl
|
||||
from dexorder.impl.data_api_impl import DataAPIImpl
|
||||
@@ -41,6 +42,8 @@ from dexorder.tools.workspace_tools import get_workspace_store
|
||||
from dexorder.tools.evaluate_indicator import evaluate_indicator
|
||||
from dexorder.tools.backtest_strategy import backtest_strategy
|
||||
from dexorder.tools.activate_strategy import activate_strategy, deactivate_strategy, list_active_strategies
|
||||
from dexorder.strategy.event_bridge import StrategyEventBridge
|
||||
from dexorder.strategy.lifecycle import get_strategy_lifecycle
|
||||
|
||||
# =============================================================================
|
||||
# Global Data Directory
|
||||
@@ -59,19 +62,34 @@ def get_data_dir() -> Path:
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Indicator Types Helpers
|
||||
# Category Types Helpers
|
||||
# =============================================================================
|
||||
|
||||
def _build_indicator_type_entry(meta: dict) -> dict:
|
||||
"""Build an indicator_types workspace entry from indicator metadata dict."""
|
||||
def _type_store_name(category: str) -> str:
|
||||
return f"{category}_types"
|
||||
|
||||
|
||||
def _type_store_key(category: str, name: str) -> str:
|
||||
sanitized = sanitize_name(name).lower()
|
||||
return f"custom_{sanitized}" if category == "indicator" else sanitized
|
||||
|
||||
|
||||
def _build_type_entry(category: str, meta: dict) -> dict:
|
||||
"""Build a {category}_types workspace entry from a metadata dict."""
|
||||
name = meta.get('name', '')
|
||||
pandas_ta_name = f"custom_{sanitize_name(name).lower()}"
|
||||
key = _type_store_key(category, name)
|
||||
now = int(time.time())
|
||||
return {
|
||||
'pandas_ta_name': pandas_ta_name,
|
||||
entry = {
|
||||
'key': key,
|
||||
'display_name': name,
|
||||
'description': meta.get('description', ''),
|
||||
'metadata': {
|
||||
'metadata': {},
|
||||
'created_at': now,
|
||||
'modified_at': now,
|
||||
}
|
||||
if category == "indicator":
|
||||
entry['pandas_ta_name'] = key
|
||||
entry['metadata'] = {
|
||||
'display_name': name,
|
||||
'parameters': meta.get('parameters') or {},
|
||||
'input_series': meta.get('input_series') or ['close'],
|
||||
@@ -79,31 +97,89 @@ def _build_indicator_type_entry(meta: dict) -> dict:
|
||||
'pane': meta.get('pane', 'separate'),
|
||||
'filled_areas': meta.get('filled_areas') or [],
|
||||
'bands': meta.get('bands') or [],
|
||||
},
|
||||
'created_at': now,
|
||||
'modified_at': now,
|
||||
}
|
||||
}
|
||||
elif category == "strategy":
|
||||
entry['metadata'] = {
|
||||
'data_feeds': meta.get('data_feeds') or [],
|
||||
'parameters': meta.get('parameters') or {},
|
||||
}
|
||||
# research: metadata stays empty (no fields beyond base)
|
||||
return entry
|
||||
|
||||
|
||||
def _upsert_indicator_type(workspace_store, category_manager, name: str) -> None:
|
||||
"""Read indicator metadata from disk and upsert into indicator_types workspace store."""
|
||||
read_result = category_manager.read('indicator', name)
|
||||
def _upsert_type(workspace_store, category_manager, category: str, name: str) -> None:
|
||||
"""Read category metadata from disk and upsert into the {category}_types workspace store."""
|
||||
read_result = category_manager.read(category, name)
|
||||
if not read_result.get('exists') or not read_result.get('metadata'):
|
||||
return
|
||||
meta = read_result['metadata']
|
||||
entry = _build_indicator_type_entry(meta)
|
||||
pandas_ta_name = entry['pandas_ta_name']
|
||||
entry = _build_type_entry(category, read_result['metadata'])
|
||||
key = entry['key']
|
||||
store = _type_store_name(category)
|
||||
|
||||
# Preserve original created_at if already present
|
||||
existing = workspace_store.read('indicator_types')
|
||||
existing = workspace_store.read(store)
|
||||
existing_types = (existing.get('data') or {}).get('types') or {}
|
||||
if pandas_ta_name in existing_types:
|
||||
entry['created_at'] = existing_types[pandas_ta_name].get('created_at', entry['created_at'])
|
||||
if key in existing_types:
|
||||
entry['created_at'] = existing_types[key].get('created_at', entry['created_at'])
|
||||
|
||||
workspace_store.patch('indicator_types', [
|
||||
{'op': 'add', 'path': f'/types/{pandas_ta_name}', 'value': entry}
|
||||
])
|
||||
logging.info(f"Upserted indicator_types/{pandas_ta_name} for '{name}'")
|
||||
workspace_store.patch(store, [{'op': 'add', 'path': f'/types/{key}', 'value': entry}])
|
||||
logging.info(f"Upserted {store}/{key} for '{name}'")
|
||||
|
||||
|
||||
def _remove_type(workspace_store, category: str, name: str) -> None:
|
||||
"""Remove a category item from the {category}_types workspace store."""
|
||||
key = _type_store_key(category, name)
|
||||
store = _type_store_name(category)
|
||||
try:
|
||||
workspace_store.patch(store, [{'op': 'remove', 'path': f'/types/{key}'}])
|
||||
logging.info(f"Removed {store}/{key} for '{name}'")
|
||||
except Exception:
|
||||
pass # entry may not exist; that's fine
|
||||
if category == "indicator":
|
||||
_remove_indicator_instances(workspace_store, key)
|
||||
|
||||
|
||||
def _remove_indicator_instances(workspace_store, pandas_ta_name: str) -> None:
|
||||
"""Remove all instances of a custom indicator from the indicators workspace store."""
|
||||
existing = workspace_store.read('indicators')
|
||||
instances = (existing.get('data') or {}).get('indicators') or {}
|
||||
to_remove = [inst_id for inst_id, inst in instances.items()
|
||||
if inst.get('pandas_ta_name') == pandas_ta_name]
|
||||
if not to_remove:
|
||||
return
|
||||
patches = [{'op': 'remove', 'path': f'/indicators/{inst_id}'} for inst_id in to_remove]
|
||||
try:
|
||||
workspace_store.patch('indicators', patches)
|
||||
logging.info(f"Removed {len(to_remove)} instance(s) of {pandas_ta_name} from indicators store")
|
||||
except Exception:
|
||||
logging.warning(f"Failed to remove indicator instances for {pandas_ta_name}", exc_info=True)
|
||||
|
||||
|
||||
def _populate_types_from_disk(workspace_store, category_manager, category: str) -> None:
|
||||
"""Scan existing category items and add any missing entries to the {category}_types store."""
|
||||
store = _type_store_name(category)
|
||||
existing = workspace_store.read(store)
|
||||
existing_types = (existing.get('data') or {}).get('types') or {}
|
||||
|
||||
items = category_manager.list_items(category).get('items', [])
|
||||
added = 0
|
||||
for item in items:
|
||||
item_name = item.get('name', '')
|
||||
if not item_name:
|
||||
continue
|
||||
key = _type_store_key(category, item_name)
|
||||
if key not in existing_types:
|
||||
_upsert_type(workspace_store, category_manager, category, item_name)
|
||||
added += 1
|
||||
|
||||
if added > 0:
|
||||
logging.info(f"Populated {added} {category} type(s) from disk into {store}")
|
||||
|
||||
|
||||
def _get_env_yml() -> Optional[Path]:
|
||||
"""Return the path to environment.yml if it exists alongside main.py."""
|
||||
p = Path(__file__).parent / "environment.yml"
|
||||
return p if p.exists() else None
|
||||
|
||||
|
||||
def _populate_indicator_types_from_disk(workspace_store, category_manager) -> None:
|
||||
@@ -226,8 +302,9 @@ def create_mcp_server(config: Config, event_publisher: EventPublisher) -> Server
|
||||
category_manager = get_category_manager(config.data_dir)
|
||||
logging.info(f"Category manager initialized at {config.data_dir}")
|
||||
|
||||
# Populate indicator_types store from existing indicators on disk (migration/startup sync)
|
||||
_populate_indicator_types_from_disk(workspace_store, category_manager)
|
||||
# Populate {category}_types stores from existing items on disk (migration/startup sync)
|
||||
for _cat in ("indicator", "strategy", "research"):
|
||||
_populate_types_from_disk(workspace_store, category_manager, _cat)
|
||||
|
||||
@server.list_resources()
|
||||
async def list_resources():
|
||||
@@ -503,6 +580,25 @@ def create_mcp_server(config: Config, event_publisher: EventPublisher) -> Server
|
||||
"required": ["revision", "category", "name"]
|
||||
}
|
||||
),
|
||||
Tool(
|
||||
name="python_delete",
|
||||
description="Delete a category script permanently. Commits removal to git history and removes any conda packages that are no longer needed.",
|
||||
inputSchema={
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"category": {
|
||||
"type": "string",
|
||||
"enum": ["strategy", "indicator", "research"],
|
||||
"description": "Category of the script"
|
||||
},
|
||||
"name": {
|
||||
"type": "string",
|
||||
"description": "Display name of the item to delete"
|
||||
}
|
||||
},
|
||||
"required": ["category", "name"]
|
||||
}
|
||||
),
|
||||
Tool(
|
||||
name="conda_sync",
|
||||
description="Sync conda packages: scan all metadata, remove unused packages (excluding base environment)",
|
||||
@@ -699,6 +795,77 @@ def create_mcp_server(config: Config, event_publisher: EventPublisher) -> Server
|
||||
"required": []
|
||||
}
|
||||
),
|
||||
Tool(
|
||||
name="get_backtest_results",
|
||||
description=(
|
||||
"Retrieve stored backtest results for a strategy. "
|
||||
"Returns the most recent backtest runs with summary stats, "
|
||||
"extended statistics, trade list, and equity curve."
|
||||
),
|
||||
inputSchema={
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"strategy_name": {
|
||||
"type": "string",
|
||||
"description": "Display name of the strategy"
|
||||
},
|
||||
"limit": {
|
||||
"type": "integer",
|
||||
"description": "Maximum number of backtest runs to return (default 5)",
|
||||
"default": 5
|
||||
}
|
||||
},
|
||||
"required": ["strategy_name"]
|
||||
}
|
||||
),
|
||||
Tool(
|
||||
name="get_strategy_trades",
|
||||
description=(
|
||||
"Retrieve the trade log for a strategy (live/paper or backtest). "
|
||||
"Returns individual round-trip trades with entry/exit prices and PnL."
|
||||
),
|
||||
inputSchema={
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"strategy_name": {
|
||||
"type": "string",
|
||||
"description": "Display name of the strategy"
|
||||
},
|
||||
"limit": {
|
||||
"type": "integer",
|
||||
"description": "Maximum number of trades to return (default 100)",
|
||||
"default": 100
|
||||
}
|
||||
},
|
||||
"required": ["strategy_name"]
|
||||
}
|
||||
),
|
||||
Tool(
|
||||
name="get_strategy_events",
|
||||
description=(
|
||||
"Retrieve the event log for a strategy "
|
||||
"(PnL updates, fills, errors, status changes)."
|
||||
),
|
||||
inputSchema={
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"strategy_name": {
|
||||
"type": "string",
|
||||
"description": "Display name of the strategy"
|
||||
},
|
||||
"event_type": {
|
||||
"type": "string",
|
||||
"description": "Filter by event type (optional): PNL_UPDATE, ORDER_FILLED, ERROR, etc."
|
||||
},
|
||||
"limit": {
|
||||
"type": "integer",
|
||||
"description": "Maximum number of events to return (default 50)",
|
||||
"default": 50
|
||||
}
|
||||
},
|
||||
"required": ["strategy_name"]
|
||||
}
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
@@ -734,7 +901,11 @@ def create_mcp_server(config: Config, event_publisher: EventPublisher) -> Server
|
||||
metadata=arguments.get("metadata")
|
||||
)
|
||||
content = []
|
||||
meta_parts = [f"success: {result['success']}", f"path: {result['path']}"]
|
||||
meta_parts = [f"success: {result['success']}"]
|
||||
if result.get('path'):
|
||||
meta_parts.append(f"path: {result['path']}")
|
||||
if result.get('error'):
|
||||
meta_parts.append(f"error: {result['error']}")
|
||||
if result.get("revision"):
|
||||
meta_parts.append(f"revision: {result['revision']}")
|
||||
if result.get("validation") and not result["validation"].get("success"):
|
||||
@@ -747,8 +918,9 @@ def create_mcp_server(config: Config, event_publisher: EventPublisher) -> Server
|
||||
logging.info(f"python_write '{arguments.get('name')}': returning {len(content)} items, {image_count} images")
|
||||
else:
|
||||
logging.info(f"python_write '{arguments.get('name')}': no execution result (category={arguments.get('category')})")
|
||||
if result.get("success") and arguments.get("category") == "indicator":
|
||||
_upsert_indicator_type(workspace_store, category_manager, arguments.get("name", ""))
|
||||
if result.get("success"):
|
||||
_upsert_type(workspace_store, category_manager, arguments.get("category", ""), arguments.get("name", ""))
|
||||
cleanup_extra_packages(get_data_dir(), _get_env_yml())
|
||||
return content
|
||||
elif name == "python_edit":
|
||||
result = category_manager.edit(
|
||||
@@ -760,7 +932,11 @@ def create_mcp_server(config: Config, event_publisher: EventPublisher) -> Server
|
||||
metadata=arguments.get("metadata")
|
||||
)
|
||||
content = []
|
||||
meta_parts = [f"success: {result['success']}", f"path: {result['path']}"]
|
||||
meta_parts = [f"success: {result['success']}"]
|
||||
if result.get('path'):
|
||||
meta_parts.append(f"path: {result['path']}")
|
||||
if result.get('error'):
|
||||
meta_parts.append(f"error: {result['error']}")
|
||||
if result.get("revision"):
|
||||
meta_parts.append(f"revision: {result['revision']}")
|
||||
if result.get("validation") and not result["validation"].get("success"):
|
||||
@@ -773,8 +949,9 @@ def create_mcp_server(config: Config, event_publisher: EventPublisher) -> Server
|
||||
logging.info(f"python_edit '{arguments.get('name')}': returning {len(content)} items, {image_count} images")
|
||||
else:
|
||||
logging.info(f"python_edit '{arguments.get('name')}': no execution result")
|
||||
if result.get("success") and arguments.get("category") == "indicator":
|
||||
_upsert_indicator_type(workspace_store, category_manager, arguments.get("name", ""))
|
||||
if result.get("success"):
|
||||
_upsert_type(workspace_store, category_manager, arguments.get("category", ""), arguments.get("name", ""))
|
||||
cleanup_extra_packages(get_data_dir(), _get_env_yml())
|
||||
return content
|
||||
elif name == "python_read":
|
||||
return category_manager.read(
|
||||
@@ -808,13 +985,28 @@ def create_mcp_server(config: Config, event_publisher: EventPublisher) -> Server
|
||||
meta_parts.append(f"error: {result['error']}")
|
||||
if result.get("validation") and not result["validation"].get("success"):
|
||||
meta_parts.append(f"validation errors: {result['validation'].get('errors', [])}")
|
||||
if result.get("success"):
|
||||
_upsert_type(workspace_store, category_manager, arguments.get("category", ""), arguments.get("name", ""))
|
||||
return [TextContent(type="text", text="\n".join(meta_parts))]
|
||||
elif name == "python_delete":
|
||||
result = category_manager.delete(
|
||||
category=arguments.get("category", ""),
|
||||
name=arguments.get("name", "")
|
||||
)
|
||||
if result.get("success"):
|
||||
_remove_type(workspace_store, arguments.get("category", ""), arguments.get("name", ""))
|
||||
cleanup_result = cleanup_extra_packages(get_data_dir(), _get_env_yml())
|
||||
if cleanup_result.get("removed"):
|
||||
result["packages_removed"] = cleanup_result["removed"]
|
||||
parts = [f"success: {result['success']}"]
|
||||
for k in ("category", "name", "revision", "packages_removed", "error"):
|
||||
if result.get(k):
|
||||
parts.append(f"{k}: {result[k]}")
|
||||
return [TextContent(type="text", text="\n".join(parts))]
|
||||
elif name == "conda_sync":
|
||||
# Get environment.yml path relative to main.py
|
||||
env_yml = Path(__file__).parent / "environment.yml"
|
||||
return sync_packages(
|
||||
data_dir=get_data_dir(),
|
||||
environment_yml=env_yml if env_yml.exists() else None
|
||||
environment_yml=_get_env_yml()
|
||||
)
|
||||
elif name == "conda_install":
|
||||
return install_packages(arguments.get("packages", []))
|
||||
@@ -837,7 +1029,7 @@ def create_mcp_server(config: Config, event_publisher: EventPublisher) -> Server
|
||||
parameters=arguments.get("parameters") or {},
|
||||
)
|
||||
elif name == "backtest_strategy":
|
||||
return await backtest_strategy(
|
||||
result = await backtest_strategy(
|
||||
strategy_name=arguments.get("strategy_name", ""),
|
||||
feeds=arguments.get("feeds", []),
|
||||
from_time=arguments.get("from_time"),
|
||||
@@ -845,6 +1037,26 @@ def create_mcp_server(config: Config, event_publisher: EventPublisher) -> Server
|
||||
initial_capital=float(arguments.get("initial_capital", 10_000.0)),
|
||||
paper=bool(arguments.get("paper", True)),
|
||||
)
|
||||
# Persist backtest to DB (non-fatal)
|
||||
try:
|
||||
payload = json.loads(result[0].text) if result and isinstance(result[0], TextContent) else {}
|
||||
if payload and "summary" in payload:
|
||||
from dexorder.strategy.db import get_strategy_db
|
||||
db = get_strategy_db(get_data_dir())
|
||||
await db.insert_backtest(
|
||||
strategy_name=arguments.get("strategy_name", ""),
|
||||
from_time=arguments.get("from_time"),
|
||||
to_time=arguments.get("to_time"),
|
||||
initial_capital=float(arguments.get("initial_capital", 10_000.0)),
|
||||
feeds=arguments.get("feeds", []),
|
||||
summary=payload.get("summary", {}),
|
||||
statistics=payload.get("statistics", {}),
|
||||
trades=payload.get("trades", []),
|
||||
equity_curve=payload.get("equity_curve", []),
|
||||
)
|
||||
except Exception as _e:
|
||||
logging.debug("Failed to persist backtest results: %s", _e)
|
||||
return result
|
||||
elif name == "activate_strategy":
|
||||
return await activate_strategy(
|
||||
strategy_name=arguments.get("strategy_name", ""),
|
||||
@@ -858,6 +1070,31 @@ def create_mcp_server(config: Config, event_publisher: EventPublisher) -> Server
|
||||
)
|
||||
elif name == "list_active_strategies":
|
||||
return await list_active_strategies()
|
||||
elif name == "get_backtest_results":
|
||||
from dexorder.strategy.db import get_strategy_db
|
||||
db = get_strategy_db(get_data_dir())
|
||||
results = await db.get_backtests(
|
||||
strategy_name=arguments.get("strategy_name", ""),
|
||||
limit=int(arguments.get("limit", 5)),
|
||||
)
|
||||
return [TextContent(type="text", text=json.dumps({"backtest_runs": results}))]
|
||||
elif name == "get_strategy_trades":
|
||||
from dexorder.strategy.db import get_strategy_db
|
||||
db = get_strategy_db(get_data_dir())
|
||||
trades = await db.get_trades(
|
||||
strategy_name=arguments.get("strategy_name", ""),
|
||||
limit=int(arguments.get("limit", 100)),
|
||||
)
|
||||
return [TextContent(type="text", text=json.dumps({"trades": trades}))]
|
||||
elif name == "get_strategy_events":
|
||||
from dexorder.strategy.db import get_strategy_db
|
||||
db = get_strategy_db(get_data_dir())
|
||||
events = await db.get_events(
|
||||
strategy_name=arguments.get("strategy_name", ""),
|
||||
event_type=arguments.get("event_type"),
|
||||
limit=int(arguments.get("limit", 50)),
|
||||
)
|
||||
return [TextContent(type="text", text=json.dumps({"events": events}))]
|
||||
else:
|
||||
raise ValueError(f"Unknown tool: {name}")
|
||||
|
||||
@@ -909,6 +1146,7 @@ class UserContainer:
|
||||
self.event_publisher: Optional[EventPublisher] = None
|
||||
self.mcp_server: Optional[Server] = None
|
||||
self.data_api: Optional[DataAPIImpl] = None
|
||||
self.event_bridge: Optional[StrategyEventBridge] = None
|
||||
self.running = False
|
||||
|
||||
async def start(self) -> None:
|
||||
@@ -933,6 +1171,7 @@ class UserContainer:
|
||||
s3_endpoint=s3_cfg.get("s3_endpoint") or secrets.get("s3_endpoint"),
|
||||
s3_access_key=s3_cfg.get("s3_access_key") or secrets.get("s3_access_key"),
|
||||
s3_secret_key=s3_cfg.get("s3_secret_key") or secrets.get("s3_secret_key"),
|
||||
s3_region=s3_cfg.get("s3_region") or secrets.get("s3_region"),
|
||||
)
|
||||
await self.data_api.start()
|
||||
set_api(API(charting=ChartingAPIImpl(), data=self.data_api))
|
||||
@@ -965,6 +1204,23 @@ class UserContainer:
|
||||
delivery=DeliverySpec.active_or_telegram(),
|
||||
))
|
||||
|
||||
# Initialize strategy lifecycle manager (sets up DB + worktrees dir)
|
||||
strategy_lifecycle = get_strategy_lifecycle(self.config.data_dir)
|
||||
await strategy_lifecycle.initialize()
|
||||
|
||||
# Start strategy event bridge (PULL socket for subprocess events)
|
||||
self.event_bridge = StrategyEventBridge(
|
||||
event_publisher=self.event_publisher,
|
||||
strategy_lifecycle=strategy_lifecycle,
|
||||
)
|
||||
await self.event_bridge.start()
|
||||
strategy_lifecycle._bridge = self.event_bridge
|
||||
strategy_lifecycle._lifecycle = get_lifecycle_manager()
|
||||
logging.info("Strategy event bridge started")
|
||||
|
||||
# Resume any strategies that were running before container restart
|
||||
await strategy_lifecycle.resume_running()
|
||||
|
||||
# Create MCP server
|
||||
self.mcp_server = create_mcp_server(self.config, self.event_publisher)
|
||||
|
||||
@@ -998,6 +1254,20 @@ class UserContainer:
|
||||
delivery=DeliverySpec.active_or_telegram(),
|
||||
))
|
||||
|
||||
# Stop running strategies gracefully
|
||||
try:
|
||||
from dexorder.strategy.lifecycle import get_strategy_lifecycle
|
||||
strategy_lifecycle = get_strategy_lifecycle()
|
||||
await strategy_lifecycle.shutdown()
|
||||
logging.info("Strategy lifecycle manager stopped")
|
||||
except Exception as e:
|
||||
logging.warning("Error stopping strategy lifecycle: %s", e)
|
||||
|
||||
# Stop event bridge
|
||||
if self.event_bridge:
|
||||
await self.event_bridge.stop()
|
||||
logging.info("Strategy event bridge stopped")
|
||||
|
||||
# Stop subsystems
|
||||
if self.data_api:
|
||||
await self.data_api.stop()
|
||||
|
||||
@@ -1,30 +0,0 @@
|
||||
from setuptools import setup, find_packages
|
||||
|
||||
setup(
|
||||
name="dexorder-sandbox",
|
||||
version="0.1.0",
|
||||
description="Dexorder Trading Platform Sandbox",
|
||||
packages=find_packages(),
|
||||
python_requires=">=3.9",
|
||||
install_requires=[
|
||||
"pyiceberg>=0.6.0",
|
||||
"pyarrow>=14.0.0",
|
||||
"pandas>=2.0.0",
|
||||
"pyzmq>=25.0.0",
|
||||
"protobuf>=4.25.0",
|
||||
"pyyaml>=6.0",
|
||||
"aiofiles>=23.0.0",
|
||||
"mcp>=1.0.0",
|
||||
"jsonpatch>=1.33",
|
||||
"starlette>=0.27.0",
|
||||
"uvicorn>=0.27.0",
|
||||
"sse-starlette>=1.6.0",
|
||||
"matplotlib>=3.7.0",
|
||||
],
|
||||
extras_require={
|
||||
"dev": [
|
||||
"pytest>=7.0.0",
|
||||
"pytest-asyncio>=0.21.0",
|
||||
]
|
||||
},
|
||||
)
|
||||
@@ -59,6 +59,18 @@ const addToolCallBubble = (label: string) => {
|
||||
}]
|
||||
}
|
||||
|
||||
const appendToolCallStatus = (status: string) => {
|
||||
if (!toolCallMessageId) return
|
||||
const idx = messages.value.findIndex(m => m._id === toolCallMessageId)
|
||||
if (idx !== -1) {
|
||||
messages.value[idx] = {
|
||||
...messages.value[idx],
|
||||
content: messages.value[idx].content + `\n↳ ${status}`
|
||||
}
|
||||
messages.value = [...messages.value]
|
||||
}
|
||||
}
|
||||
|
||||
const removeToolCallBubble = () => {
|
||||
if (toolCallMessageId) {
|
||||
messages.value = messages.value.filter(m => m._id !== toolCallMessageId)
|
||||
@@ -76,11 +88,47 @@ const streamingImages = ref<any[]>([])
|
||||
const handleMessage = (data: WebSocketMessage) => {
|
||||
console.log('[ChatPanel] Received message:', data)
|
||||
|
||||
if (data.type === 'conversation_history') {
|
||||
messages.value = (data.messages as any[]).map((m: any) => {
|
||||
const ts = new Date(m.timestamp / 1000) // microseconds → ms
|
||||
const files = (m.files ?? []).map((b: any) => ({
|
||||
name: `image_${b.id}.png`,
|
||||
size: 0,
|
||||
type: b.mimeType.split('/')[1] ?? 'png',
|
||||
url: `data:${b.mimeType};base64,${b.data}`,
|
||||
preview: `data:${b.mimeType};base64,${b.data}`,
|
||||
}))
|
||||
return {
|
||||
_id: m.id,
|
||||
content: m.content,
|
||||
senderId: m.role === 'user' ? CURRENT_USER_ID : AGENT_ID,
|
||||
timestamp: ts.toTimeString().split(' ')[0].slice(0, 5),
|
||||
date: ts.toLocaleDateString(),
|
||||
saved: true,
|
||||
distributed: true,
|
||||
seen: true,
|
||||
files,
|
||||
}
|
||||
})
|
||||
messagesLoaded.value = true
|
||||
return
|
||||
}
|
||||
|
||||
if (data.type === 'agent_tool_call') {
|
||||
addToolCallBubble(data.label ?? data.toolName ?? 'Tool call...')
|
||||
return
|
||||
}
|
||||
|
||||
if (data.type === 'subagent_tool_call') {
|
||||
appendToolCallStatus(data.toolName ?? data.label ?? 'tool')
|
||||
return
|
||||
}
|
||||
|
||||
if (data.type === 'subagent_chunk') {
|
||||
// Subagent final text — not shown separately; the main agent will incorporate it in its response
|
||||
return
|
||||
}
|
||||
|
||||
if (data.type === 'image') {
|
||||
// Handle image message - attach to current streaming message or create standalone
|
||||
console.log('[ChatPanel] Processing image message')
|
||||
|
||||
@@ -3,6 +3,24 @@ import * as jsonpatch from 'fast-json-patch';
|
||||
import type { BackendMessage, FrontendMessage, HelloMessage, PatchMessage } from '../types/sync';
|
||||
import { wsManager } from './useWebSocket';
|
||||
|
||||
function deepReplace(target: Record<string, any>, source: Record<string, any>) {
|
||||
for (const key of Object.keys(target)) {
|
||||
if (!(key in source)) {
|
||||
delete target[key]
|
||||
}
|
||||
}
|
||||
for (const [key, value] of Object.entries(source)) {
|
||||
if (
|
||||
value !== null && typeof value === 'object' && !Array.isArray(value) &&
|
||||
target[key] !== null && typeof target[key] === 'object' && !Array.isArray(target[key])
|
||||
) {
|
||||
deepReplace(target[key], value)
|
||||
} else {
|
||||
target[key] = value
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
export function useStateSync(stores: Record<string, Store>) {
|
||||
console.log('[StateSync] Initializing with stores:', Object.keys(stores));
|
||||
|
||||
@@ -35,7 +53,7 @@ export function useStateSync(stores: Record<string, Store>) {
|
||||
if (store) {
|
||||
console.log('[StateSync] Applying snapshot state:', msg.state);
|
||||
isApplyingBackendPatch[msg.store] = true;
|
||||
store.$patch(msg.state);
|
||||
store.$patch((state) => deepReplace(state as Record<string, any>, msg.state as Record<string, any>));
|
||||
// Update previousState to stay in sync
|
||||
previousStates[msg.store] = JSON.parse(JSON.stringify(store.$state));
|
||||
isApplyingBackendPatch[msg.store] = false;
|
||||
@@ -64,7 +82,7 @@ export function useStateSync(stores: Record<string, Store>) {
|
||||
const newState = jsonpatch.applyPatch(currentState, msg.patch, false, false).newDocument;
|
||||
console.log('[StateSync] New state after patch:', newState);
|
||||
isApplyingBackendPatch[msg.store] = true;
|
||||
store.$patch(newState);
|
||||
store.$patch((state) => deepReplace(state as Record<string, any>, newState as Record<string, any>));
|
||||
// Update previousState to stay in sync
|
||||
previousStates[msg.store] = JSON.parse(JSON.stringify(store.$state));
|
||||
isApplyingBackendPatch[msg.store] = false;
|
||||
|
||||
@@ -123,8 +123,9 @@ class WebSocketManager {
|
||||
this.statusMessage.value = ''
|
||||
console.log('WebSocket disconnected:', event.code, event.reason)
|
||||
|
||||
// Attempt to reconnect if we have a token
|
||||
if (this.token && !event.wasClean) {
|
||||
// Attempt to reconnect if we have a token and it wasn't an intentional close.
|
||||
// Check code instead of wasClean: code 1005 has wasClean=true but still needs retry.
|
||||
if (this.token && event.code !== 1000 && event.code !== 1001) {
|
||||
this.scheduleReconnect()
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user