data pipeline refactor and fix

2026-04-13 18:30:04 -04:00
parent 6418729b16
commit 326bf80846
96 changed files with 7107 additions and 1763 deletions

.gitignore

@@ -1,6 +1,7 @@
 /backend.old/data
 /backend.old/uploads/
 chat/
+bin/create-all-users

 # Environment variables
 .env
@@ -114,6 +115,9 @@ deploy/k8s/prod/secrets/*.yaml
 # Dev environment image tags
 .dev-image-tag

+# Dev gateway-config is generated from gateway-config.yaml.tpl by bin/dev
+deploy/k8s/dev/configs/gateway-config.yaml

 # Protobuf copies (canonical files are in /protobuf/)
 flink/protobuf/
 relay/protobuf/


@@ -21,6 +21,10 @@ usage() {
 }

 ENV="${1:-dev}"
+ARG_EMAIL="${2:-}"
+ARG_PASSWORD="${3:-}"
+ARG_NAME="${4:-}"
+ARG_LICENSE="${5:-}"

 if [[ "$ENV" != "dev" && "$ENV" != "prod" ]]; then
   echo -e "${RED}Error: Environment must be 'dev' or 'prod'${NC}"
@@ -44,16 +48,36 @@ if [ -z "$PG_POD" ]; then
   exit 1
 fi

-# Prompt for credentials
-read -p "Email: " USER_EMAIL
-read -rs -p "Password (min 8 chars): " USER_PASSWORD
-echo ""
+# Get credentials — from args or interactively
+if [[ -n "$ARG_EMAIL" ]]; then
+  USER_EMAIL="$ARG_EMAIL"
+else
+  read -p "Email: " USER_EMAIL
+fi
+
+if [[ -n "$ARG_PASSWORD" ]]; then
+  USER_PASSWORD="$ARG_PASSWORD"
+else
+  read -rs -p "Password (min 8 chars): " USER_PASSWORD
+  echo ""
+fi

 if [[ ${#USER_PASSWORD} -lt 8 ]]; then
   echo -e "${RED}✗ Password must be at least 8 characters${NC}"
   exit 1
 fi

-read -p "Display name: " USER_NAME
-read -p "License type [free|pro|enterprise] (default: pro): " LICENSE_TYPE
+if [[ -n "$ARG_NAME" ]]; then
+  USER_NAME="$ARG_NAME"
+else
+  read -p "Display name: " USER_NAME
+fi
+
+if [[ -n "$ARG_LICENSE" ]]; then
+  LICENSE_TYPE="$ARG_LICENSE"
+else
+  read -p "License type [free|pro|enterprise] (default: pro): " LICENSE_TYPE
+fi
 LICENSE_TYPE="${LICENSE_TYPE:-pro}"

 # Check if user already exists


@@ -43,7 +43,7 @@ if [ "$PROJECT" == "dev" ]; then
 fi

 if [ "$DEV" == "1" ]; then
-    TAG="dev`date +%Y%m%d%H%M%S`"
+    TAG="dev`date -u +%Y%m%d%H%M%S`"
     if [ "$1" != "" ]; then
         CONFIG=$1
         shift

bin/deploy-all (new executable file)

@@ -0,0 +1,158 @@
#!/usr/bin/env bash
set -e
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ROOT_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"
# Colors
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color
KUBECTL="kubectl --context=prod"
CLEAR_SANDBOXES=0
usage() {
echo "Usage: $0 [--sandboxes]"
echo ""
echo "Deploy all services to production. Does NOT update secrets (use bin/secret-update)."
echo ""
echo "Steps performed:"
echo " 1. Apply base kustomize manifests (namespaces, RBAC, policies)"
echo " 2. Apply infrastructure.yaml (statefulsets, deployments)"
echo " 3. Run bin/config-update prod"
echo " 4. Build and deploy all application images"
echo " 5. Wait for rollouts"
echo ""
echo "Options:"
echo " --sandboxes Delete sandbox Deployments and Services (PVCs are retained)."
echo " The gateway will recreate sandboxes on next user login."
echo ""
exit 1
}
for arg in "$@"; do
case "$arg" in
--sandboxes)
CLEAR_SANDBOXES=1
;;
--help|-h)
usage
;;
*)
echo -e "${RED}Unknown argument: $arg${NC}"
usage
;;
esac
done
echo -e "${YELLOW}╔══════════════════════════════════════════╗${NC}"
echo -e "${YELLOW}║ PRODUCTION FULL DEPLOY ║${NC}"
echo -e "${YELLOW}╚══════════════════════════════════════════╝${NC}"
echo ""
echo -e "${YELLOW}⚠️ This will update ALL production services.${NC}"
echo -e "${YELLOW} Secrets are NOT updated (run bin/secret-update prod separately).${NC}"
if [ "$CLEAR_SANDBOXES" == "1" ]; then
echo -e "${YELLOW} Sandbox deployments will be DELETED (PVCs retained).${NC}"
fi
echo ""
read -p "Are you sure you want to continue? (yes/no): " confirm
if [[ "$confirm" != "yes" ]]; then
echo "Aborted."
exit 0
fi
step() {
echo ""
echo -e "${BLUE}━━━ $1 ━━━${NC}"
}
ok() {
echo -e "${GREEN}✓${NC} $1"
}
fail() {
echo -e "${RED}✗ $1${NC}"
exit 1
}
# ── Step 1: Base kustomize manifests ─────────────────────────────────────────
step "Step 1/5: Applying base kustomize manifests"
cd "$ROOT_DIR"
$KUBECTL apply -k deploy/k8s/prod/
ok "Base manifests applied (namespaces, RBAC, policies, quotas)"
# ── Step 2: Infrastructure ────────────────────────────────────────────────────
step "Step 2/5: Applying infrastructure.yaml"
$KUBECTL -n ai apply -f deploy/k8s/prod/infrastructure.yaml
ok "Infrastructure applied"
# ── Step 3: Configs ───────────────────────────────────────────────────────────
step "Step 3/5: Updating configs"
# config-update prod will prompt for confirmation; we already confirmed above,
# so feed "yes" automatically via stdin.
echo "yes" | "$SCRIPT_DIR/config-update" prod
ok "Configs updated"
# ── Step 4: Build and deploy all application images ───────────────────────────
step "Step 4/5: Building and deploying application images"
echo ""
SERVICES=(gateway web sandbox lifecycle-sidecar flink relay ingestor)
for service in "${SERVICES[@]}"; do
echo -e "${GREEN}→${NC} Deploying $service..."
"$SCRIPT_DIR/deploy" "$service" prod
ok "$service deployed"
echo ""
done
# ── Step 4b: Optionally clear sandbox deployments ─────────────────────────────
if [ "$CLEAR_SANDBOXES" == "1" ]; then
step "Step 4b: Clearing sandbox deployments"
SANDBOX_DEPLOYS=$($KUBECTL -n sandbox get deployments -o name 2>/dev/null || true)
SANDBOX_SVCS=$($KUBECTL -n sandbox get services -o name 2>/dev/null || true)
if [ -z "$SANDBOX_DEPLOYS" ]; then
echo " No sandbox deployments found."
else
echo " Deleting sandbox deployments..."
echo "$SANDBOX_DEPLOYS" | xargs $KUBECTL -n sandbox delete
ok "Sandbox deployments deleted"
fi
if [ -n "$SANDBOX_SVCS" ]; then
echo " Deleting sandbox services..."
echo "$SANDBOX_SVCS" | xargs $KUBECTL -n sandbox delete
ok "Sandbox services deleted"
fi
echo -e "${YELLOW} PVCs retained — gateway will recreate sandboxes on next login.${NC}"
fi
# ── Step 5: Wait for rollouts ─────────────────────────────────────────────────
step "Step 5/5: Waiting for rollouts"
ROLLOUTS=(
"deployment/gateway"
"deployment/ai-web"
"deployment/relay"
"deployment/ingestor"
"deployment/flink-jobmanager"
"deployment/flink-taskmanager"
)
for r in "${ROLLOUTS[@]}"; do
echo -e "${GREEN}→${NC} Waiting for $r..."
$KUBECTL -n ai rollout status "$r" --timeout=180s || echo -e "${YELLOW} ⚠ $r did not become ready within 3 minutes${NC}"
done
echo ""
echo -e "${GREEN}╔══════════════════════════════════════════╗${NC}"
echo -e "${GREEN}║ Deploy complete! ║${NC}"
echo -e "${GREEN}╚══════════════════════════════════════════╝${NC}"
echo ""
echo " Verify: curl -I https://dexorder.ai/api/health"
echo ""

bin/dev

@@ -99,6 +99,12 @@ start_minikube() {
     fi
 }

+generate_gateway_config_dev() {
+    sed "s|SANDBOX_IMAGE_TAG|dexorder/ai-sandbox:$SANDBOX_TAG|g; s|SIDECAR_IMAGE_TAG|dexorder/ai-lifecycle-sidecar:$SIDECAR_TAG|g" \
+        "$ROOT_DIR/deploy/k8s/dev/configs/gateway-config.yaml.tpl" \
+        > "$ROOT_DIR/deploy/k8s/dev/configs/gateway-config.yaml"
+}

 rebuild_images() {
     local service="${1:-all}"
     echo -e "${BLUE}Building custom images...${NC}"
@@ -221,12 +227,7 @@ deploy_services() {
     # Update configs
     echo -e "${GREEN}→${NC} Updating configs..."
-    # Template gateway-config.yaml with actual image tags (backup first for safe restore)
-    local _gw_bak
-    _gw_bak=$(mktemp)
-    cp "$ROOT_DIR/deploy/k8s/dev/configs/gateway-config.yaml" "$_gw_bak"
-    sed -i "s|sandbox_image: dexorder/ai-sandbox:.*|sandbox_image: dexorder/ai-sandbox:$SANDBOX_TAG|g" "$ROOT_DIR/deploy/k8s/dev/configs/gateway-config.yaml"
-    sed -i "s|sidecar_image: dexorder/ai-lifecycle-sidecar:.*|sidecar_image: dexorder/ai-lifecycle-sidecar:$SIDECAR_TAG|g" "$ROOT_DIR/deploy/k8s/dev/configs/gateway-config.yaml"
+    generate_gateway_config_dev
     "$SCRIPT_DIR/config-update" dev
@@ -264,10 +265,6 @@ EOF
     # Clean up the appended image tags from kustomization.yaml
     sed -i '/# Image tags (added by bin\/dev)/,$d' kustomization.yaml
-    # Restore gateway-config.yaml from backup
-    cp "$_gw_bak" "$ROOT_DIR/deploy/k8s/dev/configs/gateway-config.yaml"
-    rm "$_gw_bak"

     echo -e "${GREEN}✓ Services deployed${NC}"
     echo ""
@@ -525,6 +522,9 @@ deep_restart() {
             ;;
     esac

+    echo -e "${GREEN}→${NC} Rebuilding application images..."
+    rebuild_images

     echo -e "${GREEN}→${NC} Redeploying services..."
     deploy_services
@@ -589,11 +589,7 @@ deploy_service() {
         gateway)
             image_name="dexorder/ai-gateway"
             image_tag="$GATEWAY_TAG"
-            # Also need to template gateway-config.yaml (backup for safe restore)
-            _gw_bak_single=$(mktemp)
-            cp "$ROOT_DIR/deploy/k8s/dev/configs/gateway-config.yaml" "$_gw_bak_single"
-            sed -i "s|sandbox_image: dexorder/ai-sandbox:.*|sandbox_image: dexorder/ai-sandbox:$SANDBOX_TAG|g" "$ROOT_DIR/deploy/k8s/dev/configs/gateway-config.yaml"
-            sed -i "s|sidecar_image: dexorder/ai-lifecycle-sidecar:.*|sidecar_image: dexorder/ai-lifecycle-sidecar:$SIDECAR_TAG|g" "$ROOT_DIR/deploy/k8s/dev/configs/gateway-config.yaml"
+            generate_gateway_config_dev
             "$SCRIPT_DIR/config-update" dev
             ;;
         web)
@@ -623,12 +619,6 @@ EOF
     # Clean up the appended image tags from kustomization.yaml
     sed -i '/# Image tags (added by bin\/dev)/,$d' kustomization.yaml
-    # Restore gateway-config.yaml from backup if we modified it
-    if [ "$service" == "gateway" ]; then
-        cp "$_gw_bak_single" "$ROOT_DIR/deploy/k8s/dev/configs/gateway-config.yaml"
-        rm "$_gw_bak_single"
-    fi

     echo -e "${GREEN}✓ $service deployed${NC}"
 }
@@ -713,15 +703,10 @@ case "$COMMAND" in
         cd "$ROOT_DIR/deploy/k8s/dev"

-        # Template gateway-config if gateway is in the list (backup for safe restore)
-        _ms_gw_bak=""
+        # Regenerate gateway-config if gateway is in the list
         for svc in "${deploy_services_list[@]}"; do
             if [ "$svc" == "gateway" ]; then
-                _ms_gw_bak=$(mktemp)
-                cp "$ROOT_DIR/deploy/k8s/dev/configs/gateway-config.yaml" "$_ms_gw_bak"
-                sed -i "s|sandbox_image: dexorder/ai-sandbox:.*|sandbox_image: dexorder/ai-sandbox:$SANDBOX_TAG|g" "$ROOT_DIR/deploy/k8s/dev/configs/gateway-config.yaml"
-                sed -i "s|sidecar_image: dexorder/ai-lifecycle-sidecar:.*|sidecar_image: dexorder/ai-lifecycle-sidecar:$SIDECAR_TAG|g" "$ROOT_DIR/deploy/k8s/dev/configs/gateway-config.yaml"
-                "$SCRIPT_DIR/config-update" dev
+                generate_gateway_config_dev
                 break
             fi
         done
@@ -744,11 +729,6 @@ case "$COMMAND" in
         sed -i '/# Image tags (added by bin\/dev)/,$d' kustomization.yaml

-        # Restore gateway-config from backup if we modified it
-        if [ -n "$_ms_gw_bak" ]; then
-            cp "$_ms_gw_bak" "$ROOT_DIR/deploy/k8s/dev/configs/gateway-config.yaml"
-            rm "$_ms_gw_bak"
-        fi
     fi

     # Handle sandbox separately


@@ -45,6 +45,29 @@ else
     MCP_URL="http://localhost:8080/mcp"
 fi

+# ---------- MinIO Bucket Initialization ----------
+echo ""
+echo -e "${BLUE}=== MinIO Storage Setup ===${NC}"
+echo ""
+echo -e "${BLUE}Waiting for MinIO pod...${NC}"
+$KUBECTL wait --for=condition=ready --timeout=120s pod -l app=minio 2>/dev/null || {
+    echo -e "${YELLOW}⚠️ MinIO not ready after 120s, skipping bucket setup${NC}"
+}
+
+MINIO_POD=$($KUBECTL get pods -l app=minio -o jsonpath='{.items[0].metadata.name}' 2>/dev/null)
+if [ -n "$MINIO_POD" ]; then
+    echo -e "${GREEN}→${NC} Ensuring warehouse bucket exists..."
+    MINIO_USER=$($KUBECTL exec "$MINIO_POD" -- sh -c 'echo $MINIO_ROOT_USER' 2>/dev/null | tr -d '\r')
+    MINIO_PASS=$($KUBECTL exec "$MINIO_POD" -- sh -c 'echo $MINIO_ROOT_PASSWORD' 2>/dev/null | tr -d '\r')
+    $KUBECTL exec "$MINIO_POD" -- mc alias set local http://localhost:9000 "$MINIO_USER" "$MINIO_PASS" > /dev/null 2>&1
+    $KUBECTL exec "$MINIO_POD" -- mc mb --ignore-existing local/warehouse > /dev/null 2>&1
+    echo -e "${GREEN}✓ Warehouse bucket ready${NC}"
+else
+    echo -e "${YELLOW}⚠️ MinIO pod not found, skipping bucket setup${NC}"
+fi

 # ---------- Schema Initialization ----------
 echo ""


@@ -56,8 +56,8 @@ data:
       namespace: sandbox
       service_namespace: default
       in_cluster: true
-      sandbox_image: dexorder/ai-sandbox:dev20260409143116
-      sidecar_image: dexorder/ai-lifecycle-sidecar:dev20260408103634
+      sandbox_image: SANDBOX_IMAGE_TAG
+      sidecar_image: SIDECAR_IMAGE_TAG
       storage_class: standard
       image_pull_policy: Never  # For minikube dev - use local images


@@ -1,9 +1,8 @@
 # CCXT Ingestor Configuration

-# Relay ZMQ endpoints (relay is the well-known gateway)
-flink_hostname: relay
-ingestor_work_port: 5555  # SUB - receives DataRequest with exchange prefix
-# Note: No response port needed - async architecture via Kafka!
+# Flink IngestorBroker (ROUTER) endpoint
+flink_hostname: flink-jobmanager
+ingestor_broker_port: 5567

 # Supported exchanges (subscribe to these prefixes)
 supported_exchanges:


@@ -455,6 +455,14 @@ spec:
       protocol: TCP
       port: 5561
       targetPort: 5561
+    - name: zmq-client-pull
+      protocol: TCP
+      port: 5566
+      targetPort: 5566
+    - name: zmq-ingestor-broker
+      protocol: TCP
+      port: 5567
+      targetPort: 5567
   type: ClusterIP
 ---
 apiVersion: apps/v1
@@ -583,14 +591,6 @@ spec:
   selector:
     app: relay
   ports:
-    - name: work-queue
-      protocol: TCP
-      port: 5555
-      targetPort: 5555
-    - name: responses
-      protocol: TCP
-      port: 5556
-      targetPort: 5556
     - name: market-data
       protocol: TCP
       port: 5558
@@ -620,10 +620,6 @@ spec:
         image: dexorder/ai-relay
         imagePullPolicy: Never
         ports:
-        - containerPort: 5555
-          name: work-queue
-        - containerPort: 5556
-          name: responses
         - containerPort: 5558
           name: market-data
         - containerPort: 5559
@@ -657,9 +653,9 @@ spec:
         app: ingestor
     spec:
       initContainers:
-        - name: wait-for-relay
+        - name: wait-for-flink
           image: busybox:1.36
-          command: ['sh', '-c', 'until nc -z relay 5555; do echo waiting for relay; sleep 2; done;']
+          command: ['sh', '-c', 'until nc -z flink-jobmanager 5567; do echo waiting for flink broker; sleep 2; done;']
         - name: wait-for-kafka
          image: busybox:1.36
          command: ['sh', '-c', 'until nc -z kafka 9092; do echo waiting for kafka; sleep 2; done;']


@@ -30,6 +30,7 @@ data:
namespace: "trading" namespace: "trading"
# S3 endpoint for MinIO in default namespace # S3 endpoint for MinIO in default namespace
s3_endpoint: "http://minio.default.svc.cluster.local:9000" s3_endpoint: "http://minio.default.svc.cluster.local:9000"
s3_region: "us-east-1"
relay: relay:
endpoint: "tcp://relay.default.svc.cluster.local:5559" endpoint: "tcp://relay.default.svc.cluster.local:5559"


@@ -1,9 +1,8 @@
 # CCXT Ingestor Configuration

-# Relay ZMQ endpoints (relay is the well-known gateway)
-flink_hostname: relay
-ingestor_work_port: 5555  # SUB - receives DataRequest with exchange prefix
-# Note: No response port needed - async architecture via Kafka!
+# Flink IngestorBroker (ROUTER) endpoint
+flink_hostname: flink-jobmanager
+ingestor_broker_port: 5567

 # Supported exchanges (subscribe to these prefixes)
 supported_exchanges:


@@ -451,6 +451,14 @@ spec:
       protocol: TCP
       port: 5561
       targetPort: 5561
+    - name: zmq-client-pull
+      protocol: TCP
+      port: 5566
+      targetPort: 5566
+    - name: zmq-ingestor-broker
+      protocol: TCP
+      port: 5567
+      targetPort: 5567
   type: ClusterIP
 ---
 apiVersion: apps/v1
@@ -579,14 +587,6 @@ spec:
   selector:
     app: relay
   ports:
-    - name: work-queue
-      protocol: TCP
-      port: 5555
-      targetPort: 5555
-    - name: responses
-      protocol: TCP
-      port: 5556
-      targetPort: 5556
     - name: market-data
       protocol: TCP
       port: 5558
@@ -616,10 +616,6 @@ spec:
         image: dexorder/ai-relay
         imagePullPolicy: Always
         ports:
-        - containerPort: 5555
-          name: work-queue
-        - containerPort: 5556
-          name: responses
         - containerPort: 5558
           name: market-data
         - containerPort: 5559
@@ -653,9 +649,9 @@ spec:
        app: ingestor
     spec:
       initContainers:
-        - name: wait-for-relay
+        - name: wait-for-flink
          image: busybox:1.36
-          command: ['sh', '-c', 'until nc -z relay 5555; do echo waiting for relay; sleep 2; done;']
+          command: ['sh', '-c', 'until nc -z flink-jobmanager 5567; do echo waiting for flink broker; sleep 2; done;']
        - name: wait-for-kafka
          image: busybox:1.36
          command: ['sh', '-c', 'until nc -z kafka 9092; do echo waiting for kafka; sleep 2; done;']


@@ -22,6 +22,7 @@ data:
       catalog_uri: "http://iceberg-catalog.ai.svc.cluster.local:8181"
       namespace: "trading"
       s3_endpoint: "http://minio.ai.svc.cluster.local:9000"
+      s3_region: "us-east-1"
     relay:
       endpoint: "tcp://relay.ai.svc.cluster.local:5559"


@@ -2,7 +2,8 @@ package com.dexorder.flink;
 import com.dexorder.flink.config.AppConfig;
 import com.dexorder.flink.iceberg.SchemaInitializer;
-import com.dexorder.flink.ingestor.IngestorWorkQueue;
+import com.dexorder.flink.ingestor.IngestorBroker;
+import com.dexorder.flink.ingestor.RealtimeSubscriptionManager;
 import com.dexorder.flink.kafka.TopicManager;
 import com.dexorder.flink.publisher.HistoryNotificationForwarder;
 import com.dexorder.flink.publisher.HistoryNotificationFunction;
@@ -10,6 +11,11 @@ import com.dexorder.flink.publisher.OHLCBatchWrapper;
 import com.dexorder.flink.publisher.OHLCBatchDeserializer;
 import com.dexorder.flink.publisher.MarketWrapper;
 import com.dexorder.flink.publisher.MarketDeserializer;
+import com.dexorder.flink.publisher.RealtimeBar;
+import com.dexorder.flink.publisher.RealtimeBarFunction;
+import com.dexorder.flink.publisher.RealtimeBarPublisher;
+import com.dexorder.flink.publisher.TickWrapper;
+import com.dexorder.flink.publisher.TickDeserializer;
 import com.dexorder.flink.sink.HistoricalBatchWriter;
 import com.dexorder.flink.sink.SymbolMetadataWriter;
 import com.dexorder.flink.zmq.ZmqChannelManager;
@@ -83,11 +89,16 @@ public class TradingFlinkApp {
             catalogProps
         );

+        String warehouse = config.getString("iceberg_warehouse", "s3://warehouse/");
+        String warehouseBucket = warehouse.replaceFirst("^s3://", "").split("/")[0];
+
         org.apache.iceberg.catalog.Catalog catalog = catalogLoader.loadCatalog();
         try {
             SchemaInitializer schemaInitializer = new SchemaInitializer(
                 catalog,
-                config.getIcebergNamespace()
+                config.getIcebergNamespace(),
+                config.getString("s3_endpoint", "http://minio:9000"),
+                warehouseBucket
             );
             schemaInitializer.initializeSchemas();
         } finally {
@@ -107,20 +118,28 @@ public class TradingFlinkApp {
         zmqManager.initializeChannels();
         LOG.info("ZeroMQ channels initialized");

-        // Initialize history notification forwarder (runs in job manager)
-        // Binds PULL socket to receive notifications from task managers, forwards to MARKET_DATA_PUB
+        // Initialize ingestor broker — manages ROUTER/DEALER work queue for all ingestors
+        IngestorBroker broker = new IngestorBroker(zmqManager);
+        broker.start();
+        LOG.info("IngestorBroker started");
+
+        // Initialize realtime subscription manager — owns MARKET_DATA_PUB socket exclusively,
+        // detects XPUB subscription events, and calls broker for realtime job lifecycle.
+        // Other components publish via subscriptionManager.enqueuePublish() (thread-safe).
+        RealtimeSubscriptionManager subscriptionManager = new RealtimeSubscriptionManager(zmqManager, broker);
+        subscriptionManager.start();
+        LOG.info("RealtimeSubscriptionManager started");
+
+        // Initialize history notification forwarder (runs in job manager).
+        // Binds PULL socket to receive notifications from task managers, enqueues them for
+        // publication via RealtimeSubscriptionManager (sole owner of MARKET_DATA_PUB).
         HistoryNotificationForwarder notificationForwarder = new HistoryNotificationForwarder(
             config.getNotificationPullPort(),
-            zmqManager.getSocket(ZmqChannelManager.Channel.MARKET_DATA_PUB)
+            subscriptionManager::enqueuePublish
         );
         notificationForwarder.start();
         LOG.info("History notification forwarder started on port {}", config.getNotificationPullPort());

-        // Initialize ingestor work queue
-        IngestorWorkQueue workQueue = new IngestorWorkQueue(zmqManager);
-        workQueue.start();
-        LOG.info("Ingestor work queue started");
-
         // Set up Flink streaming environment
         StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
@@ -224,8 +243,37 @@ public class TradingFlinkApp {
         LOG.info("Symbol metadata pipeline configured: SymbolMetadataWriter -> Iceberg -> METADATA_UPDATE notification");

+        // Realtime tick pipeline: Kafka market-tick → OHLC bars → ZMQ notify → clients
+        KafkaSource<TickWrapper> tickSource = KafkaSource.<TickWrapper>builder()
+            .setBootstrapServers(config.getKafkaBootstrapServers())
+            .setTopics(config.getKafkaTickTopic())
+            .setGroupId("flink-tick-consumer")
+            .setStartingOffsets(OffsetsInitializer.latest())
+            .setValueOnlyDeserializer(new TickDeserializer())
+            .build();
+
+        DataStream<TickWrapper> tickStream = env
+            .fromSource(tickSource, WatermarkStrategy.noWatermarks(), "Tick Kafka Source")
+            .filter(t -> t != null)
+            .setParallelism(1);
+
+        // Aggregate ticks into OHLC bars for each configured period.
+        // keyBy ticker so all ticks for a ticker land on the same slot and accumulate together.
+        int[] periods = config.getRealtimePeriods();
+        DataStream<RealtimeBar> barStream = tickStream
+            .keyBy(TickWrapper::getTicker)
+            .flatMap(new RealtimeBarFunction(periods))
+            .setParallelism(1);
+
+        barStream.addSink(new RealtimeBarPublisher(notificationEndpoint))
+            .setParallelism(1)
+            .name("RealtimeBarPublisher");
+
+        LOG.info("Realtime tick pipeline configured: market-tick → OHLC bars → clients (periods={})",
+            java.util.Arrays.toString(periods));
+
         // TODO: Set up CEP patterns and triggers
-        // TODO: Set up realtime tick processing

         LOG.info("Flink job configured, starting execution");
@@ -233,15 +281,10 @@
         Runtime.getRuntime().addShutdownHook(new Thread(() -> {
             LOG.info("Shutting down Trading Flink Application");
             try {
-                // Stop work queue
-                workQueue.stop();
-                // Stop notification forwarder
                 notificationForwarder.close();
+                subscriptionManager.stop();
-                // Close ZMQ channels
+                broker.stop();
                 zmqManager.close();

                 LOG.info("Shutdown complete");
             } catch (Exception e) {
                 LOG.error("Error during shutdown", e);

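RealtimeBarFunction and RealtimeBar are referenced above but their sources are not part of this commit. As a rough illustration of the per-period accumulation the new pipeline depends on, the core fold step of such a keyed flatMap could look like the sketch below (hypothetical class and field names, not the repository's actual implementation):

    // Hypothetical sketch of window-aligned OHLC accumulation; illustrative only.
    final class OhlcAccumulatorSketch {
        long bucketStart = -1;          // epoch seconds, aligned to the period boundary
        double open, high, low, close;

        /** Folds one tick into the bar for periodSeconds; returns true when a new window opens. */
        boolean add(long tsSeconds, double price, int periodSeconds) {
            long bucket = tsSeconds - (tsSeconds % periodSeconds);
            boolean rolled = bucket != bucketStart;
            if (rolled) {               // first tick of a new window: reset the bar
                bucketStart = bucket;
                open = high = low = close = price;
            } else {                    // same window: extend high/low, move the close
                high = Math.max(high, price);
                low = Math.min(low, price);
                close = price;
            }
            return rolled;
        }
    }

A real RealtimeBarFunction would keep one such accumulator per configured period and ticker, emitting an updated bar downstream whenever a tick lands or a window rolls over.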

@@ -91,14 +91,20 @@ public class AppConfig {
     }

     // ZMQ port getters
-    public int getIngestorWorkQueuePort() {
-        return getInt("zmq_ingestor_work_queue_port", 5555);
-    }
     public int getMarketDataPubPort() {
         return getInt("zmq_market_data_pub_port", 5558);
     }

+    /** Port where Flink's IngestorBroker binds a PULL socket to receive requests from relay PUSH */
+    public int getFlinkRequestPullPort() {
+        return getInt("zmq_flink_request_pull_port", 5566);
+    }
+
+    /** Port where Flink's IngestorBroker binds a ROUTER for ingestor DEALER connections */
+    public int getIngestorBrokerPort() {
+        return getInt("zmq_ingestor_broker_port", 5567);
+    }
+
     public String getBindAddress() {
         return getString("zmq_bind_address", "tcp://*");
     }
@@ -112,6 +118,20 @@
         return getString("kafka_tick_topic", "market-tick");
     }

+    /**
+     * Comma-separated OHLC period lengths in seconds for realtime bar computation.
+     * Default covers common chart periods: 1m, 5m, 15m, 1h, 4h, 1d.
+     */
+    public int[] getRealtimePeriods() {
+        String raw = getString("realtime_periods", "60,300,900,3600,14400,86400");
+        String[] parts = raw.split(",");
+        int[] periods = new int[parts.length];
+        for (int i = 0; i < parts.length; i++) {
+            periods[i] = Integer.parseInt(parts[i].trim());
+        }
+        return periods;
+    }
+
     public String getKafkaOhlcTopic() {
         return getString("kafka_ohlc_topic", "market-ohlc");
     }

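As an example of the format getRealtimePeriods() accepts, a config entry of realtime_periods: "60,300,900" limits realtime bars to 1m, 5m and 15m; the parsing above is equivalent to:

    // Equivalent parsing of the comma-separated realtime_periods value (illustrative).
    int[] periods = java.util.Arrays.stream("60,300,900".split(","))
            .mapToInt(s -> Integer.parseInt(s.trim()))
            .toArray();   // -> {60, 300, 900}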

@@ -9,8 +9,16 @@ import org.apache.iceberg.catalog.TableIdentifier;
 import org.apache.iceberg.types.Types;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
+import software.amazon.awssdk.auth.credentials.DefaultCredentialsProvider;
+import software.amazon.awssdk.regions.Region;
+import software.amazon.awssdk.services.s3.S3Client;
+import software.amazon.awssdk.services.s3.S3Configuration;
+import software.amazon.awssdk.services.s3.model.CreateBucketRequest;
+import software.amazon.awssdk.services.s3.model.HeadBucketRequest;
+import software.amazon.awssdk.services.s3.model.NoSuchBucketException;

 import java.io.IOException;
+import java.net.URI;

 import static org.apache.iceberg.types.Types.NestedField.optional;
 import static org.apache.iceberg.types.Types.NestedField.required;
@@ -26,10 +34,14 @@ public class SchemaInitializer {
     private final Catalog catalog;
     private final String namespace;
+    private final String s3Endpoint;
+    private final String warehouseBucket;

-    public SchemaInitializer(Catalog catalog, String namespace) {
+    public SchemaInitializer(Catalog catalog, String namespace, String s3Endpoint, String warehouseBucket) {
         this.catalog = catalog;
         this.namespace = namespace;
+        this.s3Endpoint = s3Endpoint;
+        this.warehouseBucket = warehouseBucket;
     }

     /**
@@ -40,6 +52,9 @@ public class SchemaInitializer {
     public void initializeSchemas() throws IOException {
         LOG.info("Initializing Iceberg schemas in namespace: {}", namespace);

+        // Ensure S3 bucket exists before attempting to create tables
+        ensureS3BucketExists();
+
         // Ensure namespace exists
         ensureNamespaceExists();
@@ -52,6 +67,36 @@ public class SchemaInitializer {
         LOG.info("Schema initialization completed successfully");
     }

+    /**
+     * Ensure the S3 warehouse bucket exists, creating it if necessary.
+     * Runs before any table creation so a fresh MinIO deployment doesn't crash Flink.
+     */
+    private void ensureS3BucketExists() {
+        if (s3Endpoint == null || warehouseBucket == null || warehouseBucket.isEmpty()) {
+            LOG.warn("S3 endpoint or warehouse bucket not configured, skipping bucket check");
+            return;
+        }
+        LOG.info("Ensuring S3 bucket '{}' exists at {}", warehouseBucket, s3Endpoint);
+        try (S3Client s3 = S3Client.builder()
+                .endpointOverride(URI.create(s3Endpoint))
+                .region(Region.of("us-east-1"))
+                .serviceConfiguration(S3Configuration.builder().pathStyleAccessEnabled(true).build())
+                .credentialsProvider(DefaultCredentialsProvider.create())
+                .build()) {
+            try {
+                s3.headBucket(HeadBucketRequest.builder().bucket(warehouseBucket).build());
+                LOG.info("S3 bucket '{}' already exists", warehouseBucket);
+            } catch (NoSuchBucketException e) {
+                LOG.warn("S3 bucket '{}' not found — creating it now", warehouseBucket);
+                s3.createBucket(CreateBucketRequest.builder().bucket(warehouseBucket).build());
+                LOG.info("Created S3 bucket '{}'", warehouseBucket);
+            }
+        } catch (Exception e) {
+            LOG.error("Failed to ensure S3 bucket '{}' exists at {}", warehouseBucket, s3Endpoint, e);
+            throw new RuntimeException("S3 bucket initialization failed for: " + warehouseBucket, e);
+        }
+    }
+
     /**
      * Ensure the namespace exists in the catalog.
      */


@@ -0,0 +1,503 @@
package com.dexorder.flink.ingestor;
import com.dexorder.flink.zmq.ZmqChannelManager;
import com.dexorder.proto.DataRequest;
import com.dexorder.proto.RealtimeParams;
import com.dexorder.proto.SubmitHistoricalRequest;
import com.dexorder.proto.WorkComplete;
import com.dexorder.proto.WorkHeartbeat;
import com.dexorder.proto.WorkReject;
import com.dexorder.proto.WorkStop;
import com.dexorder.proto.WorkerReady;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.zeromq.ZMQ;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Deque;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Queue;
import java.util.Set;
import java.util.UUID;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentLinkedQueue;
/**
* LRU-style work broker for ingestors.
*
* Ingestors connect via DEALER to the ROUTER socket on port 5567. They register with READY,
* are dispatched WORK messages, and respond with COMPLETE (historical) or HEARTBEAT (realtime).
* If a heartbeat times out the job is re-queued and dispatched to another available worker.
*
* Also receives SubmitHistoricalRequest messages forwarded by the relay on the PULL socket (5566).
*
* Message type IDs (ZMQ framing, not Kafka):
* 0x10 SubmitHistoricalRequest (relay → Flink via PULL, same as client wire type)
* 0x20 WorkerReady (ingestor → Flink)
* 0x21 WorkComplete (ingestor → Flink)
* 0x22 WorkHeartbeat (ingestor → Flink)
* 0x23 WorkReject (ingestor → Flink)
* 0x01 DataRequest/WorkAssign (Flink → ingestor via ROUTER)
* 0x25 WorkStop (Flink → ingestor via ROUTER)
*/
public class IngestorBroker implements AutoCloseable {
private static final Logger LOG = LoggerFactory.getLogger(IngestorBroker.class);
private static final byte PROTOCOL_VERSION = 0x01;
private static final byte MSG_TYPE_SUBMIT_REQUEST = 0x10;
private static final byte MSG_TYPE_WORKER_READY = 0x20;
private static final byte MSG_TYPE_WORK_COMPLETE = 0x21;
private static final byte MSG_TYPE_WORK_HEARTBEAT = 0x22;
private static final byte MSG_TYPE_WORK_REJECT = 0x23;
private static final byte MSG_TYPE_WORK_ASSIGN = 0x01; // DataRequest type on wire
private static final byte MSG_TYPE_WORK_STOP = 0x25;
/** Re-queue realtime job if no heartbeat received within this window (ms) */
private static final long HEARTBEAT_TIMEOUT_MS = 25_000;
/** Re-queue historical job if not completed within this window (ms) */
private static final long HISTORICAL_TIMEOUT_MS = 60_000;
private final ZmqChannelManager zmqManager;
private volatile boolean running;
private Thread brokerThread;
// ── Worker tracking ──────────────────────────────────────────────────────
/** Workers ready to accept a job, in LRU order (head = least recently used) */
private final Deque<WorkerInfo> freeWorkers = new ArrayDeque<>();
/** Jobs waiting for a compatible free worker */
private final Queue<DataRequest> pendingJobs = new ArrayDeque<>();
/** Jobs currently executing on a worker */
private final Map<String, ActiveJob> activeJobs = new ConcurrentHashMap<>();
/** Worker identity → WorkerInfo (supported exchanges recorded when the worker sends READY) */
private final Map<String, WorkerInfo> knownWorkers = new ConcurrentHashMap<>();
// ── Thread-safe inbound queue from RealtimeSubscriptionManager ───────────
private final Queue<DataRequest> externalSubmissions = new ConcurrentLinkedQueue<>();
public IngestorBroker(ZmqChannelManager zmqManager) {
this.zmqManager = zmqManager;
}
public void start() {
if (running) {
LOG.warn("IngestorBroker already running");
return;
}
running = true;
brokerThread = new Thread(this::brokerLoop, "IngestorBroker-Thread");
brokerThread.setDaemon(false);
brokerThread.start();
LOG.info("IngestorBroker started");
}
public void stop() {
running = false;
if (brokerThread != null) {
brokerThread.interrupt();
try {
brokerThread.join(5000);
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
}
}
LOG.info("IngestorBroker stopped");
}
/**
* Submit a realtime data request from outside the broker thread (thread-safe).
* Called by RealtimeSubscriptionManager when subscription ref count goes 0→1.
*/
public void submitRealtimeRequest(String ticker) {
String jobId = UUID.randomUUID().toString();
DataRequest request = DataRequest.newBuilder()
.setRequestId(jobId)
.setJobId(jobId)
.setType(DataRequest.RequestType.REALTIME_TICKS)
.setTicker(ticker)
.setRealtime(RealtimeParams.newBuilder()
.setIncludeTicks(true)
.setIncludeOhlc(false)
.build())
.build();
externalSubmissions.add(request);
LOG.info("Enqueued realtime request: ticker={}, jobId={}", ticker, jobId);
}
/**
* Stop all realtime jobs for a ticker (called when last subscriber leaves).
* Thread-safe: posting a stop marker through externalSubmissions would add complexity, so we
* find and stop the active jobs directly here; the lookup is protected by ConcurrentHashMap.
*/
public void stopRealtimeJobsForTicker(String ticker) {
List<String> toStop = new ArrayList<>();
for (Map.Entry<String, ActiveJob> entry : activeJobs.entrySet()) {
if (entry.getValue().ticker.equals(ticker) &&
entry.getValue().type == DataRequest.RequestType.REALTIME_TICKS) {
toStop.add(entry.getKey());
}
}
for (String jobId : toStop) {
ActiveJob job = activeJobs.remove(jobId);
if (job != null) {
sendStop(job.workerIdentity, jobId);
LOG.info("Sent STOP to ingestor: ticker={}, jobId={}", ticker, jobId);
}
}
}
// ── Broker loop ──────────────────────────────────────────────────────────
private void brokerLoop() {
ZMQ.Socket pullSocket = zmqManager.getSocket(ZmqChannelManager.Channel.CLIENT_REQUEST);
ZMQ.Socket routerSocket = zmqManager.getSocket(ZmqChannelManager.Channel.INGESTOR_BROKER);
ZMQ.Poller poller = zmqManager.createPoller(2);
poller.register(pullSocket, ZMQ.Poller.POLLIN);
poller.register(routerSocket, ZMQ.Poller.POLLIN);
LOG.info("IngestorBroker loop running");
while (running) {
try {
// Drain external submissions (realtime requests from subscription manager)
DataRequest ext;
while ((ext = externalSubmissions.poll()) != null) {
enqueueJob(ext);
}
// Poll sockets (100ms timeout)
poller.poll(100);
if (poller.pollin(0)) {
handleClientRequest(pullSocket);
}
if (poller.pollin(1)) {
handleWorkerMessage(routerSocket);
}
// Check for heartbeat / completion timeouts
checkTimeouts();
} catch (Exception e) {
if (running) {
LOG.error("Error in broker loop", e);
}
}
}
LOG.info("IngestorBroker loop exited");
}
/** Receive a SubmitHistoricalRequest forwarded by relay and enqueue it. */
private void handleClientRequest(ZMQ.Socket pullSocket) {
byte[] versionFrame = pullSocket.recv(ZMQ.DONTWAIT);
if (versionFrame == null) return;
if (!pullSocket.hasReceiveMore()) return;
byte[] messageFrame = pullSocket.recv(0);
if (messageFrame == null || messageFrame.length < 2) return;
if (versionFrame.length != 1 || versionFrame[0] != PROTOCOL_VERSION) {
LOG.warn("Bad protocol version on PULL socket");
return;
}
byte msgType = messageFrame[0];
byte[] payload = Arrays.copyOfRange(messageFrame, 1, messageFrame.length);
if (msgType != MSG_TYPE_SUBMIT_REQUEST) {
LOG.warn("Unexpected message type on PULL socket: 0x{}", Integer.toHexString(msgType & 0xFF));
return;
}
try {
SubmitHistoricalRequest req = SubmitHistoricalRequest.parseFrom(payload);
String jobId = UUID.randomUUID().toString();
DataRequest dataRequest = DataRequest.newBuilder()
.setRequestId(req.getRequestId())
.setJobId(jobId)
.setType(DataRequest.RequestType.HISTORICAL_OHLC)
.setTicker(req.getTicker())
.setHistorical(com.dexorder.proto.HistoricalParams.newBuilder()
.setStartTime(req.getStartTime())
.setEndTime(req.getEndTime())
.setPeriodSeconds(req.getPeriodSeconds())
.build())
.setClientId(req.hasClientId() ? req.getClientId() : "")
.build();
enqueueJob(dataRequest);
LOG.info("Received historical request from relay: request_id={}, ticker={}", req.getRequestId(), req.getTicker());
} catch (Exception e) {
LOG.error("Failed to parse SubmitHistoricalRequest from relay", e);
}
}
/** Receive and dispatch a message from an ingestor DEALER. */
private void handleWorkerMessage(ZMQ.Socket routerSocket) {
// ROUTER frame layout: [identity][empty][version][type+payload]
byte[] identity = routerSocket.recv(ZMQ.DONTWAIT);
if (identity == null) return;
if (!routerSocket.hasReceiveMore()) return;
routerSocket.recv(0); // empty delimiter
if (!routerSocket.hasReceiveMore()) return;
byte[] versionFrame = routerSocket.recv(0);
if (!routerSocket.hasReceiveMore()) return;
byte[] messageFrame = routerSocket.recv(0);
if (versionFrame == null || versionFrame.length != 1 || versionFrame[0] != PROTOCOL_VERSION) {
LOG.warn("Bad protocol version from ingestor");
return;
}
if (messageFrame == null || messageFrame.length < 1) return;
byte msgType = messageFrame[0];
byte[] payload = Arrays.copyOfRange(messageFrame, 1, messageFrame.length);
String identityKey = bytesToHex(identity);
try {
switch (msgType & 0xFF) {
case 0x20: handleWorkerReady(identity, identityKey, payload); break;
case 0x21: handleWorkComplete(identityKey, payload); break;
case 0x22: handleWorkHeartbeat(identityKey, payload); break;
case 0x23: handleWorkReject(identityKey, payload); break;
default:
LOG.warn("Unknown message type from ingestor: 0x{}", Integer.toHexString(msgType & 0xFF));
}
} catch (Exception e) {
LOG.error("Error handling worker message type 0x{}", Integer.toHexString(msgType & 0xFF), e);
}
}
private void handleWorkerReady(byte[] identity, String identityKey, byte[] payload) throws Exception {
WorkerReady ready = WorkerReady.parseFrom(payload);
Set<String> exchanges = new HashSet<>(ready.getExchangesList());
WorkerInfo worker = knownWorkers.computeIfAbsent(identityKey,
k -> new WorkerInfo(identity, identityKey, exchanges));
worker.exchanges = exchanges; // update in case re-READY with different config
worker.identity = identity;
if (!freeWorkers.contains(worker)) {
freeWorkers.addLast(worker);
}
LOG.info("Ingestor READY: id={}, exchanges={}, freeWorkers={}", identityKey, exchanges, freeWorkers.size());
dispatchPending();
}
private void handleWorkComplete(String identityKey, byte[] payload) throws Exception {
WorkComplete complete = WorkComplete.parseFrom(payload);
String jobId = complete.getJobId();
ActiveJob job = activeJobs.remove(jobId);
if (job == null) {
LOG.warn("COMPLETE for unknown jobId={}", jobId);
} else {
LOG.info("Job COMPLETE: jobId={}, ticker={}, success={}", jobId, job.ticker, complete.getSuccess());
}
// Worker is free again
WorkerInfo worker = knownWorkers.get(identityKey);
if (worker != null) {
freeWorkers.addLast(worker);
dispatchPending();
}
}
private void handleWorkHeartbeat(String identityKey, byte[] payload) throws Exception {
WorkHeartbeat hb = WorkHeartbeat.parseFrom(payload);
String jobId = hb.getJobId();
ActiveJob job = activeJobs.get(jobId);
if (job != null) {
job.lastHeartbeat = System.currentTimeMillis();
} else {
LOG.warn("HEARTBEAT for unknown jobId={} from worker={}", jobId, identityKey);
}
}
private void handleWorkReject(String identityKey, byte[] payload) throws Exception {
WorkReject reject = WorkReject.parseFrom(payload);
String jobId = reject.getJobId();
LOG.warn("Job REJECTED by worker={}: jobId={}, reason={}", identityKey, jobId, reject.getReason());
ActiveJob job = activeJobs.remove(jobId);
if (job != null) {
// Re-queue with fresh job_id so a different ingestor may pick it up
DataRequest requeued = job.request.toBuilder()
.setJobId(UUID.randomUUID().toString())
.build();
pendingJobs.add(requeued);
}
// Worker is still free (it rejected, not crashed)
WorkerInfo worker = knownWorkers.get(identityKey);
if (worker != null) {
freeWorkers.addLast(worker);
dispatchPending();
}
}
// ── Dispatch ─────────────────────────────────────────────────────────────
private void enqueueJob(DataRequest request) {
// Check if we can immediately dispatch
WorkerInfo worker = findFreeWorker(exchangeOf(request.getTicker()));
if (worker != null) {
dispatch(worker, request);
} else {
pendingJobs.add(request);
LOG.debug("No free worker for {}, queued (pendingJobs={})", request.getTicker(), pendingJobs.size());
}
}
private void dispatchPending() {
Queue<DataRequest> remaining = new ArrayDeque<>();
DataRequest job;
while ((job = pendingJobs.poll()) != null) {
WorkerInfo worker = findFreeWorker(exchangeOf(job.getTicker()));
if (worker != null) {
dispatch(worker, job);
} else {
remaining.add(job);
}
}
pendingJobs.addAll(remaining);
}
private void dispatch(WorkerInfo worker, DataRequest request) {
freeWorkers.remove(worker);
try {
byte[] protoBytes = request.toByteArray();
boolean sent = zmqManager.sendToWorker(worker.identity, PROTOCOL_VERSION, MSG_TYPE_WORK_ASSIGN, protoBytes);
if (!sent) {
LOG.error("Failed to dispatch job to worker={}, re-queuing", worker.identityKey);
freeWorkers.addLast(worker);
pendingJobs.add(request);
return;
}
ActiveJob active = new ActiveJob(worker.identity, worker.identityKey,
request, request.getTicker(), request.getType());
activeJobs.put(request.getJobId(), active);
LOG.info("Dispatched job: jobId={}, ticker={}, type={}, worker={}",
request.getJobId(), request.getTicker(), request.getType(), worker.identityKey);
} catch (Exception e) {
LOG.error("Error dispatching job", e);
freeWorkers.addLast(worker);
}
}
private void sendStop(byte[] workerIdentity, String jobId) {
try {
WorkStop stop = WorkStop.newBuilder().setJobId(jobId).build();
zmqManager.sendToWorker(workerIdentity, PROTOCOL_VERSION, MSG_TYPE_WORK_STOP, stop.toByteArray());
} catch (Exception e) {
LOG.error("Error sending STOP for jobId={}", jobId, e);
}
}
// ── Timeout checking ─────────────────────────────────────────────────────
private void checkTimeouts() {
long now = System.currentTimeMillis();
List<String> timedOut = new ArrayList<>();
for (Map.Entry<String, ActiveJob> entry : activeJobs.entrySet()) {
ActiveJob job = entry.getValue();
long timeout = job.type == DataRequest.RequestType.REALTIME_TICKS
? HEARTBEAT_TIMEOUT_MS : HISTORICAL_TIMEOUT_MS;
if (now - job.lastHeartbeat > timeout) {
timedOut.add(entry.getKey());
}
}
for (String jobId : timedOut) {
ActiveJob job = activeJobs.remove(jobId);
if (job == null) continue;
LOG.warn("Job timed out (no heartbeat/completion): jobId={}, ticker={}, type={}, worker={}",
jobId, job.ticker, job.type, job.workerIdentityKey);
// Re-queue with a new job_id
DataRequest requeued = job.request.toBuilder()
.setJobId(UUID.randomUUID().toString())
.build();
pendingJobs.add(requeued);
dispatchPending();
}
}
// ── Helpers ──────────────────────────────────────────────────────────────
/** Extract exchange name from ticker, e.g. "BTC/USDT.BINANCE" → "BINANCE" */
private static String exchangeOf(String ticker) {
int dot = ticker.lastIndexOf('.');
return dot >= 0 ? ticker.substring(dot + 1).toUpperCase() : "";
}
/** Find and remove a free worker that supports the given exchange. */
private WorkerInfo findFreeWorker(String exchange) {
for (WorkerInfo w : freeWorkers) {
if (exchange.isEmpty() || w.exchanges.contains(exchange)) {
freeWorkers.remove(w);
return w;
}
}
return null;
}
private static String bytesToHex(byte[] bytes) {
StringBuilder sb = new StringBuilder();
for (byte b : bytes) sb.append(String.format("%02x", b));
return sb.toString();
}
@Override
public void close() {
stop();
}
// ── Inner types ──────────────────────────────────────────────────────────
private static class WorkerInfo {
byte[] identity;
final String identityKey;
Set<String> exchanges;
WorkerInfo(byte[] identity, String identityKey, Set<String> exchanges) {
this.identity = identity;
this.identityKey = identityKey;
this.exchanges = exchanges;
}
}
private static class ActiveJob {
final byte[] workerIdentity;
final String workerIdentityKey;
final DataRequest request;
final String ticker;
final DataRequest.RequestType type;
long lastHeartbeat;
ActiveJob(byte[] workerIdentity, String workerIdentityKey,
DataRequest request, String ticker, DataRequest.RequestType type) {
this.workerIdentity = workerIdentity;
this.workerIdentityKey = workerIdentityKey;
this.request = request;
this.ticker = ticker;
this.type = type;
this.lastHeartbeat = System.currentTimeMillis();
}
}
}

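For reference, an ingestor registering with this broker connects a DEALER to port 5567 and sends the [empty][version][type + payload] frames that handleWorkerMessage() expects behind the ROUTER's identity frame. A minimal JeroMQ sketch of that registration (the endpoint is an example; the real ingestor is a separate service and is not part of this commit):

    import org.zeromq.SocketType;
    import org.zeromq.ZContext;
    import org.zeromq.ZMQ;

    // Illustrative worker-side READY registration against IngestorBroker's ROUTER socket.
    public class WorkerReadySketch {
        public static void main(String[] args) {
            try (ZContext ctx = new ZContext()) {
                ZMQ.Socket dealer = ctx.createSocket(SocketType.DEALER);
                dealer.connect("tcp://flink-jobmanager:5567");     // example endpoint (zmq-ingestor-broker)

                byte[] payload = com.dexorder.proto.WorkerReady.newBuilder()
                        .addExchanges("BINANCE")                   // exchanges this worker can serve
                        .build()
                        .toByteArray();
                byte[] message = new byte[payload.length + 1];
                message[0] = 0x20;                                 // MSG_TYPE_WORKER_READY
                System.arraycopy(payload, 0, message, 1, payload.length);

                dealer.send(new byte[0], ZMQ.SNDMORE);             // empty delimiter frame
                dealer.send(new byte[]{0x01}, ZMQ.SNDMORE);        // protocol version frame
                dealer.send(message, 0);                           // type byte + WorkerReady protobuf
            }
        }
    }

After READY, the worker waits for a 0x01 WORK_ASSIGN frame and later reports COMPLETE, HEARTBEAT or REJECT using the same framing.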

@@ -119,7 +119,7 @@ public class IngestorWorkQueue {
             String exchangePrefix = extractExchangePrefix(ticker);
             boolean sent = zmqManager.sendTopicMessage(
-                ZmqChannelManager.Channel.INGESTOR_WORK_QUEUE,
+                ZmqChannelManager.Channel.INGESTOR_BROKER,
                 exchangePrefix,
                 PROTOCOL_VERSION,
                 MSG_TYPE_DATA_REQUEST,


@@ -0,0 +1,204 @@
package com.dexorder.flink.ingestor;
import com.dexorder.flink.zmq.ZmqChannelManager;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.zeromq.ZMQ;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* Monitors XPUB subscription events from the relay and manages realtime ingestor lifecycle.
*
* This class is the <em>sole owner</em> of the MARKET_DATA_PUB XPUB socket. All outbound
* publishes from other threads (e.g., HistoryNotificationForwarder, RealtimeOHLCPublisher)
* must go through {@link #enqueuePublish(byte[]...)} so they are sent from the single loop
* thread — ZMQ sockets are not thread-safe.
*
* Topic format: {@code {ticker}|ohlc:{period_seconds}}
* Example: {@code BTC/USDT.BINANCE|ohlc:60}
*
* Reference counting:
* tickerRefs — across all periods for a ticker; 0→1 triggers ingestor activation
* topicRefs — per (ticker, period); consulted by RealtimeOHLCPublisher to filter output
*/
public class RealtimeSubscriptionManager implements AutoCloseable {
private static final Logger LOG = LoggerFactory.getLogger(RealtimeSubscriptionManager.class);
private static final Pattern TOPIC_PATTERN = Pattern.compile("^(.+)\\|ohlc:(\\d+)$");
private final ZmqChannelManager zmqManager;
private final ZMQ.Socket xpubSocket;
private final IngestorBroker broker;
/** Per-ticker reference count (across all subscribed periods for that ticker) */
private final Map<String, Integer> tickerRefs = new HashMap<>();
/** Per-topic reference count (ticker|ohlc:period → subscriber count) */
private final Map<String, Integer> topicRefs = new HashMap<>();
/**
* Thread-safe outbound publish queue.
* Each entry is one multi-frame message: {@code byte[][] frames}.
*/
private final ConcurrentLinkedQueue<byte[][]> publishQueue = new ConcurrentLinkedQueue<>();
private volatile boolean running;
private Thread thread;
public RealtimeSubscriptionManager(ZmqChannelManager zmqManager, IngestorBroker broker) {
this.zmqManager = zmqManager;
this.xpubSocket = zmqManager.getSocket(ZmqChannelManager.Channel.MARKET_DATA_PUB);
this.broker = broker;
}
/**
* Queue a multi-frame message for publication on MARKET_DATA_PUB.
* Thread-safe — may be called from any thread (HistoryNotificationForwarder,
* RealtimeOHLCPublisher, etc.).
*/
public void enqueuePublish(byte[]... frames) {
publishQueue.add(frames);
}
/**
* Returns the current subscriber count for a topic.
* Thread-safe for reads (value is written only from the loop thread but read from others).
*/
public int getTopicRefCount(String topic) {
return topicRefs.getOrDefault(topic, 0);
}
public void start() {
if (running) {
LOG.warn("RealtimeSubscriptionManager already running");
return;
}
running = true;
thread = new Thread(this::subscriptionLoop, "RealtimeSubscriptionManager");
thread.setDaemon(false);
thread.start();
LOG.info("RealtimeSubscriptionManager started");
}
public void stop() {
running = false;
if (thread != null) {
thread.interrupt();
try {
thread.join(5000);
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
}
}
LOG.info("RealtimeSubscriptionManager stopped");
}
private void subscriptionLoop() {
// Build a poller so we can block-wait rather than busy-spin
ZMQ.Poller poller = zmqManager.createPoller(1);
poller.register(xpubSocket, ZMQ.Poller.POLLIN);
LOG.info("RealtimeSubscriptionManager loop running");
while (running) {
try {
// 1. Flush any queued outbound messages before blocking
byte[][] frames;
while ((frames = publishQueue.poll()) != null) {
sendFrames(frames);
}
// 2. Wait up to 50ms for a subscription event
poller.poll(50);
// 3. Drain all available subscription events
if (poller.pollin(0)) {
byte[] event;
while ((event = xpubSocket.recv(ZMQ.DONTWAIT)) != null) {
if (event.length > 0) {
processSubscriptionEvent(event);
}
}
}
} catch (Exception e) {
if (running) {
LOG.error("Error in subscription loop", e);
}
}
}
LOG.info("RealtimeSubscriptionManager loop exited");
}
private void sendFrames(byte[][] frames) {
for (int i = 0; i < frames.length; i++) {
if (i < frames.length - 1) {
xpubSocket.sendMore(frames[i]);
} else {
xpubSocket.send(frames[i], 0);
}
}
}
private void processSubscriptionEvent(byte[] event) {
// XPUB subscription frame: first byte is 0x01 (subscribe) or 0x00 (unsubscribe);
// remaining bytes are the raw topic string.
boolean isSubscribe = event[0] == 0x01;
String topic = new String(event, 1, event.length - 1, ZMQ.CHARSET);
Matcher m = TOPIC_PATTERN.matcher(topic);
if (!m.matches()) {
// Not a realtime OHLC topic — e.g. RESPONSE: or HISTORY_READY: prefixes
LOG.debug("Ignoring subscription event for non-realtime topic: action={}, topic={}",
isSubscribe ? "subscribe" : "unsubscribe", topic);
return;
}
String ticker = m.group(1);
LOG.info("Subscription event: action={}, topic={}", isSubscribe ? "subscribe" : "unsubscribe", topic);
if (isSubscribe) {
handleSubscribe(ticker, topic);
} else {
handleUnsubscribe(ticker, topic);
}
}
private void handleSubscribe(String ticker, String topic) {
int newTopicRef = topicRefs.merge(topic, 1, Integer::sum);
LOG.debug("topicRefs[{}]={}", topic, newTopicRef);
int newTickerRef = tickerRefs.merge(ticker, 1, Integer::sum);
if (newTickerRef == 1) {
LOG.info("First subscriber for ticker={} — submitting realtime request", ticker);
broker.submitRealtimeRequest(ticker);
}
LOG.debug("tickerRefs[{}]={}", ticker, newTickerRef);
}
private void handleUnsubscribe(String ticker, String topic) {
int newTopicRef = topicRefs.merge(topic, -1, Integer::sum);
if (newTopicRef <= 0) {
topicRefs.remove(topic);
}
LOG.debug("topicRefs[{}]={}", topic, newTopicRef);
int newTickerRef = tickerRefs.merge(ticker, -1, Integer::sum);
if (newTickerRef <= 0) {
tickerRefs.remove(ticker);
LOG.info("Last subscriber for ticker={} left — stopping realtime jobs", ticker);
broker.stopRealtimeJobsForTicker(ticker);
}
LOG.debug("tickerRefs[{}]={}", ticker, newTickerRef);
}
@Override
public void close() {
stop();
}
}

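Seen from the subscriber side, triggering this lifecycle is just a plain ZMQ subscription to the {ticker}|ohlc:{period_seconds} topic. A minimal sketch that subscribes directly to Flink's MARKET_DATA_PUB XPUB (clients normally reach it through the relay; the endpoint below is an example using the default market-data port 5558):

    import org.zeromq.SocketType;
    import org.zeromq.ZContext;
    import org.zeromq.ZMQ;

    // Illustrative subscriber for the {ticker}|ohlc:{period_seconds} topic format.
    public class RealtimeBarSubscriberSketch {
        public static void main(String[] args) {
            try (ZContext ctx = new ZContext()) {
                ZMQ.Socket sub = ctx.createSocket(SocketType.SUB);
                sub.connect("tcp://flink-jobmanager:5558");        // example endpoint
                sub.subscribe("BTC/USDT.BINANCE|ohlc:60".getBytes(ZMQ.CHARSET));
                while (!Thread.currentThread().isInterrupted()) {
                    String topic = sub.recvStr();                  // topic frame
                    byte[] payload = sub.recv();                   // first payload frame (more may follow)
                    System.out.printf("bar update on %s (%d bytes)%n", topic, payload.length);
                }
            }
        }
    }

The act of subscribing is what RealtimeSubscriptionManager observes as an XPUB event: the first subscriber for a ticker starts a realtime ingestor job, and the last unsubscribe stops it.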

@@ -6,14 +6,24 @@ import org.zeromq.SocketType;
import org.zeromq.ZContext; import org.zeromq.ZContext;
import org.zeromq.ZMQ; import org.zeromq.ZMQ;
import java.util.ArrayList;
import java.util.List;
import java.util.function.Consumer;
/** /**
* Runs in the job manager. Pulls notifications from task managers (via PUSH/PULL) * Runs in the job manager. Pulls notifications from task managers (via PUSH/PULL)
* and republishes them on the MARKET_DATA_PUB socket that the relay subscribes to. * and enqueues them for publication on MARKET_DATA_PUB via the provided publish callback.
*
* The publish callback must be thread-safe (e.g., RealtimeSubscriptionManager.enqueuePublish).
* Direct socket access is avoided here because the MARKET_DATA_PUB XPUB socket is owned
* exclusively by RealtimeSubscriptionManager to satisfy ZMQ's single-thread-per-socket rule.
* *
* Flow: * Flow:
* Task manager HistoryNotificationPublisher → PUSH * Task manager HistoryNotificationPublisher → PUSH
* ↓ * ↓
* Job manager HistoryNotificationForwarder PULL → MARKET_DATA_PUB * Job manager HistoryNotificationForwarder PULL → publishCallback (queue)
* ↓ (RealtimeSubscriptionManager loop)
* MARKET_DATA_PUB
* ↓ * ↓
* Relay (XSUB) → Relay (XPUB) → Clients * Relay (XSUB) → Relay (XPUB) → Clients
*/ */
@@ -21,17 +31,17 @@ public class HistoryNotificationForwarder implements AutoCloseable {
private static final Logger LOG = LoggerFactory.getLogger(HistoryNotificationForwarder.class); private static final Logger LOG = LoggerFactory.getLogger(HistoryNotificationForwarder.class);
private final ZMQ.Socket pullSocket; private final ZMQ.Socket pullSocket;
private final ZMQ.Socket pubSocket; private final Consumer<byte[][]> publishCallback;
private final ZContext context; private final ZContext context;
private volatile boolean running = true; private volatile boolean running = true;
private Thread thread; private Thread thread;
/** /**
* @param pullPort Port to bind PULL socket on (task managers connect PUSH here) * @param pullPort Port to bind PULL socket on (task managers connect PUSH here)
* @param pubSocket Existing MARKET_DATA_PUB socket from ZmqChannelManager * @param publishCallback Thread-safe callback to enqueue outbound multi-frame messages
*/ */
public HistoryNotificationForwarder(int pullPort, ZMQ.Socket pubSocket) { public HistoryNotificationForwarder(int pullPort, Consumer<byte[][]> publishCallback) {
this.pubSocket = pubSocket; this.publishCallback = publishCallback;
this.context = new ZContext(); this.context = new ZContext();
this.pullSocket = context.createSocket(SocketType.PULL); this.pullSocket = context.createSocket(SocketType.PULL);
this.pullSocket.setRcvHWM(10000); this.pullSocket.setRcvHWM(10000);
@@ -53,32 +63,24 @@ public class HistoryNotificationForwarder implements AutoCloseable {
pullSocket.setReceiveTimeOut(200); // ms, so we can check running flag pullSocket.setReceiveTimeOut(200); // ms, so we can check running flag
while (running) { while (running) {
// Receive all frames of a multi-part message and forward to PUB
byte[] frame = pullSocket.recv(0); byte[] frame = pullSocket.recv(0);
if (frame == null) { if (frame == null) {
continue; // timeout, check running flag continue; // timeout, check running flag
} }
boolean more = pullSocket.hasReceiveMore(); // Collect all frames of the multi-part message, then enqueue atomically
if (more) { List<byte[]> frames = new ArrayList<>();
pubSocket.sendMore(frame); frames.add(frame);
} else {
pubSocket.send(frame, 0);
continue;
}
// Receive remaining frames while (pullSocket.hasReceiveMore()) {
while (more) { byte[] next = pullSocket.recv(0);
frame = pullSocket.recv(0); if (next != null) {
more = pullSocket.hasReceiveMore(); frames.add(next);
if (more) {
pubSocket.sendMore(frame);
} else {
pubSocket.send(frame, 0);
} }
} }
LOG.debug("Forwarded notification to MARKET_DATA_PUB"); publishCallback.accept(frames.toArray(new byte[0][]));
LOG.debug("Enqueued notification ({} frames) for MARKET_DATA_PUB", frames.size());
} }
LOG.info("Notification forwarder loop stopped"); LOG.info("Notification forwarder loop stopped");

View File

@@ -64,8 +64,13 @@ public class HistoryNotificationFunction extends ProcessFunction<OHLCBatchWrappe
String status = batch.getStatus(); String status = batch.getStatus();
int rowCount = batch.getRowCount(); int rowCount = batch.getRowCount();
LOG.info("Processing OHLCBatch: request_id={}, status={}, rows={}", LOG.info("Processing OHLCBatch: request_id={}, status={}, rows={}, isLastPage={}",
requestId, status, rowCount); requestId, status, rowCount, batch.isLastPage());
// Intermediate pages: data is written to Iceberg but no notification yet
if (!batch.isLastPage()) {
return;
}
// Determine Iceberg table name based on period // Determine Iceberg table name based on period
String tableName = getIcebergTableName(ticker, periodSeconds); String tableName = getIcebergTableName(ticker, periodSeconds);

View File

@@ -87,7 +87,8 @@ public class OHLCBatchDeserializer implements DeserializationSchema<OHLCBatchWra
meta.getEndTime(), meta.getEndTime(),
status, status,
meta.hasErrorMessage() ? meta.getErrorMessage() : null, meta.hasErrorMessage() ? meta.getErrorMessage() : null,
rows rows,
meta.getIsLastPage()
); );
} }

View File

@@ -19,6 +19,7 @@ public class OHLCBatchWrapper implements Serializable {
private final String status; // OK, NOT_FOUND, ERROR private final String status; // OK, NOT_FOUND, ERROR
private final String errorMessage; private final String errorMessage;
private final List<OHLCRow> rows; private final List<OHLCRow> rows;
private final boolean isLastPage;
public OHLCBatchWrapper( public OHLCBatchWrapper(
String requestId, String requestId,
@@ -29,7 +30,8 @@ public class OHLCBatchWrapper implements Serializable {
long endTime, long endTime,
String status, String status,
String errorMessage, String errorMessage,
List<OHLCRow> rows List<OHLCRow> rows,
boolean isLastPage
) { ) {
this.requestId = requestId; this.requestId = requestId;
this.clientId = clientId; this.clientId = clientId;
@@ -40,6 +42,7 @@ public class OHLCBatchWrapper implements Serializable {
this.status = status; this.status = status;
this.errorMessage = errorMessage; this.errorMessage = errorMessage;
this.rows = rows; this.rows = rows;
this.isLastPage = isLastPage;
} }
public String getRequestId() { public String getRequestId() {
@@ -94,6 +97,10 @@ public class OHLCBatchWrapper implements Serializable {
return "OK".equals(status); return "OK".equals(status);
} }
public boolean isLastPage() {
return isLastPage;
}
@Override @Override
public String toString() { public String toString() {
return "OHLCBatchWrapper{" + return "OHLCBatchWrapper{" +
@@ -103,6 +110,7 @@ public class OHLCBatchWrapper implements Serializable {
", periodSeconds=" + periodSeconds + ", periodSeconds=" + periodSeconds +
", status='" + status + '\'' + ", status='" + status + '\'' +
", rowCount=" + getRowCount() + ", rowCount=" + getRowCount() +
", isLastPage=" + isLastPage +
'}'; '}';
} }

View File

@@ -0,0 +1,73 @@
package com.dexorder.flink.publisher;
import java.io.Serializable;
/**
* A single completed OHLC bar for a given ticker and period.
* Output type of RealtimeBarFunction, input type of RealtimeBarPublisher.
*/
public class RealtimeBar implements Serializable {
private static final long serialVersionUID = 1L;
private String ticker;
/** Period in seconds (e.g., 60, 300, 3600) */
private int periodSeconds;
/** Window start timestamp in milliseconds since epoch */
private long windowStartMs;
/** Scaled integer price values (same precision as source Tick) */
private long open;
private long high;
private long low;
private long close;
/** Summed base amount across ticks in this window */
private long volume;
/** Number of ticks in this window */
private int tickCount;
public RealtimeBar() {}
public RealtimeBar(String ticker, int periodSeconds, long windowStartMs,
long open, long high, long low, long close, long volume, int tickCount) {
this.ticker = ticker;
this.periodSeconds = periodSeconds;
this.windowStartMs = windowStartMs;
this.open = open;
this.high = high;
this.low = low;
this.close = close;
this.volume = volume;
this.tickCount = tickCount;
}
public String getTicker() { return ticker; }
public int getPeriodSeconds() { return periodSeconds; }
public long getWindowStartMs() { return windowStartMs; }
public long getOpen() { return open; }
public long getHigh() { return high; }
public long getLow() { return low; }
public long getClose() { return close; }
public long getVolume() { return volume; }
public int getTickCount() { return tickCount; }
public void setTicker(String ticker) { this.ticker = ticker; }
public void setPeriodSeconds(int periodSeconds) { this.periodSeconds = periodSeconds; }
public void setWindowStartMs(long windowStartMs) { this.windowStartMs = windowStartMs; }
public void setOpen(long open) { this.open = open; }
public void setHigh(long high) { this.high = high; }
public void setLow(long low) { this.low = low; }
public void setClose(long close) { this.close = close; }
public void setVolume(long volume) { this.volume = volume; }
public void setTickCount(int tickCount) { this.tickCount = tickCount; }
/** ZMQ topic for this bar: e.g., "BTC/USDT.BINANCE|ohlc:60" */
public String topic() {
return ticker + "|ohlc:" + periodSeconds;
}
@Override
public String toString() {
return "RealtimeBar{ticker='" + ticker + "', period=" + periodSeconds +
"s, windowStart=" + windowStartMs + ", O=" + open + " H=" + high +
" L=" + low + " C=" + close + ", ticks=" + tickCount + '}';
}
}

View File

@@ -0,0 +1,116 @@
package com.dexorder.flink.publisher;
import org.apache.flink.api.common.functions.RichFlatMapFunction;
import org.apache.flink.api.common.state.MapState;
import org.apache.flink.api.common.state.MapStateDescriptor;
import org.apache.flink.api.common.typeinfo.BasicTypeInfo;
import org.apache.flink.api.common.typeinfo.PrimitiveArrayTypeInfo;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.util.Collector;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Accumulates ticks into OHLC bars for each configured period.
*
* Keyed by ticker. Maintains per-period accumulators in MapState.
* Uses a "lazy boundary" approach: a new window is detected when a tick arrives after
* the previous window's end time (based on processing clock). The completed bar is
* emitted immediately when the boundary is crossed, so bars are delayed by at most
* one tick interval (~10s for realtime polling).
*
* Periods are configurable at construction time. All configured periods are computed
* for every ticker receiving ticks; the ZMQ publisher filters to active subscriptions.
*
* Accumulator layout (long[7]):
* [0] open
* [1] high
* [2] low
* [3] close
* [4] volume (sum of base amount)
* [5] windowStartMs (epoch ms)
* [6] tickCount
*/
public class RealtimeBarFunction extends RichFlatMapFunction<TickWrapper, RealtimeBar> {
private static final Logger LOG = LoggerFactory.getLogger(RealtimeBarFunction.class);
private static final long serialVersionUID = 1L;
private final int[] periods;
private transient MapState<Integer, long[]> accumState;
/**
* @param periods Period lengths in seconds (e.g., 60, 300, 900, 3600)
*/
public RealtimeBarFunction(int[] periods) {
this.periods = periods;
}
@Override
public void open(Configuration parameters) {
MapStateDescriptor<Integer, long[]> desc = new MapStateDescriptor<>(
"ohlcAccum",
BasicTypeInfo.INT_TYPE_INFO,
PrimitiveArrayTypeInfo.LONG_PRIMITIVE_ARRAY_TYPE_INFO
);
accumState = getRuntimeContext().getMapState(desc);
}
@Override
public void flatMap(TickWrapper tick, Collector<RealtimeBar> out) throws Exception {
if (tick == null) return;
long nowMs = System.currentTimeMillis();
for (int period : periods) {
long periodMs = period * 1000L;
long windowStart = (nowMs / periodMs) * periodMs;
long[] accum = accumState.get(period);
if (accum == null) {
// First tick for this period
accumState.put(period, openWindow(tick, windowStart));
} else if (accum[5] != windowStart) {
// Window boundary crossed — emit completed bar then start fresh
if (accum[6] > 0) {
out.collect(toBar(tick.getTicker(), period, accum));
LOG.debug("Emitted bar: ticker={}, period={}s, windowStart={}, ticks={}",
tick.getTicker(), period, accum[5], accum[6]);
}
accumState.put(period, openWindow(tick, windowStart));
} else {
// Same window — update
accum[1] = Math.max(accum[1], tick.getPrice()); // high
accum[2] = Math.min(accum[2], tick.getPrice()); // low
accum[3] = tick.getPrice(); // close
accum[4] += tick.getAmount(); // volume
accum[6]++; // tick count
accumState.put(period, accum);
}
}
}
private static long[] openWindow(TickWrapper tick, long windowStart) {
return new long[]{
tick.getPrice(), // open
tick.getPrice(), // high
tick.getPrice(), // low
tick.getPrice(), // close
tick.getAmount(), // volume
windowStart,
1L // tickCount
};
}
private static RealtimeBar toBar(String ticker, int period, long[] accum) {
return new RealtimeBar(
ticker, period,
accum[5], // windowStartMs
accum[0], accum[1], accum[2], accum[3], // O H L C
accum[4], // volume
(int) accum[6] // tickCount
);
}
}
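
The window arithmetic behind the lazy boundary is just integer division on the processing-time clock; the previous accumulator is emitted the moment the computed window start changes. A small worked sketch (TypeScript, values illustrative):

// Window start for a processing-time instant, for a given period in seconds.
function windowStartMs(nowMs: number, periodSeconds: number): number {
  const periodMs = periodSeconds * 1000;
  return Math.floor(nowMs / periodMs) * periodMs;
}

// Example with a 60s period:
//   tick at 12:00:59.500 -> window start 12:00:00
//   tick at 12:01:03.200 -> window start 12:01:00 (boundary crossed)
// The bar for 12:00:00 is emitted when the 12:01:03 tick arrives, so a bar can
// lag the wall-clock boundary by up to one tick interval.
const a = windowStartMs(Date.parse('2026-04-13T12:00:59.500Z'), 60);
const b = windowStartMs(Date.parse('2026-04-13T12:01:03.200Z'), 60);
console.log(new Date(a).toISOString(), new Date(b).toISOString(), a !== b);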

View File

@@ -0,0 +1,91 @@
package com.dexorder.flink.publisher;
import com.dexorder.proto.OHLC;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.functions.sink.RichSinkFunction;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.zeromq.SocketType;
import org.zeromq.ZContext;
import org.zeromq.ZMQ;
/**
* Flink sink that publishes completed realtime OHLC bars to clients.
*
* Connects a ZMQ PUSH socket to the job manager's notification PULL endpoint.
* The HistoryNotificationForwarder (already running on the job manager) receives these
* frames and enqueues them to RealtimeSubscriptionManager, which publishes them on
* the MARKET_DATA_PUB XPUB socket. Clients subscribed to the matching topic receive the bar.
*
* Wire format (matches HistoryNotificationPublisher):
* Frame 1: topic bytes (e.g., "BTC/USDT.BINANCE|ohlc:60")
* Frame 2: [0x01] (protocol version)
* Frame 3: [0x04][OHLC protobuf bytes] (type 0x04 = OHLC single bar)
*
* Parallelism MUST be 1 (same as the rest of the notification pipeline).
*/
public class RealtimeBarPublisher extends RichSinkFunction<RealtimeBar> {
private static final Logger LOG = LoggerFactory.getLogger(RealtimeBarPublisher.class);
private static final long serialVersionUID = 1L;
private static final byte PROTOCOL_VERSION = 0x01;
private static final byte MSG_TYPE_OHLC = 0x04;
private final String jobManagerPullEndpoint;
private transient ZContext context;
private transient ZMQ.Socket pushSocket;
public RealtimeBarPublisher(String jobManagerPullEndpoint) {
this.jobManagerPullEndpoint = jobManagerPullEndpoint;
}
@Override
public void open(Configuration parameters) {
context = new ZContext();
pushSocket = context.createSocket(SocketType.PUSH);
pushSocket.setLinger(1000);
pushSocket.setSndHWM(10000);
pushSocket.connect(jobManagerPullEndpoint);
LOG.info("RealtimeBarPublisher PUSH connected to {}", jobManagerPullEndpoint);
}
@Override
public void invoke(RealtimeBar bar, Context context) {
try {
// Build OHLC proto — timestamp in nanoseconds (bar uses ms, convert)
OHLC ohlc = OHLC.newBuilder()
.setTimestamp(bar.getWindowStartMs() * 1_000_000L) // ms → ns
.setTicker(bar.getTicker())
.setOpen(bar.getOpen())
.setHigh(bar.getHigh())
.setLow(bar.getLow())
.setClose(bar.getClose())
.setVolume(bar.getVolume())
.build();
byte[] protoBytes = ohlc.toByteArray();
byte[] messageFrame = new byte[1 + protoBytes.length];
messageFrame[0] = MSG_TYPE_OHLC;
System.arraycopy(protoBytes, 0, messageFrame, 1, protoBytes.length);
String topic = bar.topic();
pushSocket.sendMore(topic.getBytes(ZMQ.CHARSET));
pushSocket.sendMore(new byte[]{PROTOCOL_VERSION});
pushSocket.send(messageFrame, 0);
LOG.debug("Published realtime bar: topic={}, ticks={}", topic, bar.getTickCount());
} catch (Exception e) {
LOG.error("Failed to publish realtime bar: ticker={}, period={}",
bar.getTicker(), bar.getPeriodSeconds(), e);
}
}
@Override
public void close() {
if (pushSocket != null) pushSocket.close();
if (context != null) context.close();
LOG.info("RealtimeBarPublisher closed");
}
}
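
The three-frame layout matters because the relay and gateway route purely on the first frame and type-check on the first byte of the last frame, without parsing the protobuf. A hedged sketch of the same layout in TypeScript (the serialized OHLC payload is assumed to come from elsewhere):

const PROTOCOL_VERSION = 0x01;
const MSG_TYPE_OHLC = 0x04;

// Build the three ZMQ frames for one bar. `ohlcBytes` is assumed to be the
// OHLC protobuf serialized elsewhere.
function buildBarFrames(ticker: string, periodSeconds: number, ohlcBytes: Uint8Array): Buffer[] {
  const topic = `${ticker}|ohlc:${periodSeconds}`; // e.g. "BTC/USDT.BINANCE|ohlc:60"
  const message = Buffer.concat([Buffer.from([MSG_TYPE_OHLC]), Buffer.from(ohlcBytes)]);
  return [Buffer.from(topic), Buffer.from([PROTOCOL_VERSION]), message];
}

Putting the topic first is what lets ZMQ prefix subscriptions filter bars per ticker and period.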

View File

@@ -0,0 +1,69 @@
package com.dexorder.flink.publisher;
import com.dexorder.proto.Tick;
import org.apache.flink.api.common.serialization.DeserializationSchema;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
/**
* Kafka deserializer for Tick protobuf messages from the market-tick topic.
*
* Wire format: [0x01 version][0x03 TICK type][Tick protobuf bytes]
*/
public class TickDeserializer implements DeserializationSchema<TickWrapper> {
private static final Logger LOG = LoggerFactory.getLogger(TickDeserializer.class);
private static final long serialVersionUID = 1L;
private static final byte PROTOCOL_VERSION = 0x01;
private static final byte MSG_TYPE_TICK = 0x03;
@Override
public TickWrapper deserialize(byte[] message) throws IOException {
try {
if (message.length < 2) {
throw new IOException("Message too short: " + message.length + " bytes");
}
if (message[0] != PROTOCOL_VERSION) {
throw new IOException("Unsupported protocol version: 0x" + Integer.toHexString(message[0] & 0xFF));
}
if (message[1] != MSG_TYPE_TICK) {
throw new IOException("Unexpected message type: 0x" + Integer.toHexString(message[1] & 0xFF));
}
byte[] payload = new byte[message.length - 2];
System.arraycopy(message, 2, payload, 0, payload.length);
Tick tick = Tick.parseFrom(payload);
return new TickWrapper(
tick.getTicker(),
tick.getTradeId(),
tick.getTimestamp(),
tick.getPrice(),
tick.getAmount(),
tick.getQuoteAmount(),
tick.getTakerBuy()
);
} catch (Exception e) {
LOG.warn("Failed to deserialize Tick, skipping: {}", e.getMessage());
// Return null; Flink's KafkaSource skips nulls via filter
return null;
}
}
@Override
public boolean isEndOfStream(TickWrapper nextElement) {
return false;
}
@Override
public TypeInformation<TickWrapper> getProducedType() {
return TypeInformation.of(TickWrapper.class);
}
}

View File

@@ -0,0 +1,58 @@
package com.dexorder.flink.publisher;
import java.io.Serializable;
/**
* Flink-serializable wrapper for a single Tick.
* Fields mirror the Tick protobuf, using primitives to avoid proto-class serialization issues.
*/
public class TickWrapper implements Serializable {
private static final long serialVersionUID = 1L;
private String ticker;
private String tradeId;
/** Timestamp in nanoseconds since epoch */
private long timestamp;
/** Price as scaled integer */
private long price;
/** Base amount as scaled integer */
private long amount;
/** Quote amount as scaled integer */
private long quoteAmount;
private boolean takerBuy;
public TickWrapper() {}
public TickWrapper(String ticker, String tradeId, long timestamp,
long price, long amount, long quoteAmount, boolean takerBuy) {
this.ticker = ticker;
this.tradeId = tradeId;
this.timestamp = timestamp;
this.price = price;
this.amount = amount;
this.quoteAmount = quoteAmount;
this.takerBuy = takerBuy;
}
public String getTicker() { return ticker; }
public String getTradeId() { return tradeId; }
public long getTimestamp() { return timestamp; }
public long getPrice() { return price; }
public long getAmount() { return amount; }
public long getQuoteAmount() { return quoteAmount; }
public boolean isTakerBuy() { return takerBuy; }
public void setTicker(String ticker) { this.ticker = ticker; }
public void setTradeId(String tradeId) { this.tradeId = tradeId; }
public void setTimestamp(long timestamp) { this.timestamp = timestamp; }
public void setPrice(long price) { this.price = price; }
public void setAmount(long amount) { this.amount = amount; }
public void setQuoteAmount(long quoteAmount) { this.quoteAmount = quoteAmount; }
public void setTakerBuy(boolean takerBuy) { this.takerBuy = takerBuy; }
@Override
public String toString() {
return "TickWrapper{ticker='" + ticker + "', tradeId='" + tradeId +
"', timestamp=" + timestamp + ", price=" + price + '}';
}
}

View File

@@ -13,7 +13,14 @@ import java.util.Map;
/** /**
* Manages all ZeroMQ channels for the Flink application. * Manages all ZeroMQ channels for the Flink application.
* Each channel is bound to a specific port and socket type. *
* Port layout:
* 5558 XPUB MARKET_DATA_PUB — market data + notifications to clients (via relay XSUB)
* XPUB exposes subscription frames so Flink can detect
* which realtime topics clients are interested in.
* 5561 PULL (internal) — task manager → job manager notifications (unchanged)
* 5566 PULL CLIENT_REQUEST — receives forwarded SubmitHistoricalRequest from relay PUSH
* 5567 ROUTER INGESTOR_BROKER — exclusive work queue; ingestors connect with DEALER
*/ */
public class ZmqChannelManager implements Closeable { public class ZmqChannelManager implements Closeable {
private static final Logger LOG = LoggerFactory.getLogger(ZmqChannelManager.class); private static final Logger LOG = LoggerFactory.getLogger(ZmqChannelManager.class);
@@ -23,8 +30,9 @@ public class ZmqChannelManager implements Closeable {
private final AppConfig config; private final AppConfig config;
public enum Channel { public enum Channel {
INGESTOR_WORK_QUEUE,
MARKET_DATA_PUB, MARKET_DATA_PUB,
CLIENT_REQUEST,
INGESTOR_BROKER,
} }
public ZmqChannelManager(AppConfig config) { public ZmqChannelManager(AppConfig config) {
@@ -41,20 +49,33 @@ public class ZmqChannelManager implements Closeable {
LOG.info("Initializing ZeroMQ channels on {}", bindAddress); LOG.info("Initializing ZeroMQ channels on {}", bindAddress);
// 1. Ingestor Work Queue - PUB socket for topic-based work distribution (exchange prefix filtering) // 1. Market Data Publication — XPUB so subscription events are visible to Flink
// Relay's XSUB connects here to proxy data to clients.
// Subscription frames from relay (forwarded from clients) arrive as readable messages.
ZMQ.Socket marketDataSocket = context.createSocket(SocketType.XPUB);
marketDataSocket.setXpubVerbose(true); // emit every sub/unsub, not just first/last
marketDataSocket.setLinger(1000);
marketDataSocket.setSndHWM(10000);
marketDataSocket.setRcvHWM(10000);
String marketDataEndpoint = bindAddress + ":" + config.getMarketDataPubPort();
marketDataSocket.bind(marketDataEndpoint);
sockets.put(Channel.MARKET_DATA_PUB.name(), marketDataSocket);
LOG.info("Bound Market Data Publication (XPUB) to {}", marketDataEndpoint);
// 2. Client Request Pull — receives SubmitHistoricalRequest forwarded by relay PUSH
createAndBind( createAndBind(
Channel.INGESTOR_WORK_QUEUE, Channel.CLIENT_REQUEST,
SocketType.PUB, SocketType.PULL,
bindAddress + ":" + config.getIngestorWorkQueuePort(), bindAddress + ":" + config.getFlinkRequestPullPort(),
"Ingestor Work Queue (PUB)" "Client Request (PULL)"
); );
// 2. Market Data Publication - PUB socket for market data streaming and HistoryReadyNotification // 3. Ingestor Broker — ROUTER for exclusive work dispatch to ingestor DEALER workers
createAndBind( createAndBind(
Channel.MARKET_DATA_PUB, Channel.INGESTOR_BROKER,
SocketType.PUB, SocketType.ROUTER,
bindAddress + ":" + config.getMarketDataPubPort(), bindAddress + ":" + config.getIngestorBrokerPort(),
"Market Data Publication (PUB)" "Ingestor Broker (ROUTER)"
); );
LOG.info("All ZeroMQ channels initialized successfully"); LOG.info("All ZeroMQ channels initialized successfully");
@@ -63,15 +84,10 @@ public class ZmqChannelManager implements Closeable {
private void createAndBind(Channel channel, SocketType socketType, String endpoint, String description) { private void createAndBind(Channel channel, SocketType socketType, String endpoint, String description) {
try { try {
ZMQ.Socket socket = context.createSocket(socketType); ZMQ.Socket socket = context.createSocket(socketType);
socket.setLinger(1000);
// Set socket options socket.setSndHWM(10000);
socket.setLinger(1000); // 1 second linger on close socket.setRcvHWM(10000);
socket.setSndHWM(10000); // High water mark for outbound messages
socket.setRcvHWM(10000); // High water mark for inbound messages
// Bind the socket
socket.bind(endpoint); socket.bind(endpoint);
sockets.put(channel.name(), socket); sockets.put(channel.name(), socket);
LOG.info("Bound {} to {}", description, endpoint); LOG.info("Bound {} to {}", description, endpoint);
} catch (Exception e) { } catch (Exception e) {
@@ -80,6 +96,13 @@ public class ZmqChannelManager implements Closeable {
} }
} }
/**
* Create a ZMQ Poller backed by this manager's context.
*/
public ZMQ.Poller createPoller(int size) {
return context.getContext().poller(size);
}
/** /**
* Get a socket by channel type. * Get a socket by channel type.
*/ */
@@ -92,18 +115,11 @@ public class ZmqChannelManager implements Closeable {
} }
/** /**
* Send a message on the specified channel. * Send a message on a channel (no topic prefix — for PULL/PUSH or direct sends).
*
* @param channel The channel to send on
* @param versionByte Protocol version byte
* @param messageTypeByte Message type ID byte
* @param protobufData Serialized protobuf message
* @return true if sent successfully
*/ */
public boolean sendMessage(Channel channel, byte versionByte, byte messageTypeByte, byte[] protobufData) { public boolean sendMessage(Channel channel, byte versionByte, byte messageTypeByte, byte[] protobufData) {
ZMQ.Socket socket = getSocket(channel); ZMQ.Socket socket = getSocket(channel);
// Send as two frames: [version byte] [type byte + protobuf data]
boolean sentFrame1 = socket.send(new byte[]{versionByte}, ZMQ.SNDMORE); boolean sentFrame1 = socket.send(new byte[]{versionByte}, ZMQ.SNDMORE);
if (!sentFrame1) { if (!sentFrame1) {
LOG.error("Failed to send version frame on channel {}", channel); LOG.error("Failed to send version frame on channel {}", channel);
@@ -124,27 +140,18 @@ public class ZmqChannelManager implements Closeable {
} }
/** /**
* Send a message with a topic prefix (for PUB sockets). * Send a topic-prefixed message (for XPUB market data publishing).
* * Frame layout: [topic][version][type+payload]
* @param channel The channel to send on
* @param topic Topic string for subscription filtering
* @param versionByte Protocol version byte
* @param messageTypeByte Message type ID byte
* @param protobufData Serialized protobuf message
* @return true if sent successfully
*/ */
public boolean sendTopicMessage(Channel channel, String topic, byte versionByte, byte messageTypeByte, byte[] protobufData) { public boolean sendTopicMessage(Channel channel, String topic, byte versionByte, byte messageTypeByte, byte[] protobufData) {
ZMQ.Socket socket = getSocket(channel); ZMQ.Socket socket = getSocket(channel);
// Send as three frames: [topic] [version byte] [type byte + protobuf data] if (!socket.send(topic.getBytes(ZMQ.CHARSET), ZMQ.SNDMORE)) {
boolean sentTopic = socket.send(topic.getBytes(ZMQ.CHARSET), ZMQ.SNDMORE);
if (!sentTopic) {
LOG.error("Failed to send topic frame on channel {}", channel); LOG.error("Failed to send topic frame on channel {}", channel);
return false; return false;
} }
boolean sentFrame1 = socket.send(new byte[]{versionByte}, ZMQ.SNDMORE); if (!socket.send(new byte[]{versionByte}, ZMQ.SNDMORE)) {
if (!sentFrame1) {
LOG.error("Failed to send version frame on channel {}", channel); LOG.error("Failed to send version frame on channel {}", channel);
return false; return false;
} }
@@ -153,8 +160,7 @@ public class ZmqChannelManager implements Closeable {
frame2[0] = messageTypeByte; frame2[0] = messageTypeByte;
System.arraycopy(protobufData, 0, frame2, 1, protobufData.length); System.arraycopy(protobufData, 0, frame2, 1, protobufData.length);
boolean sentFrame2 = socket.send(frame2, 0); if (!socket.send(frame2, 0)) {
if (!sentFrame2) {
LOG.error("Failed to send message frame on channel {}", channel); LOG.error("Failed to send message frame on channel {}", channel);
return false; return false;
} }
@@ -162,6 +168,24 @@ public class ZmqChannelManager implements Closeable {
return true; return true;
} }
/**
* Send a targeted message to a specific DEALER worker via ROUTER.
* Frame layout: [identity][empty][version][type+payload]
*/
public boolean sendToWorker(byte[] identity, byte versionByte, byte messageTypeByte, byte[] protobufData) {
ZMQ.Socket socket = getSocket(Channel.INGESTOR_BROKER);
if (!socket.send(identity, ZMQ.SNDMORE)) return false;
if (!socket.send(new byte[0], ZMQ.SNDMORE)) return false;
if (!socket.send(new byte[]{versionByte}, ZMQ.SNDMORE)) return false;
byte[] frame = new byte[1 + protobufData.length];
frame[0] = messageTypeByte;
System.arraycopy(protobufData, 0, frame, 1, protobufData.length);
return socket.send(frame, 0);
}
@Override @Override
public void close() { public void close() {
LOG.info("Closing ZeroMQ channels"); LOG.info("Closing ZeroMQ channels");

View File

@@ -72,7 +72,7 @@ export class Authenticator {
); );
} }
const sessionId = `ws_${userId}_${Date.now()}`; const sessionId = `ws_${userId}`;
return { return {
authContext: { authContext: {

View File

@@ -2,12 +2,14 @@ import type { FastifyInstance, FastifyRequest } from 'fastify';
import type { WebSocket } from '@fastify/websocket'; import type { WebSocket } from '@fastify/websocket';
import type { Authenticator } from '../auth/authenticator.js'; import type { Authenticator } from '../auth/authenticator.js';
import type { AgentHarness, HarnessFactory } from '../harness/agent-harness.js'; import type { AgentHarness, HarnessFactory } from '../harness/agent-harness.js';
import type { HarnessEvent } from '../harness/harness-events.js';
import type { InboundMessage } from '../types/messages.js'; import type { InboundMessage } from '../types/messages.js';
import { randomUUID } from 'crypto'; import { randomUUID } from 'crypto';
import type { SessionRegistry, EventSubscriber, Session } from '../events/index.js'; import type { SessionRegistry, EventSubscriber, Session } from '../events/index.js';
import type { OHLCService } from '../services/ohlc-service.js'; import type { OHLCService, BarUpdateCallback } from '../services/ohlc-service.js';
import type { SymbolIndexService } from '../services/symbol-index-service.js'; import type { SymbolIndexService } from '../services/symbol-index-service.js';
import type { ContainerManager } from '../k8s/container-manager.js'; import type { ContainerManager } from '../k8s/container-manager.js';
import type { ConversationService } from '../services/conversation-service.js';
import { import {
WorkspaceManager, WorkspaceManager,
ContainerSync, ContainerSync,
@@ -42,6 +44,7 @@ export interface WebSocketHandlerConfig {
createHarness: HarnessFactory; createHarness: HarnessFactory;
ohlcService?: OHLCService; // Optional for historical data support ohlcService?: OHLCService; // Optional for historical data support
symbolIndexService?: SymbolIndexService; // Optional for symbol search symbolIndexService?: SymbolIndexService; // Optional for symbol search
conversationService?: ConversationService; // Optional for history replay on reconnect
} }
/** /**
@@ -50,10 +53,18 @@ export interface WebSocketHandlerConfig {
* Handles WebSocket connections for chat and integrates with the event system * Handles WebSocket connections for chat and integrates with the event system
* for container-to-client notifications. * for container-to-client notifications.
*/ */
interface BarSubscription {
ticker: string;
periodSeconds: number;
callback: BarUpdateCallback;
}
export class WebSocketHandler { export class WebSocketHandler {
private config: WebSocketHandlerConfig; private config: WebSocketHandlerConfig;
private harnesses = new Map<string, AgentHarness>(); private harnesses = new Map<string, AgentHarness>();
private workspaces = new Map<string, WorkspaceManager>(); private workspaces = new Map<string, WorkspaceManager>();
/** Per-session realtime bar subscriptions for cleanup on disconnect */
private barSubscriptions = new Map<string, BarSubscription[]>();
constructor(config: WebSocketHandlerConfig) { constructor(config: WebSocketHandlerConfig) {
this.config = config; this.config = config;
@@ -106,17 +117,22 @@ export class WebSocketHandler {
// If container is spinning up, wait for it to be ready before continuing // If container is spinning up, wait for it to be ready before continuing
if (isSpinningUp) { if (isSpinningUp) {
sendStatus(socket, 'spinning_up', 'Your workspace is starting up, please wait...'); sendStatus(socket, 'spinning_up', 'Your personal agent is starting up, please wait...');
const startupPingInterval = setInterval(() => {
if (socket.readyState === 1) socket.ping();
}, 10000);
const ready = await this.config.containerManager.waitForContainerReady(authContext.userId, 120000); const ready = await this.config.containerManager.waitForContainerReady(authContext.userId, 120000);
clearInterval(startupPingInterval);
if (!ready) { if (!ready) {
logger.warn({ userId: authContext.userId }, 'Container failed to become ready within timeout'); logger.warn({ userId: authContext.userId }, 'Sandbox failed to become ready within timeout');
socket.send(JSON.stringify({ type: 'error', message: 'Workspace failed to start. Please try again later.' })); socket.send(JSON.stringify({ type: 'error', message: 'Agent workspace failed to start. Please try again later.' }));
socket.close(1011, 'Container startup timeout'); socket.close(1011, 'Container startup timeout');
return; return;
} }
logger.info({ userId: authContext.userId }, 'Container is ready, proceeding with session setup'); logger.info({ userId: authContext.userId }, 'Sandbox is ready, proceeding with session setup');
} }
sendStatus(socket, 'initializing', 'Starting your workspace...'); sendStatus(socket, 'initializing', 'Starting your workspace...');
@@ -241,6 +257,17 @@ export class WebSocketHandler {
}) })
); );
// Replay conversation history so the UI pre-populates on reconnect
if (this.config.conversationService) {
const history = await this.config.conversationService.getHistory(
authContext.userId,
authContext.sessionId
);
if (history.length > 0) {
socket.send(JSON.stringify({ type: 'conversation_history', messages: history }));
}
}
// Handle messages // Handle messages
socket.on('message', async (data: Buffer) => { socket.on('message', async (data: Buffer) => {
try { try {
@@ -266,15 +293,45 @@ export class WebSocketHandler {
return; return;
} }
// Chunks are streamed via channelAdapter.sendChunk() during handleMessage
try { try {
// Acknowledge receipt immediately so the client can show the seen indicator // Acknowledge receipt immediately so the client can show the seen indicator
socket.send(JSON.stringify({ type: 'agent_chunk', content: '', done: false })); socket.send(JSON.stringify({ type: 'agent_chunk', content: '', done: false }));
logger.info('Calling harness.handleMessage'); logger.info('Streaming harness response');
await harness.handleMessage(inboundMessage); let fatalError = false;
for await (const event of harness.streamMessage(inboundMessage)) {
const e = event as HarnessEvent;
switch (e.type) {
case 'chunk':
socket.send(JSON.stringify({ type: 'agent_chunk', content: e.content, done: false }));
break;
case 'tool_call':
socket.send(JSON.stringify({ type: 'agent_tool_call', toolName: e.toolName, label: e.label }));
break;
case 'subagent_tool_call':
socket.send(JSON.stringify({ type: 'subagent_tool_call', agentName: e.agentName, toolName: e.toolName, label: e.label }));
break;
case 'subagent_chunk':
socket.send(JSON.stringify({ type: 'subagent_chunk', agentName: e.agentName, content: e.content }));
break;
case 'image':
socket.send(JSON.stringify({ type: 'image', data: e.data, mimeType: e.mimeType, caption: e.caption }));
break;
case 'error':
socket.send(JSON.stringify({ type: 'text', text: `An unrecoverable error occurred in the ${e.source}.` }));
if (e.fatal) fatalError = true;
break;
case 'done':
break;
}
}
// Send done marker after all chunks have been streamed if (fatalError) {
socket.close(1011, 'Fatal error');
return;
}
// Send done marker after all events have been streamed
logger.debug('Sending done marker to client'); logger.debug('Sending done marker to client');
socket.send( socket.send(
JSON.stringify({ JSON.stringify({
@@ -332,6 +389,17 @@ export class WebSocketHandler {
await this.config.eventSubscriber.onSessionDisconnect(removedSession); await this.config.eventSubscriber.onSessionDisconnect(removedSession);
} }
// Cleanup realtime bar subscriptions
const sessionId = authContext.sessionId;
const subs = this.barSubscriptions.get(sessionId);
if (subs && this.config.ohlcService) {
for (const { ticker, periodSeconds, callback } of subs) {
this.config.ohlcService.unsubscribeFromTicker(ticker, periodSeconds, callback);
}
this.barSubscriptions.delete(sessionId);
logger.info({ sessionId, count: subs.length }, 'Cleaned up realtime bar subscriptions');
}
// Cleanup workspace // Cleanup workspace
await workspace!.shutdown(); await workspace!.shutdown();
this.workspaces.delete(authContext.sessionId); this.workspaces.delete(authContext.sessionId);
@@ -356,6 +424,7 @@ export class WebSocketHandler {
}, 30000); }, 30000);
} catch (error) { } catch (error) {
logger.error({ error }, 'Failed to initialize session'); logger.error({ error }, 'Failed to initialize session');
socket.send(JSON.stringify({ type: 'text', text: 'An unrecoverable error occurred in the agent harness.' }));
socket.close(1011, 'Internal server error'); socket.close(1011, 'Internal server error');
if (workspace) { if (workspace) {
await workspace.shutdown(); await workspace.shutdown();
@@ -527,19 +596,92 @@ export class WebSocketHandler {
break; break;
} }
case 'subscribe_bars': case 'subscribe_bars': {
case 'unsubscribe_bars': if (!ohlcService || !authContext) {
// TODO: Implement real-time subscriptions socket.send(JSON.stringify({
socket.send( type: 'subscribe_bars_response',
JSON.stringify({
type: `${payload.type}_response`,
request_id: requestId, request_id: requestId,
subscription_id: payload.subscription_id, subscription_id: payload.subscription_id,
success: false, success: false,
message: 'Real-time subscriptions not yet implemented', message: 'Realtime service not available',
}) }));
); break;
}
const subTicker: string = payload.symbol;
const subPeriod: number = payload.period_seconds ?? payload.resolution ?? 60;
const sessionId = authContext.sessionId;
// Create a per-subscription callback that forwards bars to this socket
const barCallback: BarUpdateCallback = (bar) => {
if (socket.readyState !== 1 /* OPEN */) return;
socket.send(JSON.stringify({
type: 'bar_update',
subscription_id: payload.subscription_id,
ticker: bar.ticker,
period_seconds: bar.periodSeconds,
bar: {
// Convert nanoseconds → seconds for client compatibility
time: Number(bar.timestamp / 1_000_000_000n),
open: bar.open,
high: bar.high,
low: bar.low,
close: bar.close,
volume: bar.volume,
},
}));
};
ohlcService.subscribeToTicker(subTicker, subPeriod, barCallback);
// Track for cleanup on disconnect
if (!this.barSubscriptions.has(sessionId)) {
this.barSubscriptions.set(sessionId, []);
}
this.barSubscriptions.get(sessionId)!.push({
ticker: subTicker,
periodSeconds: subPeriod,
callback: barCallback,
});
logger.info({ sessionId, ticker: subTicker, period: subPeriod }, 'Subscribed to realtime bars');
socket.send(JSON.stringify({
type: 'subscribe_bars_response',
request_id: requestId,
subscription_id: payload.subscription_id,
success: true,
}));
break; break;
}
case 'unsubscribe_bars': {
if (!ohlcService || !authContext) break;
const unsubTicker: string = payload.symbol;
const unsubPeriod: number = payload.period_seconds ?? payload.resolution ?? 60;
const sessionId = authContext.sessionId;
const subs = this.barSubscriptions.get(sessionId);
if (subs) {
const idx = subs.findIndex(
s => s.ticker === unsubTicker && s.periodSeconds === unsubPeriod
);
if (idx >= 0) {
const [removed] = subs.splice(idx, 1);
ohlcService.unsubscribeFromTicker(unsubTicker, unsubPeriod, removed.callback);
logger.info({ sessionId, ticker: unsubTicker, period: unsubPeriod }, 'Unsubscribed from realtime bars');
}
}
socket.send(JSON.stringify({
type: 'unsubscribe_bars_response',
request_id: requestId,
subscription_id: payload.subscription_id,
success: true,
}));
break;
}
case 'evaluate_indicator': { case 'evaluate_indicator': {
// Direct MCP call — bypasses the agent/LLM for performance // Direct MCP call — bypasses the agent/LLM for performance
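
From the client's side, the realtime path is plain JSON over the existing WebSocket: subscribe_bars opens a subscription, bar_update messages stream in, and unsubscribe_bars tears it down. A hedged browser-side sketch using the field names from the handler above (the endpoint URL is a placeholder):

// Browser-side sketch; the gateway URL is a placeholder.
const ws = new WebSocket('wss://gateway.example/ws');

ws.onopen = () => {
  ws.send(JSON.stringify({
    type: 'subscribe_bars',
    request_id: 'req-1',      // echoed back in subscribe_bars_response
    subscription_id: 'sub-1',
    symbol: 'BTC/USDT.BINANCE',
    period_seconds: 60,
  }));
};

ws.onmessage = (ev) => {
  const msg = JSON.parse(ev.data as string);
  if (msg.type === 'bar_update' && msg.subscription_id === 'sub-1') {
    // bar.time is in seconds; the gateway converts from nanoseconds before sending
    console.log(msg.ticker, msg.bar.time, msg.bar.close, msg.bar.volume);
  }
};

// Tear down when the chart closes:
// ws.send(JSON.stringify({ type: 'unsubscribe_bars', request_id: 'req-2',
//   subscription_id: 'sub-1', symbol: 'BTC/USDT.BINANCE', period_seconds: 60 }));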

View File

@@ -632,6 +632,118 @@ export class DuckDBClient {
} }
} }
/**
* Append a batch of image/audio blobs as a Parquet file in S3.
* Called once per assistant turn that produces binary output.
*/
async appendBlobs(
userId: string,
sessionId: string,
messageId: string,
blobs: Array<{
id: string;
user_id: string;
session_id: string;
message_id: string;
blob_type: string;
mime_type: string;
data: string;
caption: string | null;
timestamp: number;
}>
): Promise<void> {
await this.initialize();
if (!this.conversationsBucket || blobs.length === 0) {
return;
}
const now = new Date();
const year = now.getUTCFullYear();
const month = String(now.getUTCMonth() + 1).padStart(2, '0');
const s3Path = `s3://${this.conversationsBucket}/gateway/blobs/year=${year}/month=${month}/user_id=${userId}/${sessionId}_${messageId}.parquet`;
const tempTable = `blob_flush_${Date.now()}`;
try {
await this.query(`
CREATE TEMP TABLE ${tempTable} (
id VARCHAR,
user_id VARCHAR,
session_id VARCHAR,
message_id VARCHAR,
blob_type VARCHAR,
mime_type VARCHAR,
data VARCHAR,
caption VARCHAR,
timestamp BIGINT
)
`);
for (const blob of blobs) {
await this.query(
`INSERT INTO ${tempTable} VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`,
[blob.id, blob.user_id, blob.session_id, blob.message_id, blob.blob_type, blob.mime_type, blob.data, blob.caption, blob.timestamp]
);
}
await this.query(`COPY ${tempTable} TO '${s3Path}' (FORMAT PARQUET)`);
this.logger.info({ userId, sessionId, messageId, count: blobs.length, s3Path }, 'Blobs flushed to Parquet');
} finally {
await this.query(`DROP TABLE IF EXISTS ${tempTable}`).catch(() => {});
}
}
/**
* Query blobs from S3 by userId/sessionId, optionally filtered to specific blob IDs.
*/
async queryBlobs(
userId: string,
sessionId: string,
blobIds?: string[]
): Promise<any[]> {
await this.initialize();
try {
const tablePath = await this.getTablePath(this.namespace, 'blobs', this.catalogUri);
if (!tablePath) {
// Fallback: scan per-turn Parquet files written directly to S3
if (this.conversationsBucket) {
this.logger.debug({ userId, sessionId }, 'REST catalog miss, scanning blob Parquet files');
const parquetPath = `s3://${this.conversationsBucket}/gateway/blobs/**/user_id=${userId}/${sessionId}_*.parquet`;
const idClause = blobIds?.length
? `WHERE id IN (${blobIds.map(id => `'${id.replace(/'/g, "''")}'`).join(', ')})`
: '';
try {
return await this.query(`SELECT * FROM read_parquet('${parquetPath}') ${idClause} ORDER BY timestamp ASC`);
} catch {
// No blobs yet for this session
}
}
return [];
}
const idFilter = blobIds?.length
? `AND id IN (${blobIds.map(() => '?').join(', ')})`
: '';
const params: any[] = [userId, sessionId, ...(blobIds ?? [])];
const sql = `
SELECT id, user_id, session_id, message_id, blob_type, mime_type, data, caption, timestamp
FROM iceberg_scan('${tablePath}')
WHERE user_id = ? AND session_id = ? ${idFilter}
ORDER BY timestamp ASC
`;
const rows = await this.query(sql, params);
this.logger.info({ userId, sessionId, count: rows.length }, 'Loaded blobs from Iceberg');
return rows.map((row: any) => ({ ...row, timestamp: Number(row.timestamp) }));
} catch (error: any) {
this.logger.error({ error: error.message, userId, sessionId }, 'Failed to query blobs');
return [];
}
}
/** /**
* Close the DuckDB connection * Close the DuckDB connection
*/ */
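
Each assistant turn lands in one Parquet object under a year/month/user partitioned prefix, and the fallback reader globs that same layout when the Iceberg table is not registered in the catalog. A small sketch showing how the write path and read glob line up (bucket and IDs are placeholders):

// Write path for one turn (shape produced by appendBlobs):
//   s3://<bucket>/gateway/blobs/year=2026/month=04/user_id=u123/sess42_msg7.parquet
// Fallback read glob for the whole session (shape used by queryBlobs):
//   s3://<bucket>/gateway/blobs/**/user_id=u123/sess42_*.parquet
function blobWritePath(bucket: string, userId: string, sessionId: string, messageId: string, now = new Date()): string {
  const year = now.getUTCFullYear();
  const month = String(now.getUTCMonth() + 1).padStart(2, '0');
  return `s3://${bucket}/gateway/blobs/year=${year}/month=${month}/user_id=${userId}/${sessionId}_${messageId}.parquet`;
}

function blobReadGlob(bucket: string, userId: string, sessionId: string): string {
  return `s3://${bucket}/gateway/blobs/**/user_id=${userId}/${sessionId}_*.parquet`;
}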

View File

@@ -45,6 +45,21 @@ export interface IcebergMessage {
timestamp: number; // nanoseconds timestamp: number; // nanoseconds
} }
/**
* Blob record for Iceberg storage (images, audio, etc.)
*/
export interface IcebergBlob {
id: string;
user_id: string;
session_id: string;
message_id: string;
blob_type: string;
mime_type: string;
data: string; // base64
caption: string | null;
timestamp: number; // microseconds
}
/** /**
* Checkpoint record for Iceberg storage * Checkpoint record for Iceberg storage
*/ */
@@ -153,6 +168,25 @@ export class IcebergClient {
return this.duckdb.appendMessages(userId, sessionId, messages); return this.duckdb.appendMessages(userId, sessionId, messages);
} }
/**
* Append blobs for one assistant turn as a Parquet file in S3.
*/
async appendBlobs(
userId: string,
sessionId: string,
messageId: string,
blobs: IcebergBlob[]
): Promise<void> {
return this.duckdb.appendBlobs(userId, sessionId, messageId, blobs);
}
/**
* Query blobs from S3/Iceberg, optionally filtered to specific blob IDs.
*/
async queryBlobs(userId: string, sessionId: string, blobIds?: string[]): Promise<IcebergBlob[]> {
return this.duckdb.queryBlobs(userId, sessionId, blobIds);
}
/** /**
* Get table metadata * Get table metadata
*/ */

View File

@@ -298,6 +298,13 @@ export class QdrantClient {
pointsCount: info.points_count || 0, pointsCount: info.points_count || 0,
}; };
} catch (error) { } catch (error) {
// If the collection was lost (e.g. Qdrant restarted without the gateway restarting),
// recreate it and return zeroed stats rather than propagating the error.
if ((error as any)?.status === 404) {
this.logger.warn({ collection: this.collectionName }, 'Collection missing, recreating...');
await this.initialize();
return { vectorsCount: 0, indexedVectorsCount: 0, pointsCount: 0 };
}
this.logger.error({ error }, 'Failed to get collection info'); this.logger.error({ error }, 'Failed to get collection info');
throw error; throw error;
} }

View File

@@ -20,6 +20,22 @@ import type {
NotificationStatus, NotificationStatus,
} from '../types/ohlc.js'; } from '../types/ohlc.js';
export const OHLC_BAR_TOPIC_PATTERN = /^(.+)\|ohlc:(\d+)$/;
/** Decoded realtime OHLC bar received from the XPUB market data stream */
export interface RealtimeBar {
topic: string; // e.g., "BTC/USDT.BINANCE|ohlc:60"
ticker: string; // e.g., "BTC/USDT.BINANCE"
periodSeconds: number;
/** Window open time in nanoseconds since epoch */
timestamp: bigint;
open: number;
high: number;
low: number;
close: number;
volume: number;
}
const __filename = fileURLToPath(import.meta.url); const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename); const __dirname = dirname(__filename);
@@ -39,14 +55,17 @@ export enum MessageType {
const protoDir = join(__dirname, '../..', 'protobuf'); const protoDir = join(__dirname, '../..', 'protobuf');
const root = new protobuf.Root(); const root = new protobuf.Root();
// Load proto file and parse it // Load proto files
const ingestorProto = readFileSync(join(protoDir, 'ingestor.proto'), 'utf8'); const ingestorProto = readFileSync(join(protoDir, 'ingestor.proto'), 'utf8');
const ohlcProto = readFileSync(join(protoDir, 'ohlc.proto'), 'utf8');
protobuf.parse(ingestorProto, root); protobuf.parse(ingestorProto, root);
protobuf.parse(ohlcProto, root);
// Export message types // Export message types
const SubmitHistoricalRequestType = root.lookupType('SubmitHistoricalRequest'); const SubmitHistoricalRequestType = root.lookupType('SubmitHistoricalRequest');
const SubmitResponseType = root.lookupType('SubmitResponse'); const SubmitResponseType = root.lookupType('SubmitResponse');
const HistoryReadyNotificationType = root.lookupType('HistoryReadyNotification'); const HistoryReadyNotificationType = root.lookupType('HistoryReadyNotification');
const OHLCType = root.lookupType('OHLC');
/** /**
* Encode SubmitHistoricalRequest to ZMQ frames * Encode SubmitHistoricalRequest to ZMQ frames
@@ -178,3 +197,39 @@ export function decodeHistoryReadyNotification(frames: Buffer[]): HistoryReadyNo
completed_at: BigInt(payload.completedAt), completed_at: BigInt(payload.completedAt),
}; };
} }
/**
* Decode a realtime OHLC bar from ZMQ SUB frames.
* Frame layout: [topic][version][0x04 OHLC type + OHLC protobuf bytes]
*
* Returns null if the topic doesn't match the realtime bar pattern or decoding fails.
*/
export function decodeRealtimeBar(frames: Buffer[]): RealtimeBar | null {
if (frames.length < 3) return null;
const topic = frames[0].toString();
const match = OHLC_BAR_TOPIC_PATTERN.exec(topic);
if (!match) return null;
const ticker = match[1];
const periodSeconds = parseInt(match[2], 10);
const messageFrame = frames[2];
if (messageFrame[0] !== 0x04) return null; // Must be OHLC type
const payloadBuffer = messageFrame.slice(1);
const decoded = OHLCType.decode(payloadBuffer);
const ohlc = OHLCType.toObject(decoded, { longs: String, defaults: true });
return {
topic,
ticker,
periodSeconds,
timestamp: BigInt(ohlc.timestamp ?? '0'),
open: Number(ohlc.open ?? 0),
high: Number(ohlc.high ?? 0),
low: Number(ohlc.low ?? 0),
close: Number(ohlc.close ?? 0),
volume: Number(ohlc.volume ?? 0),
};
}
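
A hedged usage sketch of the decoder as it would run in the gateway's notification listener; frames are the Buffers of one multipart SUB message, and the nanosecond window-open time is usually converted to seconds for charting:

import { decodeRealtimeBar } from './zmq-protocol.js';

// `frames` stands in for the Buffer[] of one multipart message from the SUB socket.
declare const frames: Buffer[];

const bar = decodeRealtimeBar(frames);
if (bar) {
  // Window-open time arrives in nanoseconds; most charting libraries want seconds.
  const timeSec = Number(bar.timestamp / 1_000_000_000n);
  console.log(`${bar.ticker} ${bar.periodSeconds}s bar @ ${timeSec}: close=${bar.close}`);
}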

View File

@@ -17,6 +17,9 @@ import {
encodeSubmitHistoricalRequest, encodeSubmitHistoricalRequest,
decodeSubmitResponse, decodeSubmitResponse,
decodeHistoryReadyNotification, decodeHistoryReadyNotification,
decodeRealtimeBar,
OHLC_BAR_TOPIC_PATTERN,
type RealtimeBar,
} from './zmq-protocol.js'; } from './zmq-protocol.js';
import type { import type {
SubmitHistoricalRequest, SubmitHistoricalRequest,
@@ -27,6 +30,9 @@ import {
NotificationStatus, NotificationStatus,
} from '../types/ohlc.js'; } from '../types/ohlc.js';
export type BarUpdateCallback = (bar: RealtimeBar) => void;
export type { RealtimeBar };
export interface ZMQRelayConfig { export interface ZMQRelayConfig {
relayRequestEndpoint: string; // e.g., "tcp://relay:5559" relayRequestEndpoint: string; // e.g., "tcp://relay:5559"
relayNotificationEndpoint: string; // e.g., "tcp://relay:5558" relayNotificationEndpoint: string; // e.g., "tcp://relay:5558"
@@ -57,6 +63,12 @@ export class ZMQRelayClient {
private notificationTopic: string; private notificationTopic: string;
private pendingRequests: Map<string, PendingRequest> = new Map(); private pendingRequests: Map<string, PendingRequest> = new Map();
/** Ref count per ZMQ topic (gateway-level dedup before ZMQ subscribe/unsubscribe) */
private topicRefs: Map<string, number> = new Map();
/** Callbacks registered by WebSocket sessions for realtime bar updates */
private barCallbacks: Map<string, Set<BarUpdateCallback>> = new Map();
private connected = false; private connected = false;
private notificationListenerRunning = false; private notificationListenerRunning = false;
@@ -253,8 +265,6 @@ export class ZMQRelayClient {
// Handle metadata update notifications // Handle metadata update notifications
if (topic === 'METADATA_UPDATE') { if (topic === 'METADATA_UPDATE') {
this.logger.info('Received METADATA_UPDATE notification'); this.logger.info('Received METADATA_UPDATE notification');
// Call the onMetadataUpdate callback if configured
if (this.config.onMetadataUpdate) { if (this.config.onMetadataUpdate) {
try { try {
await this.config.onMetadataUpdate(); await this.config.onMetadataUpdate();
@@ -265,6 +275,20 @@ export class ZMQRelayClient {
continue; continue;
} }
// Handle realtime OHLC bar updates (topic pattern: "{ticker}|ohlc:{period}")
if (OHLC_BAR_TOPIC_PATTERN.test(topic)) {
const bar = decodeRealtimeBar(Array.from(frames));
if (bar) {
const callbacks = this.barCallbacks.get(topic);
if (callbacks) {
for (const cb of callbacks) {
try { cb(bar); } catch (e) { /* ignore callback errors */ }
}
}
}
continue;
}
// Handle history ready notifications // Handle history ready notifications
const notification = decodeHistoryReadyNotification(Array.from(frames)); const notification = decodeHistoryReadyNotification(Array.from(frames));
@@ -308,6 +332,69 @@ export class ZMQRelayClient {
this.logger.debug('Notification listener started'); this.logger.debug('Notification listener started');
} }
/**
* Subscribe to realtime OHLC bars for a ticker+period.
*
* ZMQ subscribe is only called on the 0→1 transition (first subscriber).
* This triggers the relay XPUB → Flink subscription detection → ingestor activation.
*
* @param callback Called whenever a new bar arrives for this topic
*/
subscribeToTicker(ticker: string, periodSeconds: number, callback: BarUpdateCallback): void {
const topic = `${ticker}|ohlc:${periodSeconds}`;
// Register callback
if (!this.barCallbacks.has(topic)) {
this.barCallbacks.set(topic, new Set());
}
this.barCallbacks.get(topic)!.add(callback);
// ZMQ subscribe on first ref
const prev = this.topicRefs.get(topic) ?? 0;
this.topicRefs.set(topic, prev + 1);
if (prev === 0 && this.subSocket) {
this.subSocket.subscribe(topic);
this.logger.info({ topic }, 'ZMQ subscribed to realtime topic');
}
}
/**
* Unsubscribe a callback from realtime OHLC bars.
* ZMQ unsubscribe is only called on the 1→0 transition (last subscriber).
*/
unsubscribeFromTicker(ticker: string, periodSeconds: number, callback: BarUpdateCallback): void {
const topic = `${ticker}|ohlc:${periodSeconds}`;
const callbacks = this.barCallbacks.get(topic);
if (callbacks) {
callbacks.delete(callback);
if (callbacks.size === 0) {
this.barCallbacks.delete(topic);
}
}
const prev = this.topicRefs.get(topic) ?? 0;
if (prev <= 1) {
this.topicRefs.delete(topic);
if (this.subSocket) {
this.subSocket.unsubscribe(topic);
this.logger.info({ topic }, 'ZMQ unsubscribed from realtime topic');
}
} else {
this.topicRefs.set(topic, prev - 1);
}
}
/**
* Remove all subscriptions for a set of (topic, callback) pairs.
* Convenience method for WebSocket disconnect cleanup.
*/
cleanupSubscriptions(subscriptions: Array<{ ticker: string; periodSeconds: number; callback: BarUpdateCallback }>): void {
for (const { ticker, periodSeconds, callback } of subscriptions) {
this.unsubscribeFromTicker(ticker, periodSeconds, callback);
}
}
/** /**
* Close the client and cleanup resources * Close the client and cleanup resources
*/ */
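
Because topics are ref-counted, many WebSocket sessions can watch the same ticker and period while only one subscription exists on the ZMQ socket, and the upstream realtime jobs are started and stopped only on the first and last subscriber. A hedged usage sketch (import path and construction are assumptions):

import type { ZMQRelayClient, BarUpdateCallback } from './zmq-relay-client.js'; // path assumed

declare const relay: ZMQRelayClient; // constructed and connected elsewhere
const onBarA: BarUpdateCallback = (bar) => console.log('chart A', bar.close);
const onBarB: BarUpdateCallback = (bar) => console.log('chart B', bar.close);

relay.subscribeToTicker('BTC/USDT.BINANCE', 60, onBarA);     // refs 0 -> 1: ZMQ subscribe sent
relay.subscribeToTicker('BTC/USDT.BINANCE', 60, onBarB);     // refs 1 -> 2: no socket call

relay.unsubscribeFromTicker('BTC/USDT.BINANCE', 60, onBarA); // refs 2 -> 1
relay.unsubscribeFromTicker('BTC/USDT.BINANCE', 60, onBarB); // refs 1 -> 0: ZMQ unsubscribe sent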

View File

@@ -4,6 +4,7 @@ import type { FastifyBaseLogger } from 'fastify';
import type { License } from '../types/user.js'; import type { License } from '../types/user.js';
import { ChannelType } from '../types/user.js'; import { ChannelType } from '../types/user.js';
import type { ConversationStore } from './memory/conversation-store.js'; import type { ConversationStore } from './memory/conversation-store.js';
import type { BlobStore } from './memory/blob-store.js';
import type { InboundMessage, OutboundMessage } from '../types/messages.js'; import type { InboundMessage, OutboundMessage } from '../types/messages.js';
import { MCPClientConnector } from './mcp-client.js'; import { MCPClientConnector } from './mcp-client.js';
import { LLMProviderFactory, type ProviderConfig } from '../llm/provider.js'; import { LLMProviderFactory, type ProviderConfig } from '../llm/provider.js';
@@ -14,13 +15,16 @@ import type { ChannelAdapter, PathTriggerContext } from '../workspace/index.js';
import type { ResearchSubagent } from './subagents/research/index.js'; import type { ResearchSubagent } from './subagents/research/index.js';
import type { IndicatorSubagent } from './subagents/indicator/index.js'; import type { IndicatorSubagent } from './subagents/indicator/index.js';
import type { WebExploreSubagent } from './subagents/web-explore/index.js'; import type { WebExploreSubagent } from './subagents/web-explore/index.js';
import type { StrategySubagent } from './subagents/strategy/index.js';
import type { DynamicStructuredTool } from '@langchain/core/tools'; import type { DynamicStructuredTool } from '@langchain/core/tools';
import { getToolRegistry } from '../tools/tool-registry.js'; import { getToolRegistry } from '../tools/tool-registry.js';
import type { MCPToolInfo } from '../tools/mcp/mcp-tool-wrapper.js'; import type { MCPToolInfo } from '../tools/mcp/mcp-tool-wrapper.js';
import { createResearchAgentTool } from '../tools/platform/research-agent.tool.js'; import { createResearchAgentTool } from '../tools/platform/research-agent.tool.js';
import { createIndicatorAgentTool } from '../tools/platform/indicator-agent.tool.js'; import { createIndicatorAgentTool } from '../tools/platform/indicator-agent.tool.js';
import { createWebExploreAgentTool } from '../tools/platform/web-explore-agent.tool.js'; import { createWebExploreAgentTool } from '../tools/platform/web-explore-agent.tool.js';
import { createStrategyAgentTool } from '../tools/platform/strategy-agent.tool.js';
import { createUserContext } from './memory/session-context.js'; import { createUserContext } from './memory/session-context.js';
import type { HarnessEvent } from './harness-events.js';
import { readFile } from 'fs/promises'; import { readFile } from 'fs/promises';
import { join, dirname } from 'path'; import { join, dirname } from 'path';
import { fileURLToPath } from 'url'; import { fileURLToPath } from 'url';
@@ -54,10 +58,12 @@ export type HarnessFactory = (sessionConfig: HarnessSessionConfig) => AgentHarne
export interface AgentHarnessConfig extends HarnessSessionConfig { export interface AgentHarnessConfig extends HarnessSessionConfig {
providerConfig: ProviderConfig; providerConfig: ProviderConfig;
conversationStore?: ConversationStore; conversationStore?: ConversationStore;
blobStore?: BlobStore;
historyLimit: number; historyLimit: number;
researchSubagent?: ResearchSubagent; researchSubagent?: ResearchSubagent;
indicatorSubagent?: IndicatorSubagent; indicatorSubagent?: IndicatorSubagent;
webExploreSubagent?: WebExploreSubagent; webExploreSubagent?: WebExploreSubagent;
strategySubagent?: StrategySubagent;
} }
/** /**
@@ -87,6 +93,8 @@ export class AgentHarness {
private conversationStore?: ConversationStore; private conversationStore?: ConversationStore;
private indicatorSubagent?: IndicatorSubagent; private indicatorSubagent?: IndicatorSubagent;
private webExploreSubagent?: WebExploreSubagent; private webExploreSubagent?: WebExploreSubagent;
private strategySubagent?: StrategySubagent;
private blobStore?: BlobStore;
private abortController: AbortController | null = null; private abortController: AbortController | null = null;
constructor(config: AgentHarnessConfig) { constructor(config: AgentHarnessConfig) {
@@ -96,10 +104,12 @@ export class AgentHarness {
this.researchSubagent = config.researchSubagent; this.researchSubagent = config.researchSubagent;
this.indicatorSubagent = config.indicatorSubagent; this.indicatorSubagent = config.indicatorSubagent;
this.webExploreSubagent = config.webExploreSubagent; this.webExploreSubagent = config.webExploreSubagent;
this.strategySubagent = config.strategySubagent;
this.modelFactory = new LLMProviderFactory(config.providerConfig, config.logger); this.modelFactory = new LLMProviderFactory(config.providerConfig, config.logger);
this.modelRouter = new ModelRouter(this.modelFactory, config.logger); this.modelRouter = new ModelRouter(this.modelFactory, config.logger);
this.conversationStore = config.conversationStore; this.conversationStore = config.conversationStore;
this.blobStore = config.blobStore;
this.mcpClient = new MCPClientConnector({ this.mcpClient = new MCPClientConnector({
userId: config.userId, userId: config.userId,
@@ -419,17 +429,75 @@ export class AgentHarness {
} }
} }
/**
* Initialize strategy subagent
*/
private async initializeStrategySubagent(): Promise<void> {
if (this.strategySubagent) {
this.config.logger.debug('Strategy subagent already provided');
return;
}
this.config.logger.debug('Creating strategy subagent for session');
try {
const { createStrategySubagent } = await import('./subagents/strategy/index.js');
const { model } = await this.modelRouter.route(
'trading strategy writing and backtesting',
this.config.license,
RoutingStrategy.COMPLEXITY,
this.config.userId
);
const toolRegistry = getToolRegistry();
const strategyTools = await toolRegistry.getToolsForAgent(
'strategy',
this.mcpClient,
this.availableMCPTools,
this.workspaceManager,
undefined,
undefined
);
const strategySubagentPath = join(__dirname, 'subagents', 'strategy');
this.config.logger.debug({ strategySubagentPath }, 'Using strategy subagent path');
this.strategySubagent = await createStrategySubagent(
model,
this.config.logger,
strategySubagentPath,
this.mcpClient,
strategyTools
);
this.config.logger.info(
{
toolCount: strategyTools.length,
toolNames: strategyTools.map(t => t.name),
},
'Strategy subagent created successfully'
);
} catch (error) {
this.config.logger.error(
{ error, errorMessage: (error as Error).message, stack: (error as Error).stack },
'Failed to create strategy subagent'
);
// Don't throw — strategy subagent is optional
}
}
/** /**
* Execute model with tool calling loop * Execute model with tool calling loop
* Handles multi-turn tool calls until the model produces a final text response * Handles multi-turn tool calls until the model produces a final text response
*/ */
private async executeWithToolCalling( private async *executeWithToolCalling(
model: any, model: any,
messages: BaseMessage[], messages: BaseMessage[],
tools: DynamicStructuredTool[], tools: DynamicStructuredTool[],
maxIterations: number = 2, maxIterations: number = 2,
signal?: AbortSignal signal?: AbortSignal
): Promise<string> { ): AsyncGenerator<HarnessEvent> {
this.config.logger.info( this.config.logger.info(
{ toolCount: tools.length, maxIterations }, { toolCount: tools.length, maxIterations },
'Starting tool calling loop' 'Starting tool calling loop'
@@ -437,6 +505,8 @@ export class AgentHarness {
const messagesCopy = [...messages]; const messagesCopy = [...messages];
let iterations = 0; let iterations = 0;
// Track last char of last yielded text chunk to detect missing spaces between tokens
let lastChunkTail = '';
while (iterations < maxIterations) { while (iterations < maxIterations) {
if (signal?.aborted) break; if (signal?.aborted) break;
@@ -455,15 +525,24 @@ export class AgentHarness {
try { try {
const stream = await model.stream(messagesCopy, { signal }); const stream = await model.stream(messagesCopy, { signal });
for await (const chunk of stream) { for await (const chunk of stream) {
const contents: string[] = [];
if (typeof chunk.content === 'string' && chunk.content.length > 0) { if (typeof chunk.content === 'string' && chunk.content.length > 0) {
this.channelAdapter?.sendChunk(chunk.content); contents.push(chunk.content);
} else if (Array.isArray(chunk.content)) { } else if (Array.isArray(chunk.content)) {
for (const block of chunk.content) { for (const block of chunk.content) {
if (block.type === 'text' && block.text) { if (block.type === 'text' && block.text) contents.push(block.text);
this.channelAdapter?.sendChunk(block.text);
}
} }
} }
for (const content of contents) {
// DeepInfra/GLM streams tokens without leading spaces; inject one when
// both the tail of the previous chunk and the head of this chunk are
// word characters (\w), which would otherwise merge two words.
if (lastChunkTail && /\w/.test(lastChunkTail) && /\w/.test(content[0])) {
yield { type: 'chunk', content: ' ' };
}
lastChunkTail = content[content.length - 1];
yield { type: 'chunk', content };
}
response = response ? response.concat(chunk) : chunk; response = response ? response.concat(chunk) : chunk;
} }
} catch (invokeError: any) { } catch (invokeError: any) {
@@ -486,6 +565,8 @@ export class AgentHarness {
contentLength: typeof response.content === 'string' ? response.content.length : 0, contentLength: typeof response.content === 'string' ? response.content.length : 0,
hasToolCalls: !!response.tool_calls, hasToolCalls: !!response.tool_calls,
toolCallCount: response.tool_calls?.length || 0, toolCallCount: response.tool_calls?.length || 0,
usageMetadata: (response as any).usage_metadata,
finishReason: (response as any).response_metadata?.finish_reason,
}, },
'Model response received' 'Model response received'
); );
@@ -508,7 +589,8 @@ export class AgentHarness {
{ finalContentLength: finalContent.length, iterations }, { finalContentLength: finalContent.length, iterations },
'Tool calling loop complete - no more tool calls' 'Tool calling loop complete - no more tool calls'
); );
return finalContent; yield { type: 'done', content: finalContent };
return;
} }
this.config.logger.info( this.config.logger.info(
@@ -540,11 +622,32 @@ export class AgentHarness {
} }
try { try {
this.channelAdapter?.sendToolCall?.(toolCall.name, this.getToolLabel(toolCall.name)); yield { type: 'tool_call', toolName: toolCall.name, label: this.getToolLabel(toolCall.name) };
const result = await tool.func(toolCall.args);
// Process result to extract images and send them via channel adapter // Use streamFunc when available (subagent tools) to forward intermediate events inline
const processedResult = this.processToolResult(result, toolCall.name); let result: string;
const streamFunc = (tool as any).streamFunc as ((args: any, signal?: AbortSignal) => AsyncGenerator<import('./harness-events.js').HarnessEvent, string>) | undefined;
if (streamFunc) {
const gen = streamFunc(toolCall.args, signal);
let next = await gen.next();
while (!next.done) {
if (signal?.aborted) {
gen.return?.('');
break;
}
yield next.value;
next = await gen.next();
}
result = next.done ? next.value : '';
} else {
result = await tool.func(toolCall.args);
}
// Extract images from result and yield them; get text-only version for LLM
const { cleanedResult: processedResult, images } = this.extractImagesFromToolResult(result, toolCall.name);
for (const img of images) {
yield { type: 'image', data: img.data, mimeType: img.mimeType, caption: img.caption };
}
this.config.logger.debug( this.config.logger.debug(
{ {
@@ -567,6 +670,12 @@ export class AgentHarness {
'Tool execution completed' 'Tool execution completed'
); );
} catch (error) { } catch (error) {
// Clean stop — abort signal fired during tool execution; exit without error message
if (signal?.aborted || (error as Error)?.name === 'AbortError') {
this.config.logger.info({ tool: toolCall.name }, 'Tool execution aborted by stop signal');
return;
}
this.config.logger.error( this.config.logger.error(
{ {
error, error,
@@ -578,6 +687,8 @@ export class AgentHarness {
'Tool execution failed' 'Tool execution failed'
); );
yield { type: 'error' as const, source: toolCall.name, fatal: false };
messagesCopy.push( messagesCopy.push(
new ToolMessage({ new ToolMessage({
content: `Error: ${error}`, content: `Error: ${error}`,
@@ -586,11 +697,15 @@ export class AgentHarness {
); );
} }
} }
// After all tool calls complete, emit a space separator before the next LLM streaming pass
yield { type: 'chunk', content: ' ' };
lastChunkTail = ' ';
} }
// Max iterations reached - return what we have // Max iterations reached - yield done with apology
this.config.logger.warn('Max tool calling iterations reached'); this.config.logger.warn('Max tool calling iterations reached');
return 'I apologize, but I encountered an issue processing your request. Please try rephrasing your question.'; yield { type: 'done', content: 'I apologize, but I encountered an issue processing your request. Please try rephrasing your question.' };
} }
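The loop above prefers a `streamFunc` generator when a tool exposes one, so subagent events reach the client while the tool is still running instead of only after `func` resolves. Below is a minimal sketch of how a subagent wrapper tool might attach such a generator, assuming a simplified subagent interface; the tool name and schema are hypothetical, and only the `streamFunc` shape (yields `HarnessEvent`, returns the final string) is taken from how the loop consumes it.

```typescript
import { z } from 'zod';
import { DynamicStructuredTool } from '@langchain/core/tools';
import type { HarnessEvent } from './harness-events.js';

// Hypothetical subagent handle: stands in for the real subagent classes.
interface StreamingSubagent {
  streamEvents(input: string, signal?: AbortSignal): AsyncGenerator<HarnessEvent, string>;
}

export function createStreamingSubagentTool(subagent: StreamingSubagent): DynamicStructuredTool {
  const tool = new DynamicStructuredTool({
    name: 'example_subagent', // hypothetical name
    description: 'Delegates work to a streaming subagent.',
    schema: z.object({ instruction: z.string() }),
    func: async ({ instruction }) => {
      // Non-streaming fallback: drain the generator and return its final text.
      const gen = subagent.streamEvents(instruction);
      let next = await gen.next();
      while (!next.done) next = await gen.next();
      return next.value;
    },
  });

  // The harness checks for this property and, when present, forwards every
  // yielded event before using the returned string as the ToolMessage content.
  (tool as any).streamFunc = async function* (
    args: { instruction: string },
    signal?: AbortSignal,
  ): AsyncGenerator<HarnessEvent, string> {
    const gen = subagent.streamEvents(args.instruction, signal);
    let next = await gen.next();
    while (!next.done) {
      yield next.value;
      next = await gen.next();
    }
    return next.value; // final text handed back to the LLM context
  };

  return tool;
}
```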
/** /**
@@ -617,162 +732,222 @@ export class AgentHarness {
} }
/** /**
* Handle incoming message from user * Stream events for an incoming user message.
* Yields typed HarnessEvents (chunk, tool_call, image, done) and saves the
* conversation to the store once the done event has been emitted.
*/ */
async handleMessage(message: InboundMessage): Promise<OutboundMessage> { async *streamMessage(message: InboundMessage): AsyncGenerator<HarnessEvent> {
this.config.logger.info( this.config.logger.info(
{ messageId: message.messageId, userId: message.userId, content: message.content.substring(0, 100) }, { messageId: message.messageId, userId: message.userId, content: message.content.substring(0, 100) },
'Processing user message' 'Processing user message'
); );
try { // 1. Build system prompt from template
// 1. Build system prompt from template this.config.logger.debug('Building system prompt');
this.config.logger.debug('Building system prompt'); const systemPrompt = await this.buildSystemPrompt();
const systemPrompt = await this.buildSystemPrompt(); this.config.logger.debug({ systemPromptLength: systemPrompt.length }, 'System prompt built');
this.config.logger.debug({ systemPromptLength: systemPrompt.length }, 'System prompt built');
// 2. Load recent conversation history // 2. Load recent conversation history
const channelKey = this.config.channelType ?? ChannelType.WEBSOCKET; const channelKey = this.config.channelType ?? ChannelType.WEBSOCKET;
let storedMessages = this.conversationStore let storedMessages = this.conversationStore
? await this.conversationStore.getRecentMessages( ? await this.conversationStore.getRecentMessages(
this.config.userId, this.config.sessionId, this.config.historyLimit, channelKey
)
: [];
// First turn: seed conversation history with current workspace state
if (storedMessages.length === 0 && this.workspaceManager && this.conversationStore) {
const workspaceJSON = this.workspaceManager.serializeState();
const content = `[Workspace State]\n\`\`\`json\n${workspaceJSON}\n\`\`\``;
await this.conversationStore.saveMessage(
this.config.userId, this.config.sessionId,
'workspace', content, { isWorkspaceContext: true }, channelKey
);
storedMessages = await this.conversationStore.getRecentMessages(
this.config.userId, this.config.sessionId, this.config.historyLimit, channelKey this.config.userId, this.config.sessionId, this.config.historyLimit, channelKey
); )
} : [];
const history = this.conversationStore // First turn: seed conversation history with current workspace state
? this.conversationStore.toLangChainMessages(storedMessages) if (storedMessages.length === 0 && this.workspaceManager && this.conversationStore) {
: []; const workspaceJSON = this.workspaceManager.serializeState();
this.config.logger.debug({ historyLength: history.length }, 'Conversation history loaded'); const content = `[Workspace State]\n\`\`\`json\n${workspaceJSON}\n\`\`\``;
await this.conversationStore.saveMessage(
// 4. Get the configured model this.config.userId, this.config.sessionId,
this.config.logger.debug('Routing to model'); 'workspace', content, { isWorkspaceContext: true }, channelKey
const { model, middleware } = await this.modelRouter.route(
message.content,
this.config.license,
RoutingStrategy.COMPLEXITY,
this.config.userId
); );
this.middleware = middleware; storedMessages = await this.conversationStore.getRecentMessages(
this.config.logger.info({ modelName: model.constructor.name }, 'Model selected'); this.config.userId, this.config.sessionId, this.config.historyLimit, channelKey
// 5. Build LangChain messages
const langchainMessages = this.buildLangChainMessages(systemPrompt, history, message.content);
this.config.logger.debug({ messageCount: langchainMessages.length }, 'LangChain messages built');
// 6. Get tools for main agent from registry
const toolRegistry = getToolRegistry();
const tools = await toolRegistry.getToolsForAgent(
'main',
this.mcpClient,
this.availableMCPTools,
this.workspaceManager // Pass session workspace manager
); );
}
// Build shared subagent context const history = this.conversationStore
const subagentContext = { ? this.conversationStore.toLangChainMessages(storedMessages)
userContext: createUserContext({ : [];
userId: this.config.userId, this.config.logger.debug({ historyLength: history.length }, 'Conversation history loaded');
sessionId: this.config.sessionId,
license: this.config.license,
channelType: this.config.channelType ?? ChannelType.WEBSOCKET,
channelUserId: this.config.channelUserId ?? this.config.userId,
}),
};
// Add research subagent as a tool if available // 4. Get the configured model
if (this.researchSubagent) { this.config.logger.debug('Routing to model');
tools.push(createResearchAgentTool({ const { model, middleware } = await this.modelRouter.route(
researchSubagent: this.researchSubagent, message.content,
context: subagentContext, this.config.license,
logger: this.config.logger, RoutingStrategy.COMPLEXITY,
})); this.config.userId
} );
this.middleware = middleware;
this.config.logger.info({ modelName: model.constructor.name }, 'Model selected');
// Add indicator subagent as a tool if available // 5. Build LangChain messages
if (this.indicatorSubagent) { const langchainMessages = this.buildLangChainMessages(systemPrompt, history, message.content);
tools.push(createIndicatorAgentTool({ this.config.logger.debug({ messageCount: langchainMessages.length }, 'LangChain messages built');
indicatorSubagent: this.indicatorSubagent,
context: subagentContext,
logger: this.config.logger,
}));
}
// Add web explore subagent as a tool if available // 6. Get tools for main agent from registry
if (this.webExploreSubagent) { const toolRegistry = getToolRegistry();
tools.push(createWebExploreAgentTool({ const tools = await toolRegistry.getToolsForAgent(
webExploreSubagent: this.webExploreSubagent, 'main',
context: subagentContext, this.mcpClient,
logger: this.config.logger, this.availableMCPTools,
})); this.workspaceManager
} );
// Build shared subagent context
const subagentContext = {
userContext: createUserContext({
userId: this.config.userId,
sessionId: this.config.sessionId,
license: this.config.license,
channelType: this.config.channelType ?? ChannelType.WEBSOCKET,
channelUserId: this.config.channelUserId ?? this.config.userId,
}),
};
if (this.researchSubagent) {
tools.push(createResearchAgentTool({
researchSubagent: this.researchSubagent,
context: subagentContext,
logger: this.config.logger,
}));
}
if (this.indicatorSubagent) {
tools.push(createIndicatorAgentTool({
indicatorSubagent: this.indicatorSubagent,
context: subagentContext,
logger: this.config.logger,
}));
}
if (this.webExploreSubagent) {
tools.push(createWebExploreAgentTool({
webExploreSubagent: this.webExploreSubagent,
context: subagentContext,
logger: this.config.logger,
}));
}
if (!this.strategySubagent) {
await this.initializeStrategySubagent();
}
if (this.strategySubagent) {
tools.push(createStrategyAgentTool({
strategySubagent: this.strategySubagent,
context: subagentContext,
logger: this.config.logger,
}));
}
this.config.logger.info(
{ toolCount: tools.length, toolNames: tools.map(t => t.name) },
'Tools loaded for main agent'
);
// Apply middleware (e.g. Anthropic prompt caching)
const processedMessages = this.middleware
? this.middleware.processMessages(langchainMessages, tools)
: langchainMessages;
// 7. Bind tools to model
const modelWithTools = tools.length > 0 && model.bindTools ? model.bindTools(tools) : model;
if (tools.length > 0) {
this.config.logger.info( this.config.logger.info(
{ { modelType: modelWithTools.constructor.name, toolsBound: tools.length > 0 && !!model.bindTools },
toolCount: tools.length, 'Model bound with tools'
toolNames: tools.map(t => t.name),
},
'Tools loaded for main agent'
); );
}
// Apply middleware (e.g. Anthropic prompt caching) // 8. Stream tool calling loop and save conversation on completion
const processedMessages = this.middleware this.config.logger.info('Invoking LLM with tool support');
? this.middleware.processMessages(langchainMessages, tools) this.abortController = new AbortController();
: langchainMessages; let finalContent = '';
const collectedImages: Array<{ data: string; mimeType: string; caption?: string }> = [];
// 7. Bind tools to model try {
const modelWithTools = tools.length > 0 && model.bindTools ? model.bindTools(tools) : model; for await (const event of this.executeWithToolCalling(modelWithTools, processedMessages, tools, 10, this.abortController.signal)) {
if (event.type === 'done') {
if (tools.length > 0) { finalContent = event.content;
this.config.logger.info( this.config.logger.info({ responseLength: finalContent.length }, 'LLM response received');
{ modelType: modelWithTools.constructor.name, toolsBound: tools.length > 0 && !!model.bindTools }, } else if (event.type === 'image') {
'Model bound with tools' collectedImages.push({ data: event.data, mimeType: event.mimeType, caption: event.caption });
); }
yield event;
} }
} catch (error) {
// 8. Call LLM with tool calling loop if ((error as Error)?.name === 'AbortError') {
this.config.logger.info('Invoking LLM with tool support'); this.config.logger.info('Agent harness interrupted by stop signal');
this.abortController = new AbortController(); } else {
const assistantMessage = await this.executeWithToolCalling(modelWithTools, processedMessages, tools, 10, this.abortController.signal); this.config.logger.error({ error }, 'Fatal error in agent harness');
yield { type: 'error' as const, source: 'agent harness', fatal: true };
}
} finally {
this.abortController = null; this.abortController = null;
if (finalContent && this.conversationStore) {
// Write blobs to S3 and capture their IDs for message metadata
let blobRefs: Array<{ id: string; mimeType: string; caption?: string }> = [];
if (collectedImages.length > 0 && this.blobStore) {
const assistantMsgId = `${this.config.userId}:${this.config.sessionId}:${Date.now()}`;
const blobIds = await this.blobStore.writeBlobs(
this.config.userId, this.config.sessionId, assistantMsgId,
collectedImages.map(img => ({ blobType: 'image' as const, mimeType: img.mimeType, data: img.data, caption: img.caption }))
);
blobRefs = blobIds.map((id, i) => ({ id, mimeType: collectedImages[i].mimeType, caption: collectedImages[i].caption }));
}
this.config.logger.info(
{ responseLength: assistantMessage.length },
'LLM response received'
);
// Save user message and assistant response to conversation store
if (this.conversationStore) {
await this.conversationStore.saveMessage( await this.conversationStore.saveMessage(
this.config.userId, this.config.sessionId, 'user', message.content, undefined, channelKey this.config.userId, this.config.sessionId, 'user', message.content, undefined, channelKey
); );
await this.conversationStore.saveMessage( await this.conversationStore.saveMessage(
this.config.userId, this.config.sessionId, 'assistant', assistantMessage, undefined, channelKey this.config.userId, this.config.sessionId, 'assistant', finalContent,
blobRefs.length > 0 ? { blobs: blobRefs } : undefined,
channelKey
); );
} }
}
}
return { /**
messageId: `msg_${Date.now()}`, * Handle incoming message from user.
sessionId: message.sessionId, * Consumes streamMessage and dispatches events to the channel adapter for
content: assistantMessage, * backward compatibility with Telegram and other non-streaming callers.
timestamp: new Date(), */
}; async handleMessage(message: InboundMessage): Promise<OutboundMessage> {
let finalContent = '';
try {
for await (const event of this.streamMessage(message)) {
switch (event.type) {
case 'chunk':
this.channelAdapter?.sendChunk(event.content);
break;
case 'tool_call':
this.channelAdapter?.sendToolCall?.(event.toolName, event.label);
break;
case 'image':
this.channelAdapter?.sendImage({ data: event.data, mimeType: event.mimeType, caption: event.caption });
break;
case 'error':
this.channelAdapter?.sendText?.({ text: `An unrecoverable error occurred in the ${event.source}.` });
break;
case 'done':
finalContent = event.content;
break;
}
}
} catch (error) { } catch (error) {
this.config.logger.error({ error }, 'Error processing message'); this.config.logger.error({ error }, 'Error processing message');
throw error; throw error;
} }
return {
messageId: `msg_${Date.now()}`,
sessionId: message.sessionId,
content: finalContent,
timestamp: new Date(),
};
} }
/** /**
@@ -817,21 +992,27 @@ export class AgentHarness {
python_write: 'Coding...', python_write: 'Coding...',
python_read: 'Inspecting...', python_read: 'Inspecting...',
execute_research: 'Running script...', execute_research: 'Running script...',
backtest_strategy: 'Running backtest...', backtest_strategy: 'Backtesting...',
list_active_strategies: 'Checking active strategies...', list_active_strategies: 'Checking active strategies...',
web_explore: 'Searching the web...', web_explore: 'Searching the web...',
strategy: 'Coding a strategy...',
}; };
return labels[toolName] ?? `Running ${toolName}...`; return labels[toolName] ?? `Running ${toolName} tool...`;
} }
/** /**
* Process tool result to extract images and send via channel adapter. * Process tool result to extract images and send via channel adapter.
* Returns text-only version for LLM context (no base64 image data). * Returns text-only version for LLM context (no base64 image data).
*/ */
private processToolResult(result: string, toolName: string): string { private extractImagesFromToolResult(
result: string,
toolName: string
): { cleanedResult: string; images: Array<{ data: string; mimeType: string; caption?: string }> } {
const noImages = { cleanedResult: String(result || ''), images: [] };
// Most tools return plain strings - only process JSON results // Most tools return plain strings - only process JSON results
if (!result || typeof result !== 'string') { if (!result || typeof result !== 'string') {
return String(result || ''); return noImages;
} }
// Try to parse as JSON // Try to parse as JSON
@@ -840,7 +1021,7 @@ export class AgentHarness {
parsedResult = JSON.parse(result); parsedResult = JSON.parse(result);
} catch { } catch {
// Not JSON, return as-is // Not JSON, return as-is
return result; return noImages;
} }
// Check if result has images array (from ResearchSubagent) // Check if result has images array (from ResearchSubagent)
@@ -850,19 +1031,11 @@ export class AgentHarness {
'Extracting images from tool result' 'Extracting images from tool result'
); );
// Send each image via channel adapter const images: Array<{ data: string; mimeType: string; caption?: string }> = [];
for (const image of parsedResult.images) { for (const image of parsedResult.images) {
if (image.data && image.mimeType) { if (image.data && image.mimeType) {
if (this.channelAdapter) { this.config.logger.debug({ mimeType: image.mimeType }, 'Extracted image from tool result');
this.config.logger.debug({ mimeType: image.mimeType }, 'Sending image to channel'); images.push({ data: image.data, mimeType: image.mimeType, caption: undefined });
this.channelAdapter.sendImage({
data: image.data,
mimeType: image.mimeType,
caption: undefined,
});
} else {
this.config.logger.warn('No channel adapter set, cannot send image');
}
} }
} }
@@ -872,15 +1045,13 @@ export class AgentHarness {
images: undefined, images: undefined,
imageCount: parsedResult.images.length, imageCount: parsedResult.images.length,
}; };
// Clean up undefined values
Object.keys(textOnlyResult).forEach(key => { Object.keys(textOnlyResult).forEach(key => {
if (textOnlyResult[key] === undefined) { if (textOnlyResult[key] === undefined) {
delete textOnlyResult[key]; delete textOnlyResult[key];
} }
}); });
return JSON.stringify(textOnlyResult); return { cleanedResult: JSON.stringify(textOnlyResult), images };
} }
// Check for nested chart_images object // Check for nested chart_images object
@@ -890,20 +1061,12 @@ export class AgentHarness {
'Extracting chart images from tool result' 'Extracting chart images from tool result'
); );
// Send each chart image via channel adapter const images: Array<{ data: string; mimeType: string; caption?: string }> = [];
for (const [chartId, chartData] of Object.entries(parsedResult.chart_images)) { for (const [chartId, chartData] of Object.entries(parsedResult.chart_images)) {
const chart = chartData as any; const chart = chartData as any;
if (chart.type === 'image' && chart.data) { if (chart.type === 'image' && chart.data) {
if (this.channelAdapter) { this.config.logger.debug({ chartId }, 'Extracted chart image from tool result');
this.config.logger.debug({ chartId }, 'Sending chart image to channel'); images.push({ data: chart.data, mimeType: 'image/png', caption: undefined });
this.channelAdapter.sendImage({
data: chart.data,
mimeType: 'image/png',
caption: undefined,
});
} else {
this.config.logger.warn('No channel adapter set, cannot send chart image');
}
} }
} }
@@ -913,19 +1076,17 @@ export class AgentHarness {
chart_images: undefined, chart_images: undefined,
chartCount: Object.keys(parsedResult.chart_images).length, chartCount: Object.keys(parsedResult.chart_images).length,
}; };
// Clean up undefined values
Object.keys(textOnlyResult).forEach(key => { Object.keys(textOnlyResult).forEach(key => {
if (textOnlyResult[key] === undefined) { if (textOnlyResult[key] === undefined) {
delete textOnlyResult[key]; delete textOnlyResult[key];
} }
}); });
return JSON.stringify(textOnlyResult); return { cleanedResult: JSON.stringify(textOnlyResult), images };
} }
// No images found, return stringified result // No images found, return as-is
return result; return { cleanedResult: result, images: [] };
} }
/** /**
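The consumer-facing change in this file is that callers can now iterate `streamMessage` directly rather than going through `handleMessage` and a channel adapter. Here is a sketch of a streaming caller, assuming a generic `send` callback for the transport (WebSocket, SSE, or similar); the import paths and the `InboundMessage` module location are assumptions, while the event names come from the harness events module shown next.

```typescript
import type { AgentHarness } from './agent-harness.js'; // path assumed
import type { InboundMessage } from './types.js';       // hypothetical module

// Forward harness events to a transport as they arrive; `send` is a stand-in
// for whatever the gateway actually uses to push frames to the client.
export async function streamToClient(
  harness: AgentHarness,
  message: InboundMessage,
  send: (payload: unknown) => void,
): Promise<string> {
  let finalContent = '';
  for await (const event of harness.streamMessage(message)) {
    switch (event.type) {
      case 'chunk':
        send({ kind: 'text', text: event.content });
        break;
      case 'tool_call':
        send({ kind: 'status', text: event.label });
        break;
      case 'image':
        send({ kind: 'image', mimeType: event.mimeType, data: event.data });
        break;
      case 'error':
        send({ kind: 'error', source: event.source, fatal: event.fatal });
        break;
      case 'done':
        finalContent = event.content;
        break;
      default:
        // subagent_* events: surface as status updates for now
        send({ kind: 'status', text: JSON.stringify(event) });
    }
  }
  return finalContent;
}
```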

View File

@@ -0,0 +1,51 @@
export interface ChunkEvent {
type: 'chunk';
content: string;
}
export interface ToolCallEvent {
type: 'tool_call';
toolName: string;
label: string;
}
export interface ImageEvent {
type: 'image';
data: string;
mimeType: string;
caption?: string;
}
export interface DoneEvent {
type: 'done';
content: string;
}
export interface SubagentChunkEvent {
type: 'subagent_chunk';
agentName: string;
content: string;
}
export interface SubagentThinkingEvent {
type: 'subagent_thinking';
agentName: string;
content: string;
}
export interface SubagentToolCallEvent {
type: 'subagent_tool_call';
agentName: string;
toolName: string;
label: string;
}
export interface ErrorEvent {
type: 'error';
/** Name of the agent or tool where the error occurred */
source: string;
/** True if the error is unrecoverable and the chat session should end */
fatal: boolean;
}
export type HarnessEvent = ChunkEvent | ToolCallEvent | ImageEvent | DoneEvent | SubagentChunkEvent | SubagentThinkingEvent | SubagentToolCallEvent | ErrorEvent;

View File

@@ -57,57 +57,74 @@ export class MCPClientConnector {
this.client = null; this.client = null;
} }
try { const maxAttempts = 5;
this.config.logger.info( const retryDelayMs = 1500;
{ userId: this.config.userId, url: this.config.mcpServerUrl },
'Connecting to user MCP server'
);
this.client = new Client( this.config.logger.info(
{ { userId: this.config.userId, url: this.config.mcpServerUrl },
name: 'dexorder-gateway', 'Connecting to user MCP server'
version: '0.1.0', );
},
{ let lastError: unknown;
capabilities: { for (let attempt = 1; attempt <= maxAttempts; attempt++) {
sampling: {}, try {
this.client = new Client(
{
name: 'dexorder-gateway',
version: '0.1.0',
}, },
} {
); capabilities: {
sampling: {},
},
}
);
// Streamable HTTP: single /mcp endpoint, session tracked via mcp-session-id header // Streamable HTTP: single /mcp endpoint, session tracked via mcp-session-id header
const transport = new StreamableHTTPClientTransport( const transport = new StreamableHTTPClientTransport(
new URL(`${this.config.mcpServerUrl}/mcp`) new URL(`${this.config.mcpServerUrl}/mcp`)
); );
await this.client.connect(transport); await this.client.connect(transport);
// Hook client.onerror to detect transport failures (e.g. sandbox restart returning // Hook client.onerror to detect transport failures (e.g. sandbox restart returning
// 404 "session not found"). When fired, mark disconnected so the next callTool / // 404 "session not found"). When fired, mark disconnected so the next callTool /
// listTools call triggers a full reconnect + initialize handshake. // listTools call triggers a full reconnect + initialize handshake.
const connectedClient = this.client; const connectedClient = this.client;
const origOnError = this.client.onerror; const origOnError = this.client.onerror;
this.client.onerror = (error) => { this.client.onerror = (error) => {
origOnError?.(error); origOnError?.(error);
// Only act on the currently-active client (ignore stale closures after reconnect) // Only act on the currently-active client (ignore stale closures after reconnect)
if (this.client === connectedClient && this.connected) { if (this.client === connectedClient && this.connected) {
this.config.logger.warn(
{ error },
'MCP transport error — marking disconnected for lazy reconnect'
);
this.connected = false;
}
};
this.connected = true;
this.config.logger.info('Connected to user MCP server');
return;
} catch (error) {
lastError = error;
this.client = null;
if (attempt < maxAttempts) {
this.config.logger.warn( this.config.logger.warn(
{ error }, { error, userId: this.config.userId, attempt, maxAttempts },
'MCP transport error — marking disconnected for lazy reconnect' 'MCP connect attempt failed, retrying...'
); );
this.connected = false; await new Promise(resolve => setTimeout(resolve, retryDelayMs));
} }
}; }
this.connected = true;
this.config.logger.info('Connected to user MCP server');
} catch (error) {
this.config.logger.error(
{ error, userId: this.config.userId },
'Failed to connect to user MCP server'
);
throw error;
} }
this.config.logger.error(
{ error: lastError, userId: this.config.userId },
'Failed to connect to user MCP server'
);
throw lastError;
} }
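The onerror hook only flips `connected` to false; the reconnect itself happens lazily on the next tool or list call, and that guard is not part of this hunk. A hedged sketch of what such a guard could look like, written as a standalone wrapper over an assumed connector surface rather than the real class:

```typescript
// Assumed shape of the connector surface used below: only the pieces relevant
// to lazy reconnection, not the actual MCPClientConnector definition.
interface ReconnectingConnector {
  connected: boolean;
  connect(): Promise<void>;
  callTool(name: string, args: Record<string, unknown>): Promise<unknown>;
}

// Wrap a tool call so a transport error flagged earlier (connected === false)
// triggers a fresh connect + initialize handshake before the call goes out.
export async function callToolWithReconnect(
  connector: ReconnectingConnector,
  name: string,
  args: Record<string, unknown>,
): Promise<unknown> {
  if (!connector.connected) {
    await connector.connect(); // retries internally, throws after maxAttempts
  }
  return connector.callTool(name, args);
}
```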
/** /**
@@ -134,7 +151,9 @@ export class MCPClientConnector {
try { try {
this.config.logger.debug({ tool: name, args }, 'Calling MCP tool'); this.config.logger.debug({ tool: name, args }, 'Calling MCP tool');
const result = await this.client!.callTool({ name, arguments: args }); // Use a generous timeout: execute_research runs a subprocess with a 300s limit,
// so the default 60s MCP SDK timeout would fire before the script completes.
const result = await this.client!.callTool({ name, arguments: args }, undefined, { timeout: 330000 });
return result; return result;
} catch (error) { } catch (error) {
this.config.logger.error({ error, tool: name }, 'MCP tool call failed'); this.config.logger.error({ error, tool: name }, 'MCP tool call failed');

View File

@@ -0,0 +1,93 @@
import type { FastifyBaseLogger } from 'fastify';
import type { IcebergClient } from '../../clients/iceberg-client.js';
export interface StoredBlob {
id: string;
userId: string;
sessionId: string;
messageId: string;
blobType: 'image' | 'audio';
mimeType: string;
data: string; // base64
caption?: string;
timestamp: number; // microseconds
}
/**
* Blob store for binary attachments (images, audio) referenced by conversation messages.
*
* Unlike text messages (Redis hot + Iceberg cold), blobs write directly to S3 Parquet
* on each turn — they're infrequent enough that per-turn files don't cause fragmentation.
* Blob IDs are stored in the parent message's metadata field for later retrieval.
*/
export class BlobStore {
constructor(
private icebergClient: IcebergClient | undefined,
private logger: FastifyBaseLogger
) {}
/**
* Write all blobs for one assistant turn to a single S3 Parquet file.
* Returns the blob IDs assigned. Failures are logged but do not throw.
*/
async writeBlobs(
userId: string,
sessionId: string,
messageId: string,
blobs: Array<{ blobType: 'image' | 'audio'; mimeType: string; data: string; caption?: string }>
): Promise<string[]> {
if (!this.icebergClient || blobs.length === 0) {
return [];
}
const now = Date.now();
const stored = blobs.map((b, i) => ({
id: `blob_${userId}_${now}_${i}`,
user_id: userId,
session_id: sessionId,
message_id: messageId,
blob_type: b.blobType,
mime_type: b.mimeType,
data: b.data,
caption: b.caption ?? null,
timestamp: now * 1000, // microseconds
}));
try {
await this.icebergClient.appendBlobs(userId, sessionId, messageId, stored);
this.logger.info({ userId, sessionId, count: stored.length }, 'Blobs written to S3');
} catch (error) {
this.logger.error({ error }, 'Failed to write blobs to S3');
// Don't throw — blob failure should not break the conversation turn
}
return stored.map(b => b.id);
}
/**
* Retrieve blobs by their IDs from S3/Iceberg cold storage.
*/
async getBlobsByIds(userId: string, sessionId: string, blobIds: string[]): Promise<StoredBlob[]> {
if (!this.icebergClient || blobIds.length === 0) {
return [];
}
try {
const rows = await this.icebergClient.queryBlobs(userId, sessionId, blobIds);
return rows.map(r => ({
id: r.id,
userId: r.user_id,
sessionId: r.session_id,
messageId: r.message_id,
blobType: r.blob_type as 'image' | 'audio',
mimeType: r.mime_type,
data: r.data,
caption: r.caption ?? undefined,
timestamp: r.timestamp,
}));
} catch (error) {
this.logger.error({ error, blobIds }, 'Failed to retrieve blobs');
return [];
}
}
}
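Below is a sketch of the write/read round trip the harness performs with this store. The helper name, import paths, and the already-constructed `IcebergClient` are assumptions; the metadata shape (`{ blobs: [...] }`) mirrors what `streamMessage` attaches to the assistant message above.

```typescript
import type { FastifyBaseLogger } from 'fastify';
import { BlobStore } from './blob-store.js';                        // path assumed
import type { IcebergClient } from '../../clients/iceberg-client.js';

// Assumes an already-constructed IcebergClient and a pino-compatible logger.
export async function saveTurnImages(
  icebergClient: IcebergClient,
  logger: FastifyBaseLogger,
  userId: string,
  sessionId: string,
  images: Array<{ data: string; mimeType: string; caption?: string }>,
) {
  const blobStore = new BlobStore(icebergClient, logger);
  const messageId = `${userId}:${sessionId}:${Date.now()}`;

  // One Parquet file per assistant turn; failures are logged, not thrown.
  const blobIds = await blobStore.writeBlobs(
    userId, sessionId, messageId,
    images.map(img => ({ blobType: 'image' as const, ...img })),
  );

  // Blob IDs go into the parent message's metadata so history rehydration
  // can look the attachments up later.
  const metadata = { blobs: blobIds.map((id, i) => ({ id, mimeType: images[i].mimeType })) };

  // Cold-path read, e.g. when replaying conversation history:
  const blobs = await blobStore.getBlobsByIds(userId, sessionId, blobIds);
  return { metadata, blobs };
}
```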

View File

@@ -39,9 +39,9 @@ If the user asks for a capability not provided by Dexorder, decline and explain
## Task Delegation ## Task Delegation
- For ANY research questions, deep analysis, statistical analysis, charting requests, or market data queries that require computation, you MUST use the 'research' tool - For ANY research questions, deep analysis, statistical analysis, charting requests, or market data queries that require computation, you MUST use the 'research' tool
- For ANYTHING related to indicators on the chart — reading, adding, removing, modifying, or creating custom indicators — you MUST use the 'indicator' tool - For ANYTHING related to indicators on the chart — reading, adding, removing, modifying, or creating custom indicators — you MUST use the 'indicator' tool
- For ANY backtesting request — running a strategy against historical data — you MUST use the 'backtest_strategy' tool directly; NEVER use the research tool for backtesting - For ANY request about trading strategies — writing, editing, backtesting, interpreting results, activating, deactivating, or monitoring — you MUST use the 'strategy' tool; NEVER write strategy Python code yourself
- NEVER write Python code directly in your responses to the user - NEVER write Python code directly in your responses to the user
- NEVER show code to the user — delegate to the research or indicator tool instead - NEVER show code to the user — delegate to the research, indicator, or strategy tool instead
- NEVER attempt to do analysis yourself — let the subagents handle it - NEVER attempt to do analysis yourself — let the subagents handle it
## Available Tools ## Available Tools
@@ -110,46 +110,54 @@ Parameters:
- instruction: Natural language description of the analysis to perform (be specific!) - instruction: Natural language description of the analysis to perform (be specific!)
- name: A unique name for the research script (e.g., "BTC Weekly Analysis") - name: A unique name for the research script (e.g., "BTC Weekly Analysis")
**Do NOT include any time range, history length, bar count, period size, resolution, or timestamp guidance in the instruction** — not as numbers, not as natural language ("3-6 months", "1 year", "sufficient data"), not at all. The research subagent has its own rules for selecting resolution and history window. If you add time guidance, the subagent will follow yours instead of its own (which uses much more data). Only pass time constraints if the user explicitly asked for a specific period (e.g. "last week", "show me 2023").
Example usage: Example usage:
- User: "Does Friday price action correlate with Monday?" - User: "Does Friday price action correlate with Monday?"
- You: Call research tool with instruction="Analyze correlation between Friday and Monday price action during NY trading hours (9:30-4:00 ET)", name="Friday-Monday Correlation" - You: Call research tool with instruction="Analyze correlation between Friday and Monday price action during NY trading hours (9:30-4:00 ET)", name="Friday-Monday Correlation"
- WRONG: "...use hourly data and at least 3-6 months..." ← never add this
### strategy
**Use this tool for ALL trading strategy requests without exception.**
The strategy subagent handles the complete strategy lifecycle: writing PandasStrategy classes, running backtests, interpreting results, and activating/deactivating paper trading.
**ALWAYS use strategy for:**
- "Create a strategy that buys when RSI < 30" write a new strategy
- "Edit my momentum strategy to use a tighter stop" modify existing strategy
- "Backtest my RSI strategy over the last year" run backtest
- "How did this strategy perform on BTC?" interpret results
- "Activate my strategy for paper trading" start paper trading
- "What strategies are running?" list active strategies
- "Stop my momentum strategy" deactivate a strategy
- Any question about a strategy's PnL, trades, or performance
**NEVER call `backtest_strategy`, `activate_strategy`, `deactivate_strategy`, or `list_active_strategies` directly.** Always go through the strategy tool.
**Custom indicators in strategies:**
When writing a new strategy, the strategy subagent will first check for existing custom indicators via `python_list(category="indicator")`. Prefer using custom indicators (via `ta.custom_*`) over computing signals inline; this promotes reuse and gives users better visibility into strategy components. If a needed indicator doesn't exist yet, the strategy subagent will create it first via the indicator workflow.
### backtest_strategy ### backtest_strategy
**ALWAYS use this tool — and ONLY this tool — for any backtesting request.** *(Called internally by the strategy tool; do not call this directly.)*
Runs a saved trading strategy against historical OHLC data using the Nautilus Trader backtesting engine. Runs a saved trading strategy against historical OHLC data using the Nautilus Trader backtesting engine.
Returns structured performance metrics and an equity curve. Any charts generated are automatically sent to the user. Returns structured performance metrics including trade list, Sortino/Calmar ratios, and equity curve.
**ALWAYS use backtest_strategy for:**
- "Backtest my RSI strategy over the last year"
- "How did this strategy perform on BTC?"
- "Run a backtest from January to June"
- Any request to test or evaluate a strategy on historical data
**NEVER use research for backtesting** — the research tool cannot run strategies through the backtesting engine.
After the tool returns, summarize the results clearly: total return, Sharpe ratio, max drawdown, win rate, and trade count. Present the equity curve description in plain language.
Parameters:
- strategy_name: Display name of the saved strategy (use python_list with category="strategy" to check existing strategies)
- feeds: Array of `{symbol, period_seconds}` feed objects (e.g. `[{"symbol": "BTC/USDT.BINANCE", "period_seconds": 3600}]`)
- from_time / to_time: Date strings ("2024-01-01", "90 days ago", "now") or Unix timestamps
- initial_capital: Starting balance in quote currency (default 10,000)
### list_active_strategies ### list_active_strategies
*(Called internally by the strategy tool; do not call this directly.)*
Lists all currently active (live or paper) strategies and their status. Lists all currently active (live or paper) strategies and their status.
Use this when the user asks what strategies are running.
### python_list ### python_list
List existing scripts in a category ("strategy", "indicator", or "research"). List existing scripts in a category ("strategy", "indicator", or "research").
Use this before calling the research tool to check whether a relevant script already exists. Use this before calling the research tool to check whether a relevant script already exists.
If one does, pass its exact name to the research tool so the subagent updates it rather than creating a new one. If one does, pass its exact name to the research tool so the subagent updates it rather than creating a new one.
Also use before calling backtest_strategy to confirm the strategy name. The strategy tool uses this internally to check strategy names before backtesting.
### symbol-lookup ### symbol-lookup
Look up trading symbols and get metadata. Look up trading symbols and get metadata.
Use this when users mention tickers or need symbol information. Use this when users mention tickers or need symbol information.
**Always use symbol_lookup to resolve a proper ticker before passing it to the research or get-chart-data tools.** Symbols must be in `SYMBOL.EXCHANGE` format (e.g., `BTC/USDT.BINANCE`). If the user says "ETHUSDT", "ETH", or any ambiguous ticker, resolve it first with symbol_lookup so the correct formatted ticker is passed downstream.
### get-chart-data ### get-chart-data
**IMPORTANT: This is for QUICK, CASUAL information ONLY. This tool just returns raw data - it does NOT create charts or plots.** **IMPORTANT: This is for QUICK, CASUAL information ONLY. This tool just returns raw data - it does NOT create charts or plots.**

View File

@@ -7,6 +7,7 @@ import type { MCPClientConnector } from '../mcp-client.js';
import type { DynamicStructuredTool } from '@langchain/core/tools'; import type { DynamicStructuredTool } from '@langchain/core/tools';
import { readFile } from 'fs/promises'; import { readFile } from 'fs/promises';
import { join } from 'path'; import { join } from 'path';
import type { HarnessEvent, SubagentChunkEvent, SubagentThinkingEvent } from '../harness-events.js';
/** /**
* Subagent configuration (loaded from config.yaml) * Subagent configuration (loaded from config.yaml)
@@ -122,6 +123,65 @@ export abstract class BaseSubagent {
yield result; yield result;
} }
/**
* Extract subagent_chunk / subagent_thinking events from a LangGraph `messages` stream datum.
*
* LangGraph emits `[message_chunk, metadata]` tuples in `messages` mode. The message content
* can be a plain string (normal text token) or an array of content blocks (extended thinking
* responses with `{type:"thinking", thinking:"..."}` and `{type:"text", text:"..."}`).
*/
static extractStreamChunks(
data: unknown,
agentName: string,
): Array<SubagentChunkEvent | SubagentThinkingEvent> {
const msg = Array.isArray(data) ? (data as unknown[])[0] : data;
const content = (msg as any)?.content;
if (typeof content === 'string') {
return content ? [{ type: 'subagent_chunk', agentName, content }] : [];
}
if (Array.isArray(content)) {
const chunks: Array<SubagentChunkEvent | SubagentThinkingEvent> = [];
for (const block of content as any[]) {
if (block?.type === 'thinking' && typeof block.thinking === 'string' && block.thinking) {
chunks.push({ type: 'subagent_thinking', agentName, content: block.thinking });
} else if (block?.type === 'text' && typeof block.text === 'string' && block.text) {
chunks.push({ type: 'subagent_chunk', agentName, content: block.text });
}
}
return chunks;
}
return [];
}
/**
* Extract the final text from an `updates`-mode agent message.
* Handles both plain string content and array content blocks (extended thinking).
*/
static extractFinalText(msg: any): string {
if (typeof msg?.content === 'string') return msg.content;
if (Array.isArray(msg?.content)) {
return (msg.content as any[])
.filter((b: any) => b?.type === 'text' && typeof b.text === 'string')
.map((b: any) => b.text as string)
.join('');
}
return '';
}
/**
* Stream typed HarnessEvents during execution.
* Subclasses override this to emit subagent_chunk / subagent_tool_call events
* using agent.stream() from LangGraph. Default falls back to execute().
*/
async *streamEvents(
context: SubagentContext,
input: string,
_signal?: AbortSignal,
): AsyncGenerator<HarnessEvent, string> {
const result = await this.execute(context, input);
return result;
}
/** /**
* Build messages with system prompt and memory context * Build messages with system prompt and memory context
*/ */
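To make the two content shapes concrete, here is an illustrative sketch of `messages`-mode tuples and what `extractStreamChunks` yields for each; the payloads are made up rather than captured from a real run, and the import path is assumed.

```typescript
import { BaseSubagent } from './base.js'; // path assumed

// Plain-text token: content is a string.
const plainChunk = [{ content: 'Computing RSI' }, { langgraph_node: 'agent' }];
// → [{ type: 'subagent_chunk', agentName: 'research', content: 'Computing RSI' }]

// Extended-thinking response: content is an array of blocks.
const thinkingChunk = [
  {
    content: [
      { type: 'thinking', thinking: 'Need 1h bars first.' },
      { type: 'text', text: 'Fetching data...' },
    ],
  },
  { langgraph_node: 'agent' },
];
// → [{ type: 'subagent_thinking', agentName: 'research', content: 'Need 1h bars first.' },
//    { type: 'subagent_chunk',    agentName: 'research', content: 'Fetching data...' }]

console.log(BaseSubagent.extractStreamChunks(plainChunk, 'research'));
console.log(BaseSubagent.extractStreamChunks(thinkingChunk, 'research'));
```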

View File

@@ -11,3 +11,8 @@ export {
createResearchSubagent, createResearchSubagent,
type ResearchResult, type ResearchResult,
} from './research/index.js'; } from './research/index.js';
export {
StrategySubagent,
createStrategySubagent,
} from './strategy/index.js';

View File

@@ -4,6 +4,7 @@ import { SystemMessage } from '@langchain/core/messages';
import { createReactAgent } from '@langchain/langgraph/prebuilt'; import { createReactAgent } from '@langchain/langgraph/prebuilt';
import type { FastifyBaseLogger } from 'fastify'; import type { FastifyBaseLogger } from 'fastify';
import type { MCPClientConnector } from '../../mcp-client.js'; import type { MCPClientConnector } from '../../mcp-client.js';
import type { HarnessEvent } from '../../harness-events.js';
/** /**
* Indicator Subagent * Indicator Subagent
@@ -84,6 +85,56 @@ export class IndicatorSubagent extends BaseSubagent {
return finalText; return finalText;
} }
async *streamEvents(context: SubagentContext, instruction: string, signal?: AbortSignal): AsyncGenerator<HarnessEvent, string> {
this.logger.info({ subagent: this.getName() }, 'streamEvents starting');
if (!this.hasMCPClient()) {
throw new Error('MCP client not available for indicator subagent');
}
const initialMessages = this.buildMessages(context, instruction);
const systemMessage = initialMessages[0];
const humanMessage = initialMessages[initialMessages.length - 1];
const agent = createReactAgent({
llm: this.model,
tools: this.tools,
prompt: systemMessage as SystemMessage,
});
const stream = agent.stream(
{ messages: [humanMessage] },
{ streamMode: ['messages', 'updates'], recursionLimit: 25, signal }
);
let finalText = '';
for await (const [mode, data] of await stream) {
if (signal?.aborted) break;
if (mode === 'messages') {
for (const chunk of IndicatorSubagent.extractStreamChunks(data, this.config.name)) {
yield chunk;
}
} else if (mode === 'updates') {
if ((data as any).agent?.messages) {
for (const msg of (data as any).agent.messages as any[]) {
if (msg.tool_calls?.length) {
for (const tc of msg.tool_calls) {
yield { type: 'subagent_tool_call', agentName: this.config.name, toolName: tc.name, label: tc.name };
}
} else {
const content = IndicatorSubagent.extractFinalText(msg);
if (content) finalText = content;
}
}
}
}
}
this.logger.info({ textLength: finalText.length }, 'streamEvents finished');
return finalText;
}
} }
/** /**

View File

@@ -4,6 +4,7 @@ import { SystemMessage } from '@langchain/core/messages';
import { createReactAgent } from '@langchain/langgraph/prebuilt'; import { createReactAgent } from '@langchain/langgraph/prebuilt';
import type { FastifyBaseLogger } from 'fastify'; import type { FastifyBaseLogger } from 'fastify';
import type { MCPClientConnector } from '../../mcp-client.js'; import type { MCPClientConnector } from '../../mcp-client.js';
import type { HarnessEvent } from '../../harness-events.js';
/** /**
* Result from research subagent execution * Result from research subagent execution
@@ -50,6 +51,58 @@ export class ResearchSubagent extends BaseSubagent {
this.imageCapture = capture; this.imageCapture = capture;
} }
/**
* Fetch custom indicators from the sandbox and return a formatted system prompt section.
* Returns empty string if there are no custom indicators or the call fails.
*/
private async fetchCustomIndicatorsSection(): Promise<string> {
try {
const raw = await this.callMCPTool('python_list', { category: 'indicator' });
const r = raw as any;
const text = r?.content?.[0]?.text ?? r?.[0]?.text;
const parsed = typeof text === 'string' ? JSON.parse(text) : raw;
const items: any[] = parsed?.items ?? [];
if (items.length === 0) return '';
const lines: string[] = ['\n\n## Custom Indicators\n'];
lines.push('The user has defined the following custom indicators. Use `ta.custom_<name>` where `<name>` is the lowercase sanitized function name shown below.\n');
for (const item of items) {
const displayName: string = item.name ?? 'unknown';
const description: string = item.description ?? '';
const meta: any = item.metadata ?? {};
// Derive the ta attribute name: sanitize display name to lowercase + underscores
const taAttr = `custom_${displayName.toLowerCase().replace(/[^\w]/g, '_').replace(/_+/g, '_').replace(/^_+|_+$/g, '')}`;
const inputSeries: string[] = meta.input_series ?? ['close'];
const params: Record<string, any> = meta.parameters ?? {};
const pane: string = meta.pane ?? 'separate';
const inputStr = inputSeries.map((s: string) => `df['${s}']`).join(', ');
const paramStr = Object.entries(params)
.map(([k, v]: [string, any]) => `${k}=${JSON.stringify(v?.default ?? null)}`)
.join(', ');
const callExample = paramStr
? `ta.${taAttr}(${inputStr}, ${paramStr})`
: `ta.${taAttr}(${inputStr})`;
const outputNames = (meta.output_columns ?? [{ name: 'value' }])
.map((c: any) => c.name)
.join(', ');
lines.push(`### ${displayName}`);
if (description) lines.push(description);
lines.push(`- **Call**: \`${callExample}\``);
lines.push(`- **Outputs**: ${outputNames} | **Pane**: ${pane}`);
lines.push('');
}
return lines.join('\n');
} catch (err) {
this.logger.warn({ err }, 'Failed to fetch custom indicators for prompt injection');
return '';
}
}
/** /**
* Execute research request using LangGraph's createReactAgent. * Execute research request using LangGraph's createReactAgent.
* This is the standard LangChain pattern for agents with tool access — * This is the standard LangChain pattern for agents with tool access —
@@ -79,11 +132,17 @@ export class ResearchSubagent extends BaseSubagent {
this.imageCapture.length = 0; this.imageCapture.length = 0;
this.lastImages = []; this.lastImages = [];
const customIndicatorsSection = await this.fetchCustomIndicatorsSection();
// Build system prompt (with memory context appended) // Build system prompt (with memory context appended)
const initialMessages = this.buildMessages(context, instruction); const initialMessages = this.buildMessages(context, instruction);
// buildMessages returns [SystemMessage, ...history, HumanMessage] // buildMessages returns [SystemMessage, ...history, HumanMessage]
// Extract system content for createReactAgent's prompt parameter // Extract system content for createReactAgent's prompt parameter
const systemMessage = initialMessages[0]; let systemMessage = initialMessages[0] as SystemMessage;
if (customIndicatorsSection) {
const base = typeof systemMessage.content === 'string' ? systemMessage.content : JSON.stringify(systemMessage.content);
systemMessage = new SystemMessage(base + customIndicatorsSection);
}
const humanMessage = initialMessages[initialMessages.length - 1]; const humanMessage = initialMessages[initialMessages.length - 1];
// createReactAgent is the standard LangChain/LangGraph pattern for tool-using agents. // createReactAgent is the standard LangChain/LangGraph pattern for tool-using agents.
@@ -91,12 +150,12 @@ export class ResearchSubagent extends BaseSubagent {
const agent = createReactAgent({ const agent = createReactAgent({
llm: this.model, llm: this.model,
tools: this.tools, tools: this.tools,
prompt: systemMessage as SystemMessage, prompt: systemMessage,
}); });
const result = await agent.invoke( const result = await agent.invoke(
{ messages: [humanMessage] }, { messages: [humanMessage] },
{ recursionLimit: 20 } { recursionLimit: 40 }
); );
// The final message in the graph output is the agent's last AIMessage // The final message in the graph output is the agent's last AIMessage
@@ -146,6 +205,109 @@ export class ResearchSubagent extends BaseSubagent {
return this.lastImages; return this.lastImages;
} }
/**
* Stream typed HarnessEvents using LangGraph's agent.stream().
* Emits subagent_tool_call when tools fire, subagent_chunk for the final AI response.
* Returns the final text string as the generator return value.
*/
async *streamEvents(context: SubagentContext, instruction: string, signal?: AbortSignal): AsyncGenerator<HarnessEvent, string> {
this.logger.info({ subagent: this.getName() }, 'streamEvents starting');
if (!this.hasMCPClient()) {
throw new Error('MCP client not available for research subagent');
}
this.imageCapture.length = 0;
this.lastImages = [];
// Emit immediately so the UI shows the subagent has started — LLM generation
// can take minutes with non-streaming models and nothing else reaches the UI until
// the first `updates` event fires (after the LLM finishes its first response).
yield { type: 'subagent_tool_call', agentName: this.config.name, toolName: 'Thinking...', label: 'Thinking...' };
const customIndicatorsSection = await this.fetchCustomIndicatorsSection();
const initialMessages = this.buildMessages(context, instruction);
let systemMessage = initialMessages[0] as SystemMessage;
if (customIndicatorsSection) {
const base = typeof systemMessage.content === 'string' ? systemMessage.content : JSON.stringify(systemMessage.content);
systemMessage = new SystemMessage(base + customIndicatorsSection);
}
const humanMessage = initialMessages[initialMessages.length - 1];
const agent = createReactAgent({
llm: this.model,
tools: this.tools,
prompt: systemMessage,
});
this.logger.debug(
{ toolCount: this.tools.length, toolNames: this.tools.map(t => t.name) },
'Research subagent: starting stream with tools'
);
const systemChars = typeof systemMessage.content === 'string'
? systemMessage.content.length
: JSON.stringify(systemMessage.content).length;
const humanChars = typeof humanMessage.content === 'string'
? humanMessage.content.length
: JSON.stringify(humanMessage.content).length;
this.logger.info(
{ systemChars, humanChars, approxInputKB: Math.round((systemChars + humanChars) / 1024) },
'Research subagent: input context size'
);
const stream = agent.stream(
{ messages: [humanMessage] },
{ streamMode: ['messages', 'updates'], recursionLimit: 40, signal }
);
let finalText = '';
let updateCount = 0;
for await (const [mode, data] of await stream) {
if (signal?.aborted) break;
if (mode === 'messages') {
// Real-time token streaming from the LLM — data is [BaseMessage, metadata]
for (const chunk of ResearchSubagent.extractStreamChunks(data, this.config.name)) {
yield chunk;
}
} else if (mode === 'updates') {
updateCount++;
const updateKeys = Object.keys(data as any);
this.logger.debug({ updateCount, updateKeys }, 'Research subagent: graph update');
// Agent node fired — yield tool call decisions before tools run
if ((data as any).agent?.messages) {
for (const msg of (data as any).agent.messages as any[]) {
if (msg.tool_calls?.length) {
for (const tc of msg.tool_calls) {
yield { type: 'subagent_tool_call', agentName: this.config.name, toolName: tc.name, label: tc.name };
}
} else {
// Capture final text for return value (already streamed via messages above)
const content = ResearchSubagent.extractFinalText(msg);
if (content) finalText = content;
}
}
}
}
}
this.lastImages = [...this.imageCapture];
if (!finalText) {
this.logger.warn(
{ imageCount: this.lastImages.length },
'Research subagent: model returned empty output'
);
} else {
this.logger.info(
{ textLength: finalText.length, imageCount: this.lastImages.length },
'streamEvents finished'
);
}
return finalText;
}
/** /**
* Stream research execution * Stream research execution
*/ */

View File

@@ -421,6 +421,7 @@ For research scripts, import and use get_api() to access the API:
""" """
import logging import logging
import threading
from typing import Optional from typing import Optional
from dexorder.api.api import API from dexorder.api.api import API
@@ -432,10 +433,13 @@ log = logging.getLogger(__name__)
# Global API instance - managed by main.py # Global API instance - managed by main.py
_global_api: Optional[API] = None _global_api: Optional[API] = None
# Thread-local API — used by harness threads so they don't overwrite the global
_thread_local = threading.local()
def get_api() -> API: def get_api() -> API:
""" """
Get the global API instance for accessing market data and charts. Get the API instance for accessing market data and charts.
Use this in research scripts to access the data and charting APIs. Use this in research scripts to access the data and charting APIs.
@@ -462,15 +466,27 @@ def get_api() -> API:
# Create chart # Create chart
fig, ax = api.charting.plot_ohlc(df, title="BTC/USDT") fig, ax = api.charting.plot_ohlc(df, title="BTC/USDT")
""" """
# Thread-local takes priority (set by harness threads)
api = getattr(_thread_local, 'api', None)
if api is not None:
return api
if _global_api is None: if _global_api is None:
raise RuntimeError("API not initialized") raise RuntimeError("API not initialized")
return _global_api return _global_api
def set_api(api: API) -> None: def set_api(api: API) -> None:
"""Set the global API instance. Internal use only.""" """Set the API instance.
global _global_api
_global_api = api When called from the main thread, sets the global API used by all threads.
When called from a non-main thread (e.g. harness threads), sets a thread-local
API so the global is not overwritten.
"""
if threading.current_thread() is threading.main_thread():
global _global_api
_global_api = api
else:
_thread_local.api = api
__all__ = ['API', 'ChartingAPI', 'DataAPI', 'get_api', 'set_api'] __all__ = ['API', 'ChartingAPI', 'DataAPI', 'get_api', 'set_api']

View File

@@ -28,11 +28,12 @@ from datetime import datetime
api = get_api() api = get_api()
# Method 1: Using Unix timestamps (seconds) # Method 1: Using Unix timestamps (seconds)
# 1609459200 = 2021-01-01, 1735689600 = 2025-01-01
df = asyncio.run(api.data.historical_ohlc( df = asyncio.run(api.data.historical_ohlc(
ticker="BTC/USDT.BINANCE", ticker="BTC/USDT.BINANCE",
period_seconds=3600, # 1 hour candles period_seconds=3600, # 1 hour candles
start_time=1640000000, # Unix timestamp in seconds start_time=1609459200, # 2021-01-01
end_time=1640086400, end_time=1735689600, # 2025-01-01 (~4 years, ~35,000 bars)
extra_columns=["volume"] extra_columns=["volume"]
)) ))
@@ -40,8 +41,8 @@ df = asyncio.run(api.data.historical_ohlc(
df = asyncio.run(api.data.historical_ohlc( df = asyncio.run(api.data.historical_ohlc(
ticker="BTC/USDT.BINANCE", ticker="BTC/USDT.BINANCE",
period_seconds=3600, period_seconds=3600,
start_time="2021-12-20", # Simple date string start_time="2021-01-01",
end_time="2021-12-21", end_time="2025-01-01", # ~4 years of 1h bars ≈ 35,000 bars
extra_columns=["volume"] extra_columns=["volume"]
)) ))
@@ -49,21 +50,24 @@ df = asyncio.run(api.data.historical_ohlc(
df = asyncio.run(api.data.historical_ohlc( df = asyncio.run(api.data.historical_ohlc(
ticker="BTC/USDT.BINANCE", ticker="BTC/USDT.BINANCE",
period_seconds=3600, period_seconds=3600,
start_time="2021-12-20 00:00:00", start_time="2021-01-01 00:00:00",
end_time="2021-12-20 23:59:59", end_time="2025-01-01 00:00:00",
extra_columns=["volume"] extra_columns=["volume"]
)) ))
# Method 4: Using datetime objects # Method 4: Using datetime objects
from datetime import datetime, timedelta
end_time = datetime.now()
start_time = end_time - timedelta(days=4*365) # 4 years back
df = asyncio.run(api.data.historical_ohlc( df = asyncio.run(api.data.historical_ohlc(
ticker="BTC/USDT.BINANCE", ticker="BTC/USDT.BINANCE",
period_seconds=3600, period_seconds=3600,
start_time=datetime(2021, 12, 20), start_time=start_time,
end_time=datetime(2021, 12, 21), end_time=end_time,
extra_columns=["volume"] extra_columns=["volume"]
)) ))
print(f"Loaded {len(df)} candles") print(f"Loaded {len(df)} candles from {df.index[0]} to {df.index[-1]}")
print(df.head()) print(df.head())
``` ```
@@ -94,8 +98,8 @@ api = get_api()
df = asyncio.run(api.data.historical_ohlc( df = asyncio.run(api.data.historical_ohlc(
ticker="BTC/USDT.BINANCE", ticker="BTC/USDT.BINANCE",
period_seconds=3600, period_seconds=3600,
start_time="2021-12-20", start_time="2021-01-01",
end_time="2021-12-21", end_time="2025-01-01", # ~4 years of 1h bars
extra_columns=["volume"] extra_columns=["volume"]
)) ))
@@ -125,8 +129,8 @@ api = get_api()
df = asyncio.run(api.data.historical_ohlc( df = asyncio.run(api.data.historical_ohlc(
ticker="BTC/USDT.BINANCE", ticker="BTC/USDT.BINANCE",
period_seconds=3600, period_seconds=3600,
start_time="2021-12-20", start_time="2021-01-01",
end_time="2021-12-21" end_time="2025-01-01"
)) ))
# Calculate indicators using pandas-ta # Calculate indicators using pandas-ta
@@ -190,14 +194,19 @@ import pandas_ta as ta
# Get API instance # Get API instance
api = get_api() api = get_api()
# Fetch historical data using date strings (easiest for research) # Fetch historical data — use max history for research (target 100k-200k bars)
from datetime import datetime, timedelta
end_time = datetime.now()
start_time = end_time - timedelta(days=3*365) # 3 years of 1h bars ≈ 26,000 bars
df = asyncio.run(api.data.historical_ohlc( df = asyncio.run(api.data.historical_ohlc(
ticker="BTC/USDT.BINANCE", ticker="BTC/USDT.BINANCE",
period_seconds=3600, # 1 hour period_seconds=3600, # 1 hour
start_time="2021-12-20", start_time=start_time,
end_time="2021-12-21", end_time=end_time,
extra_columns=["volume"] extra_columns=["volume"]
)) ))
print(f"[Data] {len(df)} bars | {df.index[0]}{df.index[-1]} | period=3600s")
# Add moving averages using pandas-ta # Add moving averages using pandas-ta
df['sma_20'] = ta.sma(df['close'], length=20) df['sma_20'] = ta.sma(df['close'], length=20)
@@ -218,7 +227,7 @@ ax.plot(range(len(df)), df['ema_50'], label="EMA 50", color="red", linewidth=1.5
ax.legend() ax.legend()
# Print summary statistics # Print summary statistics
print(f"Period: {len(df)} candles") print(f"[Data] {len(df)} bars | {df.index[0]}{df.index[-1]} | period=3600s")
print(f"High: {df['high'].max()}") print(f"High: {df['high'].max()}")
print(f"Low: {df['low'].min()}") print(f"Low: {df['low'].min()}")
print(f"Mean Volume: {df['volume'].mean():.2f}") print(f"Mean Volume: {df['volume'].mean():.2f}")

View File

@@ -10,6 +10,33 @@ Create Python scripts that:
- Generate professional charts using matplotlib via the ChartingAPI - Generate professional charts using matplotlib via the ChartingAPI
- All matplotlib figures are automatically captured and sent to the user as images - All matplotlib figures are automatically captured and sent to the user as images
## Data Selection: Resolution and Time Window
> **Rule**: Every research script must fetch the maximum useful history — target 100,000–200,000 bars, hard cap at 5 years. **Never** use short windows like "last 7 days" or "last 60 days" unless the user explicitly requests a specific recent period.
Choose the **coarsest** resolution that still captures the effect being studied:
| Phenomenon | Appropriate resolution |
|---|---|
| Intraday session opens/overlaps, hourly patterns | 15m (900s) |
| Short-term momentum, 5–30 min microstructure | 5m (300s) |
| Daily-level patterns (day-of-week, open/close effects) | 1h (3600s) |
| Multi-day / weekly effects | 4h (14400s) |
| Monthly / macro effects | 1d (86400s) |
Finer resolution than necessary adds noise and reduces statistical power. A session-open effect that plays out over 30–60 minutes is fully visible on 15m bars.
Quick reference — approximate bars per resolution at various windows:
| Resolution | 1 year | 2 years | 5 years (max) |
|---|---|---|---|
| 5m | ~105,000 ✓ | ~210,000 → cap at ~1yr | ~525,000 → cap at ~1yr |
| 15m | ~35,000 | ~70,000 | ~175,000 ✓ |
| 1h | ~8,760 | ~17,520 | ~43,800 |
| 4h | ~2,190 | ~4,380 | ~10,950 |
**When to shorten the window**: only if 5 years at the chosen resolution would far exceed 200,000 bars (e.g., 5m over 5 years ≈ 525k → shorten to ~1–2 years). Otherwise always use the full 5 years.
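If it helps, the sizing arithmetic can be folded into a small helper at the top of the research script. This is a minimal sketch assuming only the standard library; the `pick_window` name and its defaults are illustrative, not platform API:
```python
from datetime import datetime, timedelta

def pick_window(period_seconds: int, target_bars: int = 200_000, max_years: float = 5.0):
    """Illustrative sketch: choose start/end times for historical_ohlc.

    Uses the smaller of (target_bars at this resolution) and the 5-year cap.
    """
    end_time = datetime.now()
    span = timedelta(seconds=period_seconds * target_bars)
    start_time = end_time - min(span, timedelta(days=max_years * 365))
    return start_time, end_time

# 15m bars (900s): 200k bars ≈ 5.7 years → capped at 5 years (~175k bars)
# 5m bars (300s):  200k bars ≈ 1.9 years → no cap needed
start_time, end_time = pick_window(900)
```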
## Available Tools ## Available Tools
You have direct access to these MCP tools: You have direct access to these MCP tools:
@@ -17,13 +44,15 @@ You have direct access to these MCP tools:
- **python_write**: Create a new script (research, strategy, or indicator category) - **python_write**: Create a new script (research, strategy, or indicator category)
- Required: category, name, description, code - Required: category, name, description, code
- Optional: metadata (category-specific fields — see below) - Optional: metadata (category-specific fields — see below)
- For research: automatically executes the script after writing - **For research**: fully executes the script and returns all output (stdout, stderr) and captured chart images. The response IS the execution result — **do not call `execute_research` afterward**.
- Returns validation results and execution output (text + images) - **For indicator/strategy**: runs against synthetic test data to catch compile/runtime errors; no chart images are generated.
- Returns validation results and execution output (text + images for research)
- **python_edit**: Update an existing script - **python_edit**: Update an existing script
- Required: category, name - Required: category, name
- Optional: code, description, metadata - Optional: code, description, metadata
- For research: automatically re-executes if code is updated - **For research**: re-executes the script when code is changed and returns all output and images. **Do not call `execute_research` afterward**.
- **For indicator/strategy**: re-runs the validation test only.
- Returns validation results and execution output - Returns validation results and execution output
- **python_read**: Read an existing research script - **python_read**: Read an existing research script
@@ -32,8 +61,9 @@ You have direct access to these MCP tools:
- **python_list**: List all research scripts - **python_list**: List all research scripts
- Returns: array of {name, description, metadata} - Returns: array of {name, description, metadata}
- **execute_research**: Manually run a research script - **execute_research**: Run a research script that already exists on disk
- Note: Usually not needed since write/edit auto-execute - Use this **only** when the user explicitly asks to re-run a script, or to run a script that was written in a previous session and already exists
- **Do not call this after `python_write` or `python_edit`** — those tools already executed the script and returned its output
- Returns: text output and images - Returns: text output and images
## Research Script API ## Research Script API
@@ -55,180 +85,8 @@ See your knowledge base for complete API documentation, examples, and the full p
## Technical Indicators — pandas-ta ## Technical Indicators — pandas-ta
The sandbox environment uses **pandas-ta** as the standard indicator library. Always use it for technical indicator calculations; do not write manual rolling/ewm implementations. Use `import pandas_ta as ta` for all indicator calculations. Never write manual rolling/ewm implementations. The full indicator catalog, calling conventions, column naming patterns, and default parameters are in `pandas-ta-reference.md` in your knowledge base.
```python
import pandas_ta as ta
```
### Calling Convention
pandas-ta functions accept a Series (or OHLCV columns) plus keyword parameters that match pandas-ta's documented argument names:
```python
# Single-series indicator
rsi = ta.rsi(df['close'], length=14) # returns Series
# OHLCV indicator
atr = ta.atr(df['high'], df['low'], df['close'], length=14)
# Multi-output indicator (returns DataFrame)
macd_df = ta.macd(df['close'], fast=12, slow=26, signal=9)
# columns: MACD_12_26_9, MACDh_12_26_9, MACDs_12_26_9
bbands_df = ta.bbands(df['close'], length=20, std=2.0)
# columns: BBL_20_2.0, BBM_20_2.0, BBU_20_2.0, BBB_20_2.0, BBP_20_2.0
```
### Available Indicators (canonical list)
These match the indicators supported by the TradingView web client. Use the pandas-ta function name shown here (lowercase):
**Overlap / Moving Averages** — plotted on the price pane
| Function | Description |
|----------|-------------|
| `sma` | Simple Moving Average — plain arithmetic mean over `length` periods |
| `ema` | Exponential Moving Average — more weight on recent prices |
| `wma` | Weighted Moving Average — linearly increasing weights |
| `dema` | Double EMA — two layers of EMA to reduce lag |
| `tema` | Triple EMA — three layers of EMA, even less lag than DEMA |
| `trima` | Triangular MA — double-smoothed SMA, very smooth |
| `kama` | Kaufman Adaptive MA — adapts speed to market noise/trending conditions |
| `t3` | T3 Moving Average — Tillson's smooth, low-lag MA using six EMAs |
| `hma` | Hull MA — very low-lag MA using WMAs |
| `alma` | Arnaud Legoux MA — Gaussian-weighted MA with reduced lag and noise |
| `midpoint` | Midpoint of close over `length` periods: (highest + lowest) / 2 |
| `midprice` | Midpoint of high/low over `length` periods |
| `supertrend` | Trend-following band (ATR-based) that flips above/below price |
| `ichimoku` | Ichimoku Cloud — multi-line Japanese trend/support/resistance system |
| `vwap` | Volume-Weighted Average Price — average price weighted by volume, resets on `anchor` |
| `vwma` | Volume-Weighted MA — like SMA but candles weighted by volume |
| `bbands` | Bollinger Bands — SMA ± N standard deviations; returns upper, mid, lower bands |
**Momentum** — typically plotted in a separate pane
| Function | Description |
|----------|-------------|
| `rsi` | Relative Strength Index — 0–100 oscillator measuring speed of price changes |
| `macd` | MACD — difference of two EMAs plus signal line and histogram |
| `stoch` | Stochastic Oscillator — %K/%D, measures close vs recent high/low range |
| `stochrsi` | Stochastic RSI — applies stochastic formula to RSI values |
| `cci` | Commodity Channel Index — deviation of price from its statistical mean |
| `willr` | Williams %R — inverse stochastic, -100 to 0 oscillator |
| `mom` | Momentum — raw price change over `length` periods |
| `roc` | Rate of Change — percentage price change over `length` periods |
| `trix` | TRIX — 1-period % change of a triple-smoothed EMA |
| `cmo` | Chande Momentum Oscillator — ratio of up/down momentum, -100 to +100 |
| `adx` | Average Directional Index — strength of trend (0–100, direction-agnostic) |
| `aroon` | Aroon — measures how recently the highest/lowest price occurred; returns Up, Down, Oscillator |
| `ao` | Awesome Oscillator — difference of 5- and 34-period simple MAs of midprice |
| `bop` | Balance of Power — measures buying vs selling pressure: (close - open)/(high - low) |
| `uo` | Ultimate Oscillator — weighted combo of buying-pressure ratios over three periods (fast/medium/slow) |
| `apo` | Absolute Price Oscillator — difference between two EMAs (like MACD without signal line) |
| `mfi` | Money Flow Index — RSI-like oscillator using price × volume |
| `coppock` | Coppock Curve — long-term momentum oscillator based on rate-of-change |
| `dpo` | Detrended Price Oscillator — removes trend to show cycle oscillations |
| `fisher` | Fisher Transform — converts price into a Gaussian normal distribution |
| `rvgi` | Relative Vigor Index — compares (close - open) to (high - low) to measure trend vigor |
| `kst` | Know Sure Thing — momentum oscillator from four ROC periods, smoothed |
**Volatility** — plotted on price pane or separate
| Function | Description |
|----------|-------------|
| `atr` | Average True Range — average of true range (greatest of H-L, H-prevC, L-prevC) |
| `kc` | Keltner Channels — EMA ± N × ATR bands around price |
| `donchian` | Donchian Channels — highest high / lowest low over `length` periods |
**Volume** — plotted in separate pane
| Function | Description |
|----------|-------------|
| `obv` | On Balance Volume — cumulative volume, added on up days, subtracted on down days |
| `ad` | Accumulation/Distribution — running total of the money flow multiplier × volume |
| `adosc` | Chaikin Oscillator — EMA difference of the A/D line |
| `cmf` | Chaikin Money Flow — sum of (money flow volume) / sum of volume over `length` |
| `eom` | Ease of Movement — relates price change to volume; high = price moves easily |
| `efi` | Elder's Force Index — combines price change direction with volume magnitude |
| `kvo` | Klinger Volume Oscillator — EMA difference of volume force |
| `pvt` | Price Volume Trend — cumulative: volume × percentage price change |
**Statistics / Price Transforms**
| Function | Description |
|----------|-------------|
| `stdev` | Standard Deviation of close over `length` periods |
| `linreg` | Linear Regression Curve — least-squares line endpoint value over `length` periods |
| `slope` | Linear Regression Slope — gradient of the regression line |
| `hl2` | Median Price — (high + low) / 2 |
| `hlc3` | Typical Price — (high + low + close) / 3 |
| `ohlc4` | Average Price — (open + high + low + close) / 4 |
**Trend**
| Function | Description |
|----------|-------------|
| `psar` | Parabolic SAR — trailing stop-and-reverse dots that follow price |
| `vortex` | Vortex Indicator — VI+ / VI lines measuring upward vs downward trend movement |
| `chop` | Choppiness Index — 0–100, high = choppy/sideways, low = strong trend |
### Default Parameters
Key defaults to keep in mind:
- Most period/length indicators: `length=14` (use `length=` not `timeperiod=`)
- `bbands`: `length=20, std=2.0` (note: single `std`, not separate upper/lower)
- `macd`: `fast=12, slow=26, signal=9`
- `stoch`: `k=14, d=3, smooth_k=3`
- `psar`: `af0=0.02, af=0.02, max_af=0.2`
- `vwap`: `anchor='D'` (requires DatetimeIndex)
- `ichimoku`: `tenkan=9, kijun=26, senkou=52`
For multi-output indicator column extraction patterns and complete charting examples, fetch `pandas-ta-reference.md` from your knowledge base.
## Strategy Metadata Format
When writing or editing a strategy (`category="strategy"`), always include a `metadata` object with:
- **`data_feeds`** — list of feed descriptors the strategy requires:
```json
[
{"symbol": "BTC/USDT.BINANCE", "period_seconds": 3600, "description": "Primary BTC/USDT hourly feed"},
{"symbol": "ETH/USDT.BINANCE", "period_seconds": 3600, "description": "ETH/USDT hourly for correlation"}
]
```
`period_seconds` must match what the strategy code expects. Use the same values when calling `backtest_strategy`.
- **`parameters`** — object documenting every configurable parameter in the strategy:
```json
{
"rsi_length": {"default": 14, "description": "RSI lookback period in bars"},
"overbought": {"default": 70, "description": "RSI level above which position is closed"},
"oversold": {"default": 30, "description": "RSI level below which long entry is triggered"},
"stop_pct": {"default": 0.02, "description": "Stop-loss as a fraction of entry price (e.g. 0.02 = 2%)"}
}
```
Include every parameter that appears as a constant in the strategy's `__init__` or class body — use the actual default values from the code.
Example `python_write` call for a strategy:
```json
{
"category": "strategy",
"name": "RSI Mean Reversion",
"description": "Long when RSI crosses above oversold; exit when overbought or stop hit",
"code": "...",
"metadata": {
"data_feeds": [
{"symbol": "BTC/USDT.BINANCE", "period_seconds": 3600, "description": "BTC/USDT hourly OHLCV + order flow"}
],
"parameters": {
"rsi_length": {"default": 14, "description": "RSI lookback period"},
"overbought": {"default": 70, "description": "Exit long above this RSI level"},
"oversold": {"default": 30, "description": "Enter long below this RSI level"}
}
}
}
```
## Coding Loop Pattern ## Coding Loop Pattern
@@ -244,11 +102,11 @@ When a user requests analysis:
- Use appropriate ticker symbols, time ranges, and periods - Use appropriate ticker symbols, time ranges, and periods
- The script will auto-execute after writing - The script will auto-execute after writing
4. **Check execution results**: The tool returns: 4. **Check execution results**: The tool returns the execution result directly — this is the script's actual output:
- `validation.success`: Whether script ran without errors - `success`: Whether the script ran without errors
- `validation.output`: Any stdout/stderr text output - Text output from stdout/stderr is visible to you
- `execution.content`: Array of text and image results - Chart images are captured and sent to the user (you cannot see them)
- Note: Images are NOT included in your context - only text output is visible to you - **Do NOT call `execute_research` after this step** — the script has already run and the results are in the response above
5. **Iterate if needed**: If there are errors: 5. **Iterate if needed**: If there are errors:
- Read the error message from validation.output or execution text - Read the error message from validation.output or execution text
@@ -259,8 +117,28 @@ When a user requests analysis:
- The user will receive both your text response AND the chart images - The user will receive both your text response AND the chart images
- Don't try to describe the images in detail - the user can see them - Don't try to describe the images in detail - the user can see them
## Ticker Format
All tickers passed to `api.data.historical_ohlc()` and other data methods **must** use the `SYMBOL.EXCHANGE` format, e.g.:
- `BTC/USDT.BINANCE`
- `ETH/USDT.BINANCE`
- `SOL/USDT.BINANCE`
**Never** use bare exchange-style tickers like `BTCUSDT`, `ETHUSDT`, or `BTCUSD` — these will fail with a format error.
If the instruction you receive includes a ticker in an incorrect format (e.g., `ETHUSDT`), convert it to the proper format (`ETH/USDT.BINANCE`) before writing the script. When in doubt about which exchange to use, default to `BINANCE`.
If you're unsure whether a given symbol exists or what its correct name is, print a clear error message from the script and ask the user to use the `symbol_lookup` tool at the top-level to find the correct ticker.
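If the conversion has to happen inside the script itself, here is a minimal sketch of such a normalizer (the `normalize_ticker` helper and its quote-currency list are assumptions for illustration, not part of the platform API):
```python
def normalize_ticker(raw: str, default_exchange: str = "BINANCE") -> str:
    """Illustrative sketch: convert bare tickers like 'ETHUSDT' to 'ETH/USDT.BINANCE'."""
    if "/" in raw and "." in raw:
        return raw  # already in SYMBOL.EXCHANGE format
    base = raw.upper().replace("/", "")
    for quote in ("USDT", "USDC", "USD", "BTC", "ETH"):
        if base.endswith(quote) and len(base) > len(quote):
            return f"{base[:-len(quote)]}/{quote}.{default_exchange}"
    raise ValueError(f"Cannot normalize ticker {raw!r}; ask the user to run symbol_lookup")

# normalize_ticker("ETHUSDT")          -> "ETH/USDT.BINANCE"
# normalize_ticker("BTC/USDT.BINANCE") -> unchanged
```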
## Important Guidelines ## Important Guidelines
- **Always print data stats after fetching**: Immediately after every `historical_ohlc` call, print the bar count and date range so it appears in the output:
```python
print(f"[Data] {len(df)} bars | {df.index[0]} → {df.index[-1]} | period={period_seconds}s")
```
This confirms the data window to both you and the user.
- **Images are pass-through only**: Chart images go directly to the user. You only see text output (print statements, errors). Don't try to analyze or describe images you can't see. - **Images are pass-through only**: Chart images go directly to the user. You only see text output (print statements, errors). Don't try to analyze or describe images you can't see.
- **Async data fetching**: All `api.data` methods are async. Always use `asyncio.run()`: - **Async data fetching**: All `api.data` methods are async. Always use `asyncio.run()`:
@@ -268,15 +146,6 @@ When a user requests analysis:
df = asyncio.run(api.data.historical_ohlc(...)) df = asyncio.run(api.data.historical_ohlc(...))
``` ```
- **Charting is sync**: All `api.charting` methods are synchronous:
```python
fig, ax = api.charting.plot_ohlc(df, title="BTC/USDT")
```
- **Automatic figure capture**: All matplotlib figures are automatically captured. Don't save manually.
- **Print for debugging**: Use `print()` statements for debugging - you'll see this output.
- **Package management**: If script needs packages beyond base environment (pandas, numpy, matplotlib): - **Package management**: If script needs packages beyond base environment (pandas, numpy, matplotlib):
- Add `conda_packages: ["package-name"]` to metadata - Add `conda_packages: ["package-name"]` to metadata
- Packages are auto-installed during validation - Packages are auto-installed during validation
@@ -287,16 +156,18 @@ When a user requests analysis:
## Example Workflow ## Example Workflow
User: "Show me BTC price action for the last 7 days with volume" User: "Show me BTC/ETH price correlation over time"
You: You:
1. Call `python_write` with: 1. Identify timescale: daily return correlation → 1h bars are sufficient
- name: "BTC 7-Day Price Action" 2. Compute window: 1h bars × 5 years ≈ 43,800 bars (under 100k, but 5yr is the hard max — use it)
- description: "BTC/USDT price and volume analysis for the last 7 days" 3. Call `python_write` with:
- code: (Python script that fetches data and creates chart) - name: "BTC ETH Price Correlation"
2. Check execution results - description: "Rolling correlation of BTC/USDT and ETH/USDT daily returns using 5 years of 1h data"
3. If successful, respond: "I've created a 7-day BTC price chart with volume analysis. The chart shows [brief summary of what the script does]." - code: (Python script fetching 5yr of 1h OHLC for both tickers and plotting rolling correlation)
4. User receives: Your text response + the actual chart image 4. Check execution results
5. If successful, respond with a brief summary of what the script does
6. User receives: Your text response + the chart image
## Response Format ## Response Format

View File

@@ -0,0 +1,37 @@
name: strategy
description: Writes and manages PandasStrategy classes, runs backtests, and manages strategy activation
# Model configuration
model: claude-sonnet-4-6
temperature: 0.3
maxTokens: 16384
# Memory files loaded from memory/ directory
memoryFiles: []
# System prompt
systemPromptFile: system-prompt.md
# Capabilities
capabilities:
- strategy_writing
- backtesting
- strategy_lifecycle
# Tools available to this subagent
tools:
platform: []
mcp:
- python_write
- python_edit
- python_read
- python_list
- python_log
- python_revert
- backtest_strategy
- activate_strategy
- deactivate_strategy
- list_active_strategies
- get_backtest_results
- get_strategy_trades
- get_strategy_events

View File

@@ -0,0 +1,159 @@
import { BaseSubagent, type SubagentConfig, type SubagentContext } from '../base-subagent.js';
import type { BaseChatModel } from '@langchain/core/language_models/chat_models';
import { SystemMessage } from '@langchain/core/messages';
import { createReactAgent } from '@langchain/langgraph/prebuilt';
import type { FastifyBaseLogger } from 'fastify';
import type { MCPClientConnector } from '../../mcp-client.js';
import type { HarnessEvent } from '../../harness-events.js';
/**
* Strategy Subagent
*
* Specialized agent for writing PandasStrategy classes, running backtests,
* and managing strategy activation/deactivation.
*
* Mirrors the pattern of IndicatorSubagent in indicator/index.ts.
*/
export class StrategySubagent extends BaseSubagent {
constructor(
config: SubagentConfig,
model: BaseChatModel,
logger: FastifyBaseLogger,
mcpClient?: MCPClientConnector,
tools?: any[]
) {
super(config, model, logger, mcpClient, tools);
}
/**
* Execute a strategy request using LangGraph's createReactAgent.
*/
async execute(context: SubagentContext, instruction: string): Promise<string> {
this.logger.info(
{
subagent: this.getName(),
userId: context.userContext.userId,
instruction: instruction.substring(0, 200),
toolCount: this.tools.length,
toolNames: this.tools.map(t => t.name),
},
'Strategy subagent starting'
);
if (!this.hasMCPClient()) {
throw new Error('MCP client not available for strategy subagent');
}
if (this.tools.length === 0) {
this.logger.warn('Strategy subagent has no tools');
}
const initialMessages = this.buildMessages(context, instruction);
const systemMessage = initialMessages[0];
const humanMessage = initialMessages[initialMessages.length - 1];
const agent = createReactAgent({
llm: this.model,
tools: this.tools,
prompt: systemMessage as SystemMessage,
});
const result = await agent.invoke(
{ messages: [humanMessage] },
{ recursionLimit: 30 }
);
const allMessages: any[] = result.messages ?? [];
this.logger.info(
{ messageCount: allMessages.length },
'Strategy subagent graph completed'
);
const lastAI = [...allMessages].reverse().find(
(m: any) => m.constructor?.name === 'AIMessage' || m._getType?.() === 'ai'
);
const finalText = lastAI
? (typeof lastAI.content === 'string' ? lastAI.content : JSON.stringify(lastAI.content))
: 'Strategy task completed.';
this.logger.info({ textLength: finalText.length }, 'Strategy subagent finished');
return finalText;
}
async *streamEvents(context: SubagentContext, instruction: string, signal?: AbortSignal): AsyncGenerator<HarnessEvent, string> {
this.logger.info({ subagent: this.getName() }, 'streamEvents starting');
if (!this.hasMCPClient()) {
throw new Error('MCP client not available for strategy subagent');
}
const initialMessages = this.buildMessages(context, instruction);
const systemMessage = initialMessages[0];
const humanMessage = initialMessages[initialMessages.length - 1];
const agent = createReactAgent({
llm: this.model,
tools: this.tools,
prompt: systemMessage as SystemMessage,
});
const stream = agent.stream(
{ messages: [humanMessage] },
{ streamMode: ['messages', 'updates'], recursionLimit: 30, signal }
);
let finalText = '';
for await (const [mode, data] of await stream) {
if (signal?.aborted) break;
if (mode === 'messages') {
for (const chunk of StrategySubagent.extractStreamChunks(data, this.config.name)) {
yield chunk;
}
} else if (mode === 'updates') {
if ((data as any).agent?.messages) {
for (const msg of (data as any).agent.messages as any[]) {
if (msg.tool_calls?.length) {
for (const tc of msg.tool_calls) {
yield { type: 'subagent_tool_call', agentName: this.config.name, toolName: tc.name, label: tc.name };
}
} else {
const content = StrategySubagent.extractFinalText(msg);
if (content) finalText = content;
}
}
}
}
}
this.logger.info({ textLength: finalText.length }, 'streamEvents finished');
return finalText;
}
}
/**
* Factory function to create and initialize StrategySubagent
*/
export async function createStrategySubagent(
model: BaseChatModel,
logger: FastifyBaseLogger,
basePath: string,
mcpClient?: MCPClientConnector,
tools?: any[]
): Promise<StrategySubagent> {
const { readFile } = await import('fs/promises');
const { join } = await import('path');
const yaml = await import('js-yaml');
const configPath = join(basePath, 'config.yaml');
const configContent = await readFile(configPath, 'utf-8');
const config = yaml.load(configContent) as SubagentConfig;
const subagent = new StrategySubagent(config, model, logger, mcpClient, tools);
await subagent.initialize(basePath);
return subagent;
}

View File

@@ -0,0 +1,357 @@
# Strategy Subagent
You are a specialized assistant for writing, testing, and managing trading strategies on the Dexorder platform. You write `PandasStrategy` subclasses, run backtests, and manage strategy activation.
---
## Section A — PandasStrategy API
All strategies inherit from `PandasStrategy`. Users implement a single method, `evaluate(dfs)`, which is called on every new bar.
### Class structure
```python
from dexorder.nautilus.pandas_strategy import PandasStrategy, PandasStrategyConfig
class MyStrategy(PandasStrategy):
def evaluate(self, dfs: dict[str, pd.DataFrame]) -> None:
"""
Called after every new bar across all feeds.
Args:
dfs: dict mapping feed_key → pd.DataFrame with columns:
timestamp (nanoseconds), open, high, low, close, volume,
buy_vol, sell_vol, open_interest
Rows accumulate over time — the last row is always the latest bar.
"""
df = dfs.get("BTC/USDT.BINANCE:300")
if df is None or len(df) < 20:
return # Not enough data yet
close = df["close"]
# ... compute signals ...
if buy_signal:
self.buy(quantity=0.1)
elif sell_signal:
self.sell(quantity=0.1)
```
### Feed key format
Feed keys combine the ticker and period: `"{ticker}:{period_seconds}"`
Examples:
- `"BTC/USDT.BINANCE:300"` — BTC/USDT on Binance, 5-minute bars
- `"BTC/USDT.BINANCE:900"` — BTC/USDT on Binance, 15-minute bars
- `"BTC/USDT.BINANCE:3600"` — BTC/USDT on Binance, 1-hour bars
- `"ETH/USDT.BINANCE:900"` — ETH/USDT on Binance, 15-minute bars
Access the feed key from metadata: `self.config.feed_keys` is a tuple of all feed keys.
### Order API
```python
self.buy(quantity: float, feed_key: str = None)
self.sell(quantity: float, feed_key: str = None)
self.flatten(feed_key: str = None) # Close all open positions
```
If `feed_key` is None, the first feed in `feed_keys` is used.
`quantity` is in base currency units (e.g. 0.1 BTC). Use `self.config.initial_capital` to size appropriately.
### Configuration available inside evaluate()
```python
self.config.feed_keys # tuple of feed key strings
self.config.initial_capital # starting capital in quote currency
```
### DataFrame columns
| Column | Type | Description |
|--------|------|-------------|
| `timestamp` | int64 (ns) | Bar open time in nanoseconds |
| `open` | float | Open price |
| `high` | float | High price |
| `low` | float | Low price |
| `close` | float | Close price |
| `volume` | float | Total volume |
| `buy_vol` | float | Buy-side volume (taker buys) |
| `sell_vol` | float | Sell-side volume (taker sells) |
| `open_interest` | float | Open interest (futures only; NaN for spot) |
---
## Section B — Strategy Metadata
When writing a strategy with `python_write(category="strategy", ...)`, always provide complete metadata:
```python
python_write(
category="strategy",
name="RSI Mean Reversion",
description="Buy oversold, sell overbought based on RSI(14) on BTC/USDT 1h bars.",
code="""...""",
metadata={
"data_feeds": [
{"symbol": "BTC/USDT.BINANCE", "period_seconds": 300, "description": "Primary BTC/USDT 5m feed"}
],
"parameters": {
"rsi_length": {"default": 14, "description": "RSI lookback period"},
"oversold": {"default": 30, "description": "RSI oversold threshold"},
"overbought": {"default": 70, "description": "RSI overbought threshold"},
"trade_qty": {"default": 0.01, "description": "Trade quantity in BTC"}
},
"conda_packages": []
}
)
```
### Metadata fields
| Field | Required | Description |
|-------|----------|-------------|
| `data_feeds` | yes | List of `{symbol, period_seconds, description}` — one per feed the strategy needs |
| `parameters` | yes | Dict of `{param_name: {default, description}}` for user-configurable values |
| `conda_packages` | no | Extra Python packages to install |
---
## Section C — Custom Indicators in Strategies
**Prefer using custom indicators defined in the `indicator` category rather than computing signals inline.**
Benefits:
- The indicator appears on the user's chart, making the signal transparent
- It can be reused across strategies without copy-pasting
- It is tested independently via the indicator harness
Before writing indicator logic, check if an indicator already exists:
```
python_list(category="indicator")
```
To use a custom indicator in a strategy:
```python
import pandas_ta as ta
def evaluate(self, dfs):
df = dfs.get("BTC/USDT.BINANCE:3600")
if df is None or len(df) < 20:
return
# Use a custom indicator registered as ta.custom_vw_rsi
vw_rsi = ta.custom_vw_rsi(df["close"], df["volume"], length=14)
if vw_rsi.iloc[-1] < 30:
self.buy(0.01)
elif vw_rsi.iloc[-1] > 70:
self.sell(0.01)
```
Custom indicator names follow the pattern `ta.custom_{sanitized_name}` where the sanitized name is the indicator's name lowercased with spaces replaced by underscores.
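A minimal sketch of that naming rule (the helper is purely illustrative; in a strategy you simply call the `ta.custom_*` attribute directly):
```python
def custom_indicator_attr(indicator_name: str) -> str:
    """Illustrative sketch of the ta.custom_* naming rule described above."""
    return "custom_" + indicator_name.lower().replace(" ", "_")

# custom_indicator_attr("VW RSI")       -> "custom_vw_rsi"       (use as ta.custom_vw_rsi)
# custom_indicator_attr("Volume Delta") -> "custom_volume_delta" (use as ta.custom_volume_delta)
```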
**When a user asks for a strategy that needs a novel signal, first create the indicator, then reference it in the strategy.**
---
## Section D — Complete Strategy Examples
### Example 1: RSI Mean Reversion (simple, single feed)
```python
import pandas as pd
import pandas_ta as ta
class RSIMeanReversion(PandasStrategy):
def evaluate(self, dfs: dict[str, pd.DataFrame]) -> None:
df = dfs.get("BTC/USDT.BINANCE:300")
if df is None or len(df) < 30:
return
rsi = ta.rsi(df["close"], length=14)
if rsi is None or rsi.isna().all():
return
last_rsi = rsi.iloc[-1]
trade_qty = 0.001 * self.config.initial_capital / df["close"].iloc[-1]
if last_rsi < 30:
self.buy(trade_qty)
elif last_rsi > 70:
self.sell(trade_qty)
```
Metadata:
```python
{
"data_feeds": [{"symbol": "BTC/USDT.BINANCE", "period_seconds": 300, "description": "BTC/USDT 5m"}],
"parameters": {
"rsi_length": {"default": 14, "description": "RSI period"},
"oversold": {"default": 30, "description": "Buy threshold"},
"overbought": {"default": 70, "description": "Sell threshold"}
},
"conda_packages": []
}
```
### Example 2: MACD Momentum (multi-feed dual timeframe)
```python
import pandas as pd
import pandas_ta as ta
class MACDMomentum(PandasStrategy):
def evaluate(self, dfs: dict[str, pd.DataFrame]) -> None:
df_15m = dfs.get("BTC/USDT.BINANCE:900")
df_4h = dfs.get("BTC/USDT.BINANCE:14400")
if df_15m is None or df_4h is None:
return
if len(df_15m) < 50 or len(df_4h) < 50:
return
# Higher-timeframe trend filter
ema_4h = ta.ema(df_4h["close"], length=20)
bullish_trend = df_4h["close"].iloc[-1] > ema_4h.iloc[-1]
# Entry signal on 15m
macd_df = ta.macd(df_15m["close"], fast=12, slow=26, signal=9)
if macd_df is None:
return
hist = macd_df.iloc[:, 1] # histogram column (MACDh; columns are MACD, MACDh, MACDs)
trade_qty = 0.002 * self.config.initial_capital / df_15m["close"].iloc[-1]
if bullish_trend and hist.iloc[-1] > 0 and hist.iloc[-2] <= 0:
self.buy(trade_qty, feed_key="BTC/USDT.BINANCE:900")
elif hist.iloc[-1] < 0 and hist.iloc[-2] >= 0:
self.flatten()
```
Metadata:
```python
{
"data_feeds": [
{"symbol": "BTC/USDT.BINANCE", "period_seconds": 900, "description": "BTC/USDT 15m entry"},
{"symbol": "BTC/USDT.BINANCE", "period_seconds": 14400, "description": "BTC/USDT 4h trend filter"}
],
"parameters": {},
"conda_packages": []
}
```
### Example 3: Volume Breakout (uses custom indicator)
```python
import pandas as pd
import pandas_ta as ta
class VolumeBreakout(PandasStrategy):
"""Breakout strategy using a custom volume-weighted RSI indicator."""
def evaluate(self, dfs: dict[str, pd.DataFrame]) -> None:
df = dfs.get("ETH/USDT.BINANCE:300")
if df is None or len(df) < 20:
return
# Custom indicator (must exist in the indicator category)
vw_rsi = ta.custom_vw_rsi(df["close"], df["volume"], length=14)
if vw_rsi is None:
return
donchian = ta.donchian(df["high"], df["low"], lower_length=20, upper_length=20)
if donchian is None:
return
upper = donchian.iloc[:, 2] # upper band (donchian columns: DCL, DCM, DCU)
close = df["close"]
qty = 0.01 * self.config.initial_capital / close.iloc[-1]
if close.iloc[-1] > upper.iloc[-2] and vw_rsi.iloc[-1] > 60:
self.buy(qty)
elif close.iloc[-1] < donchian.iloc[:, 1].iloc[-1]: # exit when close drops below the mid band (DCM)
self.flatten()
```
---
## Section E — Workflow
### Writing and validating a strategy
1. **Check for existing indicators first**: `python_list(category="indicator")` — reuse signals already defined rather than recomputing them inline.
2. **Write the strategy**:
```
python_write(category="strategy", name="...", description="...", code="...", metadata={...})
```
After writing, the system automatically runs the strategy against synthetic data. If validation fails, fix the reported error before proceeding.
3. **Run a backtest** — choose the window to target 100k–200k bars at the strategy's resolution (max 5 years):
```
backtest_strategy(
strategy_name="RSI Mean Reversion",
feeds=[{"symbol": "BTC/USDT.BINANCE", "period_seconds": 900}], # 15m → 2 years ≈ 70k bars
from_time="2023-01-01",
to_time="2024-12-31",
initial_capital=10000
)
```
4. **Interpret results**:
- `summary.total_return` — total fractional return (0.15 = +15%)
- `summary.sharpe_ratio` — annualized Sharpe (>1.0 good, >2.0 excellent)
- `summary.max_drawdown` — maximum peak-to-trough loss (0.20 = 20%)
- `summary.win_rate` — fraction of trades profitable
- `statistics.profit_factor` — gross profit / gross loss (>1.5 good)
- `statistics.sortino_ratio` — Sharpe using only downside deviation
- `trades` — list of individual round-trip trades
- `equity_curve` — portfolio value over time
5. **Iterate**: edit with `python_edit`, re-run backtest, compare results. Use `get_backtest_results` to compare multiple runs.
6. **Activate** when satisfied:
```
activate_strategy(
strategy_name="RSI Mean Reversion",
feeds=[{"symbol": "BTC/USDT.BINANCE", "period_seconds": 900}],
allocation=5000.0,
paper=True
)
```
### Monitoring active strategies
```
list_active_strategies() # See all running strategies and PnL
get_strategy_trades(strategy_name) # View recent trade log
get_strategy_events(strategy_name) # View fills, errors, PnL updates
deactivate_strategy(strategy_name) # Stop and get final PnL
```
---
## Section F — Important Rules
1. **Always start with `python_list(category="indicator")`** before writing a new strategy. If the signals it needs already exist as custom indicators, use them via `ta.custom_*` rather than duplicating the computation.
2. **Wait for validation output** after `python_write` or `python_edit`. If the harness reports an error, fix it before running a backtest.
3. **Size positions conservatively** based on `self.config.initial_capital`. A typical trade quantity is `0.001–0.01 * initial_capital / price`.
4. **Guard for insufficient data**: always check `len(df) >= min_required` before computing indicators that need a lookback period.
5. **Multi-feed strategies**: access each feed by its exact feed key. Missing feeds (not yet warmed up) will be absent from `dfs` — always use `.get()` and check for `None`.
6. **Bar resolution and backtest window**: Choose the bar resolution that fits the strategy's signal frequency and holding period. Once resolution is chosen, set the date window to target **100,000–200,000 bars** (see the sizing sketch at the end of this section). **Never request more than 5 years of data.** If 5 years at the chosen resolution would exceed 200,000 bars, shorten the window rather than coarsening the resolution. Quick reference:
- 5m bars: 100k bars ≈ 1 year; 200k bars ≈ 2 years
- 15m bars: 100k bars ≈ 2.9 years; 200k bars ≈ 5 years (at limit)
- 1h bars: 100k bars ≈ 11.4 years → cap at 5 years (≈ 43,800 bars)
- 4h bars: 100k bars ≈ 45 years → cap at 5 years (≈ 10,950 bars)
7. **Never `import` from `dexorder` inside `evaluate()`** — the strategy file is exec'd in a sandbox with PandasStrategy and pandas_ta pre-loaded. Standard library and pandas/numpy/pandas_ta are available.
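A minimal sizing sketch for rule 6, assuming only the standard library (the `backtest_window` helper and its defaults are illustrative, not platform API):
```python
from datetime import datetime, timedelta

def backtest_window(period_seconds: int, target_bars: int = 150_000, max_years: float = 5.0):
    """Illustrative sketch: derive from_time/to_time strings for backtest_strategy."""
    to_time = datetime.now()
    span = min(timedelta(seconds=period_seconds * target_bars),
               timedelta(days=max_years * 365))
    return (to_time - span).strftime("%Y-%m-%d"), to_time.strftime("%Y-%m-%d")

# backtest_window(900)  -> ~4.3 years of 15m bars (~150k bars)
# backtest_window(3600) -> capped at 5 years (~43,800 bars)
```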

View File

@@ -3,6 +3,7 @@ import type { BaseChatModel } from '@langchain/core/language_models/chat_models'
import { SystemMessage } from '@langchain/core/messages'; import { SystemMessage } from '@langchain/core/messages';
import { createReactAgent } from '@langchain/langgraph/prebuilt'; import { createReactAgent } from '@langchain/langgraph/prebuilt';
import type { FastifyBaseLogger } from 'fastify'; import type { FastifyBaseLogger } from 'fastify';
import type { HarnessEvent } from '../../harness-events.js';
/** /**
* Web Explore Subagent * Web Explore Subagent
@@ -66,6 +67,52 @@ export class WebExploreSubagent extends BaseSubagent {
return finalText; return finalText;
} }
async *streamEvents(context: SubagentContext, instruction: string, signal?: AbortSignal): AsyncGenerator<HarnessEvent, string> {
this.logger.info({ subagent: this.getName() }, 'streamEvents starting');
const initialMessages = this.buildMessages(context, instruction);
const systemMessage = initialMessages[0];
const humanMessage = initialMessages[initialMessages.length - 1];
const agent = createReactAgent({
llm: this.model,
tools: this.tools,
prompt: systemMessage as SystemMessage,
});
const stream = agent.stream(
{ messages: [humanMessage] },
{ streamMode: ['messages', 'updates'], recursionLimit: 15, signal }
);
let finalText = '';
for await (const [mode, data] of await stream) {
if (signal?.aborted) break;
if (mode === 'messages') {
for (const chunk of WebExploreSubagent.extractStreamChunks(data, this.config.name)) {
yield chunk;
}
} else if (mode === 'updates') {
if ((data as any).agent?.messages) {
for (const msg of (data as any).agent.messages as any[]) {
if (msg.tool_calls?.length) {
for (const tc of msg.tool_calls) {
yield { type: 'subagent_tool_call', agentName: this.config.name, toolName: tc.name, label: tc.name };
}
} else {
const content = WebExploreSubagent.extractFinalText(msg);
if (content) finalText = content;
}
}
}
}
}
this.logger.info({ textLength: finalText.length }, 'streamEvents finished');
return finalText;
}
} }
/** /**

View File

@@ -16,6 +16,8 @@ import { ContainerManager } from './k8s/container-manager.js';
import { ZMQRelayClient } from './clients/zmq-relay-client.js'; import { ZMQRelayClient } from './clients/zmq-relay-client.js';
import { IcebergClient } from './clients/iceberg-client.js'; import { IcebergClient } from './clients/iceberg-client.js';
import { ConversationStore } from './harness/memory/conversation-store.js'; import { ConversationStore } from './harness/memory/conversation-store.js';
import { BlobStore } from './harness/memory/blob-store.js';
import { ConversationService } from './services/conversation-service.js';
import { AgentHarness, type HarnessSessionConfig } from './harness/agent-harness.js'; import { AgentHarness, type HarnessSessionConfig } from './harness/agent-harness.js';
import { OHLCService } from './services/ohlc-service.js'; import { OHLCService } from './services/ohlc-service.js';
import { SymbolIndexService } from './services/symbol-index-service.js'; import { SymbolIndexService } from './services/symbol-index-service.js';
@@ -369,12 +371,17 @@ try {
const conversationStore = new ConversationStore(redis, app.log, icebergClient); const conversationStore = new ConversationStore(redis, app.log, icebergClient);
app.log.debug('Conversation store initialized'); app.log.debug('Conversation store initialized');
const blobStore = new BlobStore(icebergClient, app.log);
const conversationService = new ConversationService(conversationStore, blobStore, app.log);
app.log.debug('Blob store and conversation service initialized');
// Harness factory: captures infrastructure deps; channel handlers stay infrastructure-free // Harness factory: captures infrastructure deps; channel handlers stay infrastructure-free
function createHarness(sessionConfig: HarnessSessionConfig): AgentHarness { function createHarness(sessionConfig: HarnessSessionConfig): AgentHarness {
return new AgentHarness({ return new AgentHarness({
...sessionConfig, ...sessionConfig,
providerConfig: config.providerConfig, providerConfig: config.providerConfig,
conversationStore, conversationStore,
blobStore,
historyLimit: config.conversationHistoryLimit, historyLimit: config.conversationHistoryLimit,
}); });
} }
@@ -391,6 +398,7 @@ const websocketHandler = new WebSocketHandler({
createHarness, createHarness,
ohlcService, // Optional ohlcService, // Optional
symbolIndexService, // Optional symbolIndexService, // Optional
conversationService, // Optional - for history replay on reconnect
}); });
app.log.debug('WebSocket handler initialized'); app.log.debug('WebSocket handler initialized');
@@ -614,6 +622,19 @@ try {
mcpTools: [], mcpTools: [],
}); });
// Strategy subagent: all strategy-related MCP tools
toolRegistry.registerAgentTools({
agentName: 'strategy',
platformTools: [],
mcpTools: [
'python_write', 'python_edit', 'python_read', 'python_list',
'python_log', 'python_revert',
'backtest_strategy', 'activate_strategy', 'deactivate_strategy',
'list_active_strategies', 'get_backtest_results',
'get_strategy_trades', 'get_strategy_events',
],
});
app.log.info( app.log.info(
{ {
agents: toolRegistry.getRegisteredAgents(), agents: toolRegistry.getRegisteredAgents(),

View File

@@ -0,0 +1,59 @@
import type { FastifyBaseLogger } from 'fastify';
import type { ConversationStore } from '../harness/memory/conversation-store.js';
import type { BlobStore, StoredBlob } from '../harness/memory/blob-store.js';
export interface EnrichedMessage {
id: string;
userId: string;
sessionId: string;
role: 'user' | 'assistant';
content: string;
timestamp: number; // microseconds
files: StoredBlob[];
}
/**
* Generic conversation history service.
*
* Combines text messages (ConversationStore) with binary blobs (BlobStore)
* into enriched message records. Used by:
* - WebSocket handler: replay history on reconnect
* - Future admin panel: conversation browser
*/
export class ConversationService {
constructor(
private conversationStore: ConversationStore,
private blobStore: BlobStore,
// eslint-disable-next-line @typescript-eslint/no-unused-vars
_logger: FastifyBaseLogger
) {}
async getHistory(
userId: string,
sessionId: string,
limit = 50,
channelType = 'websocket'
): Promise<EnrichedMessage[]> {
const messages = await this.conversationStore.getFullHistory(userId, sessionId, limit, channelType);
const chatMessages = messages.filter(m => m.role === 'user' || m.role === 'assistant');
return Promise.all(
chatMessages.map(async (m) => {
const blobRefs = m.metadata?.blobs as Array<{ id: string; mimeType: string; caption?: string }> | undefined;
const files = blobRefs?.length
? await this.blobStore.getBlobsByIds(userId, sessionId, blobRefs.map(b => b.id))
: [];
return {
id: m.id,
userId: m.userId,
sessionId: m.sessionId,
role: m.role as 'user' | 'assistant',
content: m.content,
timestamp: m.timestamp,
files,
};
})
);
}
}

View File

@@ -16,7 +16,8 @@
import type { FastifyBaseLogger } from 'fastify'; import type { FastifyBaseLogger } from 'fastify';
import type { IcebergClient } from '../clients/iceberg-client.js'; import type { IcebergClient } from '../clients/iceberg-client.js';
import type { ZMQRelayClient } from '../clients/zmq-relay-client.js'; import type { ZMQRelayClient, BarUpdateCallback } from '../clients/zmq-relay-client.js';
export type { BarUpdateCallback } from '../clients/zmq-relay-client.js';
import type { import type {
HistoryResult, HistoryResult,
SymbolInfo, SymbolInfo,
@@ -53,6 +54,23 @@ export class OHLCService {
this.logger = config.logger; this.logger = config.logger;
} }
/**
* Subscribe to realtime OHLC bar updates for a ticker+period.
* ZMQ subscribe is issued on the first call for a given topic; subsequent calls
* for the same topic only add the callback (no extra ZMQ events).
*/
subscribeToTicker(ticker: string, periodSeconds: number, callback: BarUpdateCallback): void {
this.relayClient.subscribeToTicker(ticker, periodSeconds, callback);
}
/**
* Unsubscribe a callback from realtime OHLC bar updates.
* ZMQ unsubscribe is issued when the last callback for a topic is removed.
*/
unsubscribeFromTicker(ticker: string, periodSeconds: number, callback: BarUpdateCallback): void {
this.relayClient.unsubscribeFromTicker(ticker, periodSeconds, callback);
}
/** /**
* Fetch OHLC data with smart caching * Fetch OHLC data with smart caching
* *

View File

@@ -28,23 +28,29 @@ export function createGetChartDataTool(config: GetChartDataToolConfig): DynamicS
**IMPORTANT: Use this tool ONLY for quick, casual data viewing. For any analysis, plotting, statistics, or deep research, use the 'research' tool instead.** **IMPORTANT: Use this tool ONLY for quick, casual data viewing. For any analysis, plotting, statistics, or deep research, use the 'research' tool instead.**
**Hard limit: returns at most 500 bars (the most recent 500). This tool is not suitable for analysis requiring longer sequences — use the 'research' tool for that.**
Parameters: Parameters:
- ticker (optional): Market symbol (defaults to workspace chartState.symbol) - ticker (optional): Market symbol in SYMBOL.EXCHANGE format, e.g. "BTC/USDT.BINANCE" (defaults to workspace chartState.symbol)
- period (optional): OHLC period in seconds (defaults to workspace chartState.period) - period (optional): OHLC period in seconds (defaults to workspace chartState.period)
- from_time (optional): Start time as Unix timestamp (number or string like "1774126800") OR date string like "2 days ago", "2024-01-01" (defaults to workspace chartState.start_time) - from_time (optional): Start time as Unix timestamp (number or string like "1774126800") OR date string like "2 days ago", "2024-01-01" (defaults to workspace chartState.start_time)
- to_time (optional): End time as Unix timestamp (number or string like "1774732500") OR date string like "now", "yesterday" (defaults to workspace chartState.end_time) - to_time (optional): End time as Unix timestamp (number or string like "1774732500") OR date string like "now", "yesterday" (defaults to workspace chartState.end_time)
- countback (optional): Limit number of bars returned - countback (optional): Limit number of bars returned (max 500)
- columns (optional): Extra columns beyond OHLC: ["volume", "buy_vol", "sell_vol", "open_time", "high_time", "low_time", "close_time", "open_interest"]`, - columns (optional): Extra columns beyond OHLC: ["volume", "buy_vol", "sell_vol", "open_time", "high_time", "low_time", "close_time", "open_interest"]`,
schema: z.object({ schema: z.object({
ticker: z.string().optional().describe('Market symbol (defaults to workspace chartState.symbol)'), ticker: z.string().optional().describe('Market symbol (defaults to workspace chartState.symbol)'),
period: z.number().optional().describe('OHLC period in seconds (defaults to workspace chartState.period)'), period: z.number().optional().describe('OHLC period in seconds (defaults to workspace chartState.period)'),
from_time: z.union([z.number(), z.string()]).optional().describe('Start time: Unix seconds OR date string (defaults to workspace chartState.start_time)'), from_time: z.union([z.number(), z.string()]).optional().describe('Start time: Unix seconds OR date string (defaults to workspace chartState.start_time)'),
to_time: z.union([z.number(), z.string()]).optional().describe('End time: Unix seconds OR date string (defaults to workspace chartState.end_time)'), to_time: z.union([z.number(), z.string()]).optional().describe('End time: Unix seconds OR date string (defaults to workspace chartState.end_time)'),
countback: z.number().optional().describe('Limit number of bars returned'), countback: z.number().optional().describe('Limit number of bars returned (max 500)'),
columns: z.array(z.enum(['volume', 'buy_vol', 'sell_vol', 'open_time', 'high_time', 'low_time', 'close_time', 'open_interest'])).optional().describe('Extra columns beyond OHLC'), columns: z.array(z.enum(['volume', 'buy_vol', 'sell_vol', 'open_time', 'high_time', 'low_time', 'close_time', 'open_interest'])).optional().describe('Extra columns beyond OHLC'),
}), }),
func: async ({ ticker, period, from_time, to_time, countback, columns }) => { func: async ({ ticker, period, from_time, to_time, countback, columns }) => {
logger.debug({ ticker, period, from_time, to_time, countback, columns }, 'Executing get_chart_data tool'); const MAX_BARS = 500;
// Enforce hard cap — never return more than MAX_BARS bars
const effectiveCountback = countback !== undefined ? Math.min(countback, MAX_BARS) : MAX_BARS;
logger.debug({ ticker, period, from_time, to_time, countback: effectiveCountback, columns }, 'Executing get_chart_data tool');
try { try {
// Get workspace chart state // Get workspace chart state
@@ -86,7 +92,7 @@ Parameters:
finalPeriod, finalPeriod,
finalFromTime, finalFromTime,
finalToTime, finalToTime,
countback effectiveCountback
); );
if (historyResult.noData || !historyResult.bars || historyResult.bars.length === 0) { if (historyResult.noData || !historyResult.bars || historyResult.bars.length === 0) {
@@ -98,8 +104,13 @@ Parameters:
}); });
} }
// Enforce hard cap — keep the most recent bars
const sourceBars = historyResult.bars.length > MAX_BARS
? historyResult.bars.slice(-MAX_BARS)
: historyResult.bars;
// Filter/format bars with requested columns // Filter/format bars with requested columns
const bars = historyResult.bars.map(bar => { const bars = sourceBars.map(bar => {
const result: any = { const result: any = {
time: bar.time, time: bar.time,
open: bar.open, open: bar.open,

View File

@@ -3,6 +3,7 @@ import { z } from 'zod';
import type { FastifyBaseLogger } from 'fastify'; import type { FastifyBaseLogger } from 'fastify';
import type { IndicatorSubagent } from '../../harness/subagents/indicator/index.js'; import type { IndicatorSubagent } from '../../harness/subagents/indicator/index.js';
import type { SubagentContext } from '../../harness/subagents/base-subagent.js'; import type { SubagentContext } from '../../harness/subagents/base-subagent.js';
import type { HarnessEvent } from '../../harness/harness-events.js';
export interface IndicatorAgentToolConfig { export interface IndicatorAgentToolConfig {
indicatorSubagent: IndicatorSubagent; indicatorSubagent: IndicatorSubagent;
@@ -14,10 +15,20 @@ export interface IndicatorAgentToolConfig {
* Creates a LangChain tool that delegates to the indicator subagent. * Creates a LangChain tool that delegates to the indicator subagent.
* Mirrors the pattern of research-agent.tool.ts. * Mirrors the pattern of research-agent.tool.ts.
*/ */
export function createIndicatorAgentTool(config: IndicatorAgentToolConfig): DynamicStructuredTool { export function createIndicatorAgentTool(config: IndicatorAgentToolConfig): DynamicStructuredTool & { streamFunc: (args: { instruction: string }) => AsyncGenerator<HarnessEvent, string> } {
const { indicatorSubagent, context, logger } = config; const { indicatorSubagent, context, logger } = config;
return new DynamicStructuredTool({ async function* streamFunc({ instruction }: { instruction: string }, signal?: AbortSignal): AsyncGenerator<HarnessEvent, string> {
logger.info({ instruction: instruction.substring(0, 100) }, 'Streaming indicator subagent');
const gen = indicatorSubagent.streamEvents(context, instruction, signal);
let step: IteratorResult<HarnessEvent, string>;
while (!(step = await gen.next()).done) {
yield step.value;
}
return step.value;
}
const tool = new DynamicStructuredTool({
name: 'indicator', name: 'indicator',
description: `Delegate to the indicator subagent for all indicator-related tasks on the chart. description: `Delegate to the indicator subagent for all indicator-related tasks on the chart.
@@ -50,4 +61,6 @@ NEVER modify the indicators workspace store directly.`,
} }
}, },
}); });
return Object.assign(tool, { streamFunc });
} }

View File

@@ -3,6 +3,7 @@ import { z } from 'zod';
import type { FastifyBaseLogger } from 'fastify';
import type { ResearchSubagent } from '../../harness/subagents/research/index.js';
import type { SubagentContext } from '../../harness/subagents/base-subagent.js';
import type { HarnessEvent } from '../../harness/harness-events.js';

export interface ResearchAgentToolConfig {
  researchSubagent: ResearchSubagent;
@@ -15,10 +16,24 @@ export interface ResearchAgentToolConfig {
 * This is the standard LangChain pattern for exposing a subagent as a tool
 * to a parent agent.
 */
export function createResearchAgentTool(config: ResearchAgentToolConfig): DynamicStructuredTool & { streamFunc: (args: { name: string; instruction: string }) => AsyncGenerator<HarnessEvent, string> } {
  const { researchSubagent, context, logger } = config;

  const prompt = (name: string, instruction: string) => `Research script name: "${name}"\n\n${instruction}`;

  async function* streamFunc({ name, instruction }: { name: string; instruction: string }, signal?: AbortSignal): AsyncGenerator<HarnessEvent, string> {
    logger.info({ name, instruction: instruction.substring(0, 100) }, 'Streaming research subagent');
    const gen = researchSubagent.streamEvents(context, prompt(name, instruction), signal);
    let step: IteratorResult<HarnessEvent, string>;
    while (!(step = await gen.next()).done) {
      yield step.value;
    }
    const finalText = step.value;
    const images = researchSubagent.getLastImages();
    return JSON.stringify({ text: finalText, images });
  }

  const tool = new DynamicStructuredTool({
    name: 'research',
    description: `Delegate to the research subagent for data analysis, charting, statistics, and Python script execution.
@@ -36,21 +51,15 @@ The research subagent will write and execute Python scripts, capture output and
    func: async ({ name, instruction }: { name: string; instruction: string }): Promise<string> => {
      logger.info({ name, instruction: instruction.substring(0, 100) }, 'Delegating to research subagent');
      try {
        const result = await researchSubagent.executeWithImages(context, prompt(name, instruction));
        return JSON.stringify({ text: result.text, images: result.images });
      } catch (error) {
        logger.error({ error, errorMessage: (error as Error)?.message }, 'Research subagent failed');
        throw error;
      }
    },
  });
  return Object.assign(tool, { streamFunc });
}
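For illustration, a hedged sketch of parsing the JSON result returned by func/streamFunc above ({ text, images }). The ResearchToolResult shape and parseResearchResult helper are hypothetical; the real AgentHarness.processToolResult() may handle this differently.

// Hypothetical consumer-side parsing of the stringified research result.
interface ResearchToolResult {
  text: string;
  images?: string[]; // e.g. base64-encoded images captured by the subagent
}

function parseResearchResult(raw: string): ResearchToolResult {
  try {
    const parsed = JSON.parse(raw) as ResearchToolResult;
    return { text: parsed.text ?? raw, images: parsed.images ?? [] };
  } catch {
    // Not JSON: treat the whole string as plain text output
    return { text: raw, images: [] };
  }
}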

View File

@@ -0,0 +1,66 @@
import { DynamicStructuredTool } from '@langchain/core/tools';
import { z } from 'zod';
import type { FastifyBaseLogger } from 'fastify';
import type { StrategySubagent } from '../../harness/subagents/strategy/index.js';
import type { SubagentContext } from '../../harness/subagents/base-subagent.js';
import type { HarnessEvent } from '../../harness/harness-events.js';
export interface StrategyAgentToolConfig {
strategySubagent: StrategySubagent;
context: SubagentContext;
logger: FastifyBaseLogger;
}
/**
* Creates a LangChain tool that delegates to the strategy subagent.
* Mirrors the pattern of indicator-agent.tool.ts.
*/
export function createStrategyAgentTool(config: StrategyAgentToolConfig): DynamicStructuredTool & { streamFunc: (args: { instruction: string }, signal?: AbortSignal) => AsyncGenerator<HarnessEvent, string> } {
const { strategySubagent, context, logger } = config;
async function* streamFunc({ instruction }: { instruction: string }, signal?: AbortSignal): AsyncGenerator<HarnessEvent, string> {
logger.info({ instruction: instruction.substring(0, 100) }, 'Streaming strategy subagent');
const gen = strategySubagent.streamEvents(context, instruction, signal);
let step: IteratorResult<HarnessEvent, string>;
while (!(step = await gen.next()).done) {
yield step.value;
}
return step.value;
}
const tool = new DynamicStructuredTool({
name: 'strategy',
description: `Delegate to the strategy subagent for all trading strategy tasks.
Use this tool for:
- Writing new PandasStrategy classes ("create a strategy that...")
- Editing or improving existing strategies
- Running backtests on a strategy
- Interpreting backtest results (Sharpe ratio, drawdown, trade list)
- Activating or deactivating strategies for paper trading
- Monitoring running strategy PnL and trade logs
- Checking which strategies already exist
ALWAYS use this tool for any request about trading strategies, backtesting, or strategy activation.
NEVER write strategy Python code or call backtest_strategy directly — delegate here instead.`,
schema: z.object({
instruction: z.string().describe(
'The strategy task to perform. Be specific: include the strategy name, ' +
'desired signals (e.g. RSI < 30 = buy), timeframe, and symbol if known. ' +
'For backtest requests include the date range and starting capital.'
),
}),
func: async ({ instruction }: { instruction: string }): Promise<string> => {
logger.info({ instruction: instruction.substring(0, 100) }, 'Delegating to strategy subagent');
try {
return await strategySubagent.execute(context, instruction);
} catch (error) {
logger.error({ error, errorMessage: (error as Error)?.message }, 'Strategy subagent failed');
throw error;
}
},
});
return Object.assign(tool, { streamFunc });
}

View File

@@ -3,6 +3,7 @@ import { z } from 'zod';
import type { FastifyBaseLogger } from 'fastify';
import type { WebExploreSubagent } from '../../harness/subagents/web-explore/index.js';
import type { SubagentContext } from '../../harness/subagents/base-subagent.js';
import type { HarnessEvent } from '../../harness/harness-events.js';

export interface WebExploreAgentToolConfig {
  webExploreSubagent: WebExploreSubagent;
@@ -14,10 +15,20 @@ export interface WebExploreAgentToolConfig {
 * Creates a LangChain tool that delegates to the web-explore subagent.
 * The subagent decides whether to use web search or arXiv based on the instruction.
 */
export function createWebExploreAgentTool(config: WebExploreAgentToolConfig): DynamicStructuredTool & { streamFunc: (args: { instruction: string }, signal?: AbortSignal) => AsyncGenerator<HarnessEvent, string> } {
  const { webExploreSubagent, context, logger } = config;

  async function* streamFunc({ instruction }: { instruction: string }, signal?: AbortSignal): AsyncGenerator<HarnessEvent, string> {
    logger.info({ instruction: instruction.substring(0, 100) }, 'Streaming web-explore subagent');
    const gen = webExploreSubagent.streamEvents(context, instruction, signal);
    let step: IteratorResult<HarnessEvent, string>;
    while (!(step = await gen.next()).done) {
      yield step.value;
    }
    return step.value;
  }

  const tool = new DynamicStructuredTool({
    name: 'web_explore',
    description: `Search the web or academic databases and return a summarized answer.
@@ -46,4 +57,6 @@ The subagent will search the web (or arXiv for academic queries), fetch relevant
      }
    },
  });
  return Object.assign(tool, { streamFunc });
}

View File

@@ -103,6 +103,16 @@ export const DEFAULT_STORES: StoreConfig[] = [
    persistent: true,
    initialState: () => ({}),
  },
  {
    name: 'strategy_types',
    persistent: true,
    initialState: () => ({}),
  },
  {
    name: 'research_types',
    persistent: true,
    initialState: () => ({}),
  },
  {
    name: 'channelState',
    persistent: false,

View File

@@ -47,24 +47,22 @@ function loadConfig() {
    logger.warn({ error: error.message }, 'Could not load secrets');
  }

  return {
    // Flink ZMQ endpoints
    flink_hostname: config.flink_hostname || 'localhost',
    ingestor_broker_port: config.ingestor_broker_port || 5567,

    // Kafka configuration
    kafka_brokers: config.kafka_brokers || ['localhost:9092'],
    kafka_ohlc_topic: config.kafka_ohlc_topic || 'market-ohlc',
    kafka_tick_topic: config.kafka_tick_topic || 'market-tick',

    // Worker configuration
    poll_interval_ms: config.poll_interval_ms || 10000,

    // Symbol metadata configuration
    supported_exchanges: config.supported_exchanges || ['binance', 'coinbase', 'kraken'],
    symbol_metadata_interval_ms: config.symbol_metadata_interval_ms || 6 * 60 * 60 * 1000,

    ...secrets
  };
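For reference, a hypothetical config object matching the keys loadConfig() falls back on above; the hostname, ports and topic names are illustrative defaults only, not the deployed values.

// Illustrative local config; key names mirror loadConfig() above.
const exampleIngestorConfig = {
  flink_hostname: 'flink-jobmanager',
  ingestor_broker_port: 5567,              // Flink IngestorBroker ROUTER
  kafka_brokers: ['kafka:9092'],
  kafka_ohlc_topic: 'market-ohlc',         // historical OHLC batches
  kafka_tick_topic: 'market-tick',         // realtime ticks
  poll_interval_ms: 10_000,
  supported_exchanges: ['binance', 'coinbase', 'kraken'],
  symbol_metadata_interval_ms: 6 * 60 * 60 * 1000,
};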
@@ -76,11 +74,7 @@ class IngestorWorker {
    this.logger = logger;
    this.zmqClient = new ZmqClient(config, logger.child({ component: 'zmq' }));
    this.kafkaProducer = new KafkaProducer(config, logger.child({ component: 'kafka' }));

    this.metadataGenerator = new SymbolMetadataGenerator(
      config,
      this.kafkaProducer,
@@ -94,33 +88,27 @@ class IngestorWorker {
    this.realtimePoller = new RealtimePoller(
      this.ccxtFetcher,
      this.kafkaProducer,
      this.zmqClient,
      logger.child({ component: 'poller' })
    );

    // jobId → active realtime subscription (for stop handling)
    this.activeRealtime = new Set();

    this.isShutdown = false;
    this.metadataInterval = null;
  }

  async start() {
    this.logger.info('Starting CCXT ingestor worker');

    await this.kafkaProducer.connect();

    // Wire event callbacks before connecting so we don't miss early messages
    this.zmqClient.onWorkAssign = req => this.handleWorkAssign(req);
    this.zmqClient.onWorkStop = jobId => this.handleWorkStop(jobId);
    await this.zmqClient.connect(); // also sends WorkerReady

    // Generate symbol metadata on startup
    this.logger.info('Generating initial symbol metadata');
@@ -140,281 +128,126 @@ class IngestorWorker {
      } catch (error) {
        this.logger.error({ error: error.message }, 'Failed to generate periodic symbol metadata');
      }
    }, this.config.symbol_metadata_interval_ms);

    this.logger.info('Ingestor worker started successfully');
  }

  /**
   * Handle a WorkAssign message dispatched by Flink IngestorBroker.
   * Called from the ZmqClient receive loop — do not block.
   */
  handleWorkAssign(request) {
    const { jobId, requestId, type, ticker } = request;

    this.logger.info({ jobId, requestId, type, ticker }, 'Received WorkAssign');

    // HISTORICAL_OHLC = 0 (proto3 default, may appear as undefined or 'HISTORICAL_OHLC')
    const isHistorical = !type || type === 'HISTORICAL_OHLC' || type === 0;
    const isRealtime = type === 'REALTIME_TICKS' || type === 1;

    if (isHistorical) {
      this.handleHistoricalRequest(request).catch(err => {
        this.logger.error({ jobId, requestId, error: err.message }, 'Unexpected error in historical handler');
      });
    } else if (isRealtime) {
      this.handleRealtimeRequest(request);
    } else {
      this.logger.warn({ jobId, type }, 'Unknown request type — rejecting');
      this.zmqClient.sendReject(jobId, `Unknown request type: ${type}`).catch(() => {});
    }
  }

  /**
   * Handle WorkStop sent by Flink (e.g., all subscribers left).
   */
  handleWorkStop(jobId) {
    this.logger.info({ jobId }, 'Received WorkStop — cancelling realtime subscription');
    this.realtimePoller.cancelSubscription(jobId);
    this.activeRealtime.delete(jobId);
    // No WorkComplete needed — Flink sent the stop, it already knows
  }

  /**
   * Fetch historical OHLC data and write to Kafka.
   * Sends WorkComplete when done (success or error).
   */
  async handleHistoricalRequest(request) {
    const { jobId, requestId, ticker, historical, clientId: client_id } = request;
    const { startTime: start_time, endTime: end_time, periodSeconds: period_seconds, limit } = historical || {};

    this.logger.info({ jobId, requestId, ticker, period_seconds }, 'Processing historical OHLC request');

    try {
      const candles = await this.ccxtFetcher.fetchHistoricalOHLC(
        ticker, start_time, end_time, period_seconds, limit
      );

      this.logger.info({ jobId, requestId, ticker, count: candles.length }, 'Fetched from exchange');

      if (candles.length > 0) {
        const metadata = { request_id: requestId, client_id, ticker, period_seconds, start_time, end_time };
        const PAGE_SIZE = 1000;
        for (let i = 0; i < candles.length; i += PAGE_SIZE) {
          const page = candles.slice(i, i + PAGE_SIZE);
          const isLastPage = (i + PAGE_SIZE) >= candles.length;
          await this.kafkaProducer.writeOHLCs(this.config.kafka_ohlc_topic, page, metadata, isLastPage);
        }
        this.logger.info({ jobId, requestId, ticker, count: candles.length, pages: Math.ceil(candles.length / PAGE_SIZE) }, 'Wrote all pages to Kafka');
      } else {
        await this.kafkaProducer.writeMarker(this.config.kafka_ohlc_topic, {
          request_id: requestId, client_id, ticker, period_seconds, start_time, end_time,
          status: 'NOT_FOUND', message: 'No data available for requested period'
        });
      }

      this.logger.info({ jobId, requestId, ticker }, 'Historical request complete — sending WorkComplete');
      await this.zmqClient.sendComplete(jobId, true);

    } catch (error) {
      this.logger.error({ jobId, requestId, ticker, error: error.message }, 'Historical request failed');

      try {
        await this.kafkaProducer.writeMarker(this.config.kafka_ohlc_topic, {
          request_id: requestId, client_id, ticker, period_seconds, start_time, end_time,
          status: 'ERROR', error_message: error.message
        });
      } catch (kafkaErr) {
        this.logger.error({ jobId, error: kafkaErr.message }, 'Failed to write error marker to Kafka');
      }

      await this.zmqClient.sendComplete(jobId, false, error.message);
    }
  }

  /**
   * Start realtime tick polling for a job dispatched by Flink.
   * Ticks flow: exchange → Kafka market-tick → Flink → OHLC bars → clients.
   */
  handleRealtimeRequest(request) {
    const { jobId, requestId, ticker } = request;
    this.logger.info({ jobId, requestId, ticker }, 'Processing realtime subscription request');
    this.activeRealtime.add(jobId);
    this.realtimePoller.startSubscription(jobId, requestId, ticker, this.config.kafka_tick_topic);
  }

  getStatus() {
    return {
      activeRealtime: this.activeRealtime.size,
      pollerStats: this.realtimePoller.getStats(),
      metadataStatus: this.metadataGenerator.getStatus()
    };
  }

  async shutdown() {
    if (this.isShutdown) return;
    this.isShutdown = true;
    this.logger.info('Shutting down ingestor worker');

    if (this.metadataInterval) clearInterval(this.metadataInterval);
    this.realtimePoller.shutdown();

    await this.ccxtFetcher.close();
    await this.metadataGenerator.close();
    await this.kafkaProducer.disconnect();
@@ -430,31 +263,23 @@ async function main() {
  const config = loadConfig();
  const worker = new IngestorWorker(config, logger);

  process.on('SIGINT', () => worker.shutdown());
  process.on('SIGTERM', () => worker.shutdown());

  process.on('uncaughtException', error => {
    logger.error({ error }, 'Uncaught exception');
    worker.shutdown();
  });

  process.on('unhandledRejection', reason => {
    logger.error({ reason }, 'Unhandled rejection');
  });

  await worker.start();

  setInterval(() => {
    logger.info({ status: worker.getStatus() }, 'Worker status');
  }, 60000);
}

main().catch(error => {
  logger.error({ error }, 'Fatal error');
  process.exit(1);
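A small sketch of the proto3 enum-default handling the worker relies on in handleWorkAssign(): HISTORICAL_OHLC = 0 is omitted on the wire, so protobufjs can surface it as undefined, a string name, or a number depending on decode options. normalizeRequestType is a hypothetical helper, not part of this commit.

// Sketch only: maps the possible wire representations onto a stable label.
type RequestTypeWire = undefined | 0 | 1 | 'HISTORICAL_OHLC' | 'REALTIME_TICKS';

function normalizeRequestType(type: RequestTypeWire): 'historical' | 'realtime' | 'unknown' {
  if (!type || type === 'HISTORICAL_OHLC') return 'historical'; // undefined, 0, or enum name
  if (type === 1 || type === 'REALTIME_TICKS') return 'realtime';
  return 'unknown';
}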

View File

@@ -116,12 +116,17 @@ export class KafkaProducer {
  }

  /**
   * Write multiple OHLC candles to Kafka as an OHLCBatch message.
   *
   * Historical mode: pass explicit metadata and isLastPage flag.
   * Realtime mode: omit metadata (null/undefined) — writes individual OHLC messages instead.
   *
   * @param {string} topic - Kafka topic name
   * @param {Array<object>} ohlcData - Array of OHLC candle objects
   * @param {object|null} metadata - Request metadata for historical batches; null for realtime
   * @param {boolean} isLastPage - True if this is the final page of a historical query
   */
  async writeOHLCs(topic, ohlcData, metadata = null, isLastPage = false) {
    if (!this.isConnected) {
      throw new Error('Kafka producer not connected');
    }
@@ -130,12 +135,8 @@ export class KafkaProducer {
      return;
    }

    if (!metadata) {
      // Realtime mode — write individual OHLC messages (no batch wrapper)
      const messages = ohlcData.map(candle => {
        const protoCandle = {
          timestamp: candle.timestamp,
@@ -156,10 +157,7 @@ export class KafkaProducer {
        };
      });

      await this.producer.send({ topic, messages });

      this.logger.debug(
        { count: ohlcData.length, topic, type: 'individual' },
@@ -168,7 +166,7 @@ export class KafkaProducer {
      return;
    }

    // Historical mode — write as OHLCBatch with metadata
    const batch = {
      metadata: {
        requestId: metadata.request_id,
@@ -178,7 +176,8 @@ export class KafkaProducer {
        startTime: metadata.start_time,
        endTime: metadata.end_time,
        status: metadata.status || 'OK',
        errorMessage: metadata.error_message,
        isLastPage
      },
      rows: ohlcData.map(candle => {
        const row = {
@@ -194,22 +193,16 @@ export class KafkaProducer {
      })
    };

    const [frame1, frame2] = encodeMessage(MessageTypeId.OHLC_BATCH, batch, OHLCBatch);
    const value = Buffer.concat([frame1, frame2]);

    await this.producer.send({
      topic,
      messages: [{ key: metadata.ticker, value }]
    });

    this.logger.debug(
      { request_id: metadata.request_id, count: ohlcData.length, isLastPage, topic },
      'Wrote OHLCBatch to Kafka'
    );
  }
@@ -225,7 +218,8 @@ export class KafkaProducer {
      throw new Error('Kafka producer not connected');
    }

    // Create an empty OHLCBatch with status in metadata.
    // Markers are always the terminal message for a request (is_last_page = true).
    const batch = {
      metadata: {
        requestId: marker.request_id,
@@ -235,7 +229,8 @@ export class KafkaProducer {
        startTime: marker.start_time,
        endTime: marker.end_time,
        status: marker.status, // 'NOT_FOUND' or 'ERROR'
        errorMessage: marker.error_message || marker.message,
        isLastPage: true
      },
      rows: [] // Empty rows array indicates marker message
    };
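A usage sketch of the two writeOHLCs() calling modes described above; the producer and page declarations and the metadata values are placeholders, assuming the signature introduced in this commit.

// Sketch: placeholders for the producer instance and candle data.
declare const producer: {
  writeOHLCs(topic: string, rows: object[], metadata?: object | null, isLastPage?: boolean): Promise<void>;
};
declare const page: object[];

// Realtime mode: no metadata, so each candle is written as an individual message.
await producer.writeOHLCs('market-ohlc', page);

// Historical mode: batch wrapper with request metadata; only the final page (or a marker) sets isLastPage.
await producer.writeOHLCs(
  'market-ohlc',
  page,
  { request_id: 'req-7', client_id: 'client-1', ticker: 'BTC/USDT.BINANCE', period_seconds: 60 },
  true
);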

View File

@@ -1,33 +1,40 @@
// Realtime tick data poller — polls exchange every 10s, writes ticks to market-tick Kafka topic.
// Heartbeats every 5s so Flink IngestorBroker knows the job is alive.

export class RealtimePoller {
  constructor(ccxtFetcher, kafkaProducer, zmqClient, logger) {
    this.ccxtFetcher = ccxtFetcher;
    this.kafkaProducer = kafkaProducer;
    this.zmqClient = zmqClient;
    this.logger = logger;

    // Active subscriptions: jobId -> subscription info
    this.subscriptions = new Map();

    // Poll interval in milliseconds (10 seconds)
    this.pollInterval = 10000;
    // Heartbeat interval (5 seconds)
    this.heartbeatInterval = 5000;

    this.pollingLoop = null;
    this.heartbeatLoop = null;
  }

  /**
   * Start a realtime subscription for a job dispatched by IngestorBroker.
   * @param {string} jobId - Broker-assigned job ID (for heartbeats and COMPLETE)
   * @param {string} requestId - Original request ID (for metadata)
   * @param {string} ticker - Ticker to subscribe to
   * @param {string} kafkaTopic - Kafka topic to write ticks to (market-tick)
   */
  startSubscription(jobId, requestId, ticker, kafkaTopic) {
    if (this.subscriptions.has(jobId)) {
      this.logger.warn({ jobId }, 'Subscription already exists');
      return;
    }

    const subscription = {
      jobId,
      requestId,
      ticker,
      kafkaTopic,
@@ -36,93 +43,81 @@ export class RealtimePoller {
      errorCount: 0
    };

    this.subscriptions.set(jobId, subscription);
    this.logger.info({ jobId, requestId, ticker, kafkaTopic }, 'Started realtime subscription');

    if (!this.pollingLoop) {
      this.startPollingLoop();
    }
    if (!this.heartbeatLoop) {
      this.startHeartbeatLoop();
    }
  }

  /**
   * Stop a realtime subscription. Called when Flink sends WorkStop or on error.
   * Does NOT send WorkComplete — caller is responsible for that.
   */
  cancelSubscription(jobId) {
    const subscription = this.subscriptions.get(jobId);
    if (subscription) {
      subscription.isActive = false;
      this.subscriptions.delete(jobId);
      this.logger.info({ jobId, ticker: subscription.ticker }, 'Cancelled realtime subscription');
    }

    if (this.subscriptions.size === 0) {
      if (this.pollingLoop) {
        clearInterval(this.pollingLoop);
        this.pollingLoop = null;
      }
      if (this.heartbeatLoop) {
        clearInterval(this.heartbeatLoop);
        this.heartbeatLoop = null;
      }
      this.logger.info('Stopped polling/heartbeat loops — no active subscriptions');
    }
  }

  startPollingLoop() {
    this.logger.info({ interval: this.pollInterval }, 'Starting polling loop');
    this.pollingLoop = setInterval(() => this.pollAllSubscriptions(), this.pollInterval);
    // Immediate first poll
    this.pollAllSubscriptions();
  }

  startHeartbeatLoop() {
    this.logger.info({ interval: this.heartbeatInterval }, 'Starting heartbeat loop');
    this.heartbeatLoop = setInterval(async () => {
      for (const { jobId } of this.subscriptions.values()) {
        try {
          await this.zmqClient.sendHeartbeat(jobId);
        } catch (err) {
          this.logger.error({ jobId, error: err.message }, 'Failed to send heartbeat');
        }
      }
    }, this.heartbeatInterval);
  }

  async pollAllSubscriptions() {
    const subscriptions = Array.from(this.subscriptions.values());
    await Promise.allSettled(subscriptions.map(sub => this.pollSubscription(sub)));
  }

  async pollSubscription(subscription) {
    if (!subscription.isActive) return;

    const { jobId, requestId, ticker, kafkaTopic, lastTimestamp } = subscription;

    try {
      const trades = await this.ccxtFetcher.fetchRecentTrades(ticker, lastTimestamp);

      if (trades.length === 0) {
        this.logger.debug({ jobId, ticker }, 'No new trades');
        return;
      }

      // Skip trades we've already seen (timestamp-based dedup)
      let newTrades = trades;
      if (lastTimestamp) {
        const lastTs = BigInt(lastTimestamp);
@@ -130,88 +125,59 @@ export class RealtimePoller {
      }

      if (newTrades.length > 0) {
        await this.kafkaProducer.writeTicks(kafkaTopic, newTrades);
        subscription.lastTimestamp = newTrades[newTrades.length - 1].timestamp;
        this.logger.info({ jobId, ticker, count: newTrades.length, kafkaTopic }, 'Wrote ticks to Kafka');
      }

      subscription.errorCount = 0;
    } catch (error) {
      subscription.errorCount++;
      this.logger.error(
        { error: error.message, jobId, ticker, errorCount: subscription.errorCount },
        'Error polling subscription'
      );

      // After 5 consecutive errors, give up and notify Flink
      if (subscription.errorCount >= 5) {
        this.logger.error({ jobId, ticker }, 'Cancelling subscription due to repeated errors');
        this.cancelSubscription(jobId);
        try {
          await this.zmqClient.sendComplete(jobId, false, `Polling failed after 5 errors: ${error.message}`);
        } catch (zmqErr) {
          this.logger.error({ jobId, error: zmqErr.message }, 'Failed to send WorkComplete after error');
        }
      }
    }
  }

  getStats() {
    return {
      totalSubscriptions: this.subscriptions.size,
      subscriptions: Array.from(this.subscriptions.values()).map(sub => ({
        jobId: sub.jobId,
        requestId: sub.requestId,
        ticker: sub.ticker,
        isActive: sub.isActive,
        errorCount: sub.errorCount,
        lastTimestamp: sub.lastTimestamp
      }))
    };
  }

  shutdown() {
    this.logger.info('Shutting down realtime poller');

    if (this.pollingLoop) {
      clearInterval(this.pollingLoop);
      this.pollingLoop = null;
    }
    if (this.heartbeatLoop) {
      clearInterval(this.heartbeatLoop);
      this.heartbeatLoop = null;
    }

    for (const subscription of this.subscriptions.values()) {
      subscription.isActive = false;
    }
    this.subscriptions.clear();
  }
}
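A lifecycle sketch of the poller under the new broker protocol, assuming already-constructed ccxtFetcher, kafkaProducer, zmqClient and logger instances; the import path, job id and request id are made up for illustration.

// Sketch only: dependencies are declared as placeholders, path assumed.
import { RealtimePoller } from './realtime-poller.js';
declare const ccxtFetcher: any, kafkaProducer: any, zmqClient: any, logger: any;

const poller = new RealtimePoller(ccxtFetcher, kafkaProducer, zmqClient, logger);
// WorkAssign for a realtime job: start 10s polling and 5s heartbeats for job-42
poller.startSubscription('job-42', 'req-7', 'BTC/USDT.BINANCE', 'market-tick');
// WorkStop from Flink: stop the job; loops shut down once no subscriptions remain
poller.cancelSubscription('job-42');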

View File

@@ -1,116 +1,204 @@
// ZeroMQ DEALER client connecting to Flink IngestorBroker (ROUTER, port 5567)
import * as zmq from 'zeromq';
import {
  DataRequest,
  WorkerReady, WorkComplete, WorkHeartbeat, WorkReject, WorkStop,
  MessageTypeId, PROTOCOL_VERSION
} from './proto/messages.js';

const PROTOCOL_VERSION_BUF = Buffer.from([PROTOCOL_VERSION]);

/**
 * Encodes a broker protocol message for sending over DEALER.
 * Frame layout (DEALER → ROUTER):
 *   Frame 0: empty delimiter (required for ROUTER peering)
 *   Frame 1: [0x01] version
 *   Frame 2: [typeId][protobuf bytes]
 */
function encodeBrokerMessage(typeId, messageData, MessageType) {
  const protoBytes = MessageType.encode(MessageType.create(messageData)).finish();
  const frame2 = Buffer.concat([Buffer.from([typeId]), Buffer.from(protoBytes)]);
  return [Buffer.alloc(0), PROTOCOL_VERSION_BUF, frame2];
}

export class ZmqClient {
  constructor(config, logger) {
    this.config = config;
    this.logger = logger;

    this.dealerSocket = null;
    this.isShutdown = false;
    this.activeJobId = null;
    this._idleHeartbeatInterval = null;
    this.supportedExchanges = (config.supported_exchanges || ['BINANCE', 'COINBASE'])
      .map(e => e.toUpperCase());

    // Callbacks set by IngestorWorker
    this.onWorkAssign = null; // (DataRequest) => void
    this.onWorkStop = null;   // (jobId) => void
  }

  /**
   * Connect DEALER socket to Flink IngestorBroker (ROUTER).
   * Sends WorkerReady immediately so Flink knows this worker is available.
   */
  async connect() {
    const { flink_hostname, ingestor_broker_port = 5567 } = this.config;

    this.dealerSocket = new zmq.Dealer();
    const endpoint = `tcp://${flink_hostname}:${ingestor_broker_port}`;
    await this.dealerSocket.connect(endpoint);
    this.logger.info(`Connected DEALER to Flink IngestorBroker at ${endpoint}`);

    // Register as available
    await this.sendReady();

    // Periodically re-send WorkerReady when idle, to recover from missed initial registration
    this._idleHeartbeatInterval = setInterval(() => {
      if (this.activeJobId === null && !this.isShutdown) {
        this.sendReady().catch(err =>
          this.logger.warn({ error: err.message }, 'Failed to re-send WorkerReady'));
      }
    }, 30_000);

    // Start receiving work in background
    this._receiveLoop();
  }

  /**
   * Send WorkerReady — called on connect and after each COMPLETE.
   */
  async sendReady() {
    const frames = encodeBrokerMessage(
      MessageTypeId.WORKER_READY,
      { exchanges: this.supportedExchanges },
      WorkerReady
    );
    await this.dealerSocket.send(frames);
    this.logger.info({ exchanges: this.supportedExchanges }, 'Sent WorkerReady');
  }

  /**
   * Send WorkComplete after a historical job finishes.
   * Automatically sends WorkerReady so Flink returns us to the free pool.
   */
  async sendComplete(jobId, success, errorMessage) {
    this.activeJobId = null;
    const frames = encodeBrokerMessage(
      MessageTypeId.WORK_COMPLETE,
      {
        jobId,
        success,
        ...(errorMessage ? { errorMessage } : {})
      },
      WorkComplete
    );
    await this.dealerSocket.send(frames);
    this.logger.info({ jobId, success }, 'Sent WorkComplete');

    // Return to free pool
    await this.sendReady();
  }

  /**
   * Send WorkHeartbeat for an active realtime job.
   */
  async sendHeartbeat(jobId) {
    const frames = encodeBrokerMessage(
      MessageTypeId.WORK_HEARTBEAT,
      { jobId },
      WorkHeartbeat
    );
    await this.dealerSocket.send(frames);
    this.logger.debug({ jobId }, 'Sent WorkHeartbeat');
  }

  /**
   * Send WorkReject if we cannot handle the dispatched job.
   */
  async sendReject(jobId, reason) {
    const frames = encodeBrokerMessage(
      MessageTypeId.WORK_REJECT,
      { jobId, reason },
      WorkReject
    );
    await this.dealerSocket.send(frames);
    this.logger.warn({ jobId, reason }, 'Sent WorkReject');
  }

  /**
   * Background loop: receive WorkAssign (DataRequest) or WorkStop from Flink.
   * ROUTER→DEALER frame layout: [empty][version][typeId+payload]
   */
  async _receiveLoop() {
    try {
      for await (const frames of this.dealerSocket) {
        if (this.isShutdown) break;

        try {
          // frames[0] = empty delimiter, frames[1] = version, frames[2] = type+payload
          if (frames.length < 3) {
            this.logger.warn({ frameCount: frames.length }, 'Unexpected frame count from broker');
            continue;
          }

          const versionByte = frames[1][0];
          if (versionByte !== PROTOCOL_VERSION) {
            this.logger.warn({ versionByte }, 'Unexpected protocol version from broker');
            continue;
          }

          const typeId = frames[2][0];
          const payload = frames[2].slice(1);

          if (typeId === MessageTypeId.WORK_ASSIGN) {
            // DataRequest protobuf
            const request = DataRequest.decode(payload);
            const req = DataRequest.toObject(request, {
              longs: String, enums: String, bytes: Buffer
            });
            this.activeJobId = req.jobId;
            this.logger.info(
              { jobId: req.jobId, requestId: req.requestId, type: req.type, ticker: req.ticker },
              'Received WorkAssign from broker'
            );
            if (this.onWorkAssign) {
              this.onWorkAssign(req);
            }
          } else if (typeId === MessageTypeId.WORK_STOP) {
            const stop = WorkStop.decode(payload);
            const { jobId } = WorkStop.toObject(stop);
            this.logger.info({ jobId }, 'Received WorkStop from broker');
            if (this.onWorkStop) {
              this.onWorkStop(jobId);
            }
          } else {
            this.logger.warn({ typeId: `0x${typeId.toString(16)}` }, 'Unknown message type from broker');
          }
        } catch (err) {
          this.logger.error({ error: err.message }, 'Error processing broker message');
        }
      }
    } catch (err) {
      if (!this.isShutdown) {
        this.logger.error({ error: err.message }, 'DEALER receive loop error');
      }
    }
  }

  async shutdown() {
    this.isShutdown = true;
    if (this._idleHeartbeatInterval) {
      clearInterval(this._idleHeartbeatInterval);
      this._idleHeartbeatInterval = null;
    }
    this.logger.info('Shutting down ZMQ DEALER connection');
    if (this.dealerSocket) {
      this.dealerSocket.close();
    }
  }
}
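A minimal sketch of the ROUTER→DEALER frame handling mirrored by _receiveLoop() above; classifyBrokerFrames is a hypothetical helper and assumes the [empty][version][typeId+payload] layout documented in encodeBrokerMessage().

// Hypothetical helper; Buffer is the Node.js global.
function classifyBrokerFrames(frames: Buffer[]): { typeId: number; payload: Buffer } | null {
  // Expected layout: [empty delimiter][0x01 version][typeId + protobuf payload]
  if (frames.length < 3) return null;
  if (frames[1][0] !== 0x01) return null; // unexpected protocol version
  return { typeId: frames[2][0], payload: frames[2].subarray(1) };
}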

View File

@@ -24,6 +24,9 @@ message DataRequest {
  // Flink uses this to determine notification topic
  optional string client_id = 6;

  // Job ID assigned by the IngestorBroker for work tracking and heartbeating
  optional string job_id = 7;

  enum RequestType {
    HISTORICAL_OHLC = 0;
    REALTIME_TICKS = 1;
@@ -327,3 +330,40 @@ message FieldValue {
    uint64 timestamp_val = 6;
  }
}
// ─── Ingestor Broker Protocol (Flink ROUTER ↔ Ingestor DEALER, port 5567) ───
// Message type IDs 0x20–0x25
// Ingestor → Flink: register as available (type 0x20)
// Sent on DEALER connect and after every COMPLETE.
message WorkerReady {
// Exchanges this ingestor supports (e.g. ["BINANCE", "COINBASE"])
repeated string exchanges = 1;
}
// Ingestor → Flink: historical job finished (type 0x21)
message WorkComplete {
string job_id = 1;
bool success = 2;
optional string error_message = 3;
}
// Ingestor → Flink: realtime job still alive — sent every 5s (type 0x22)
message WorkHeartbeat {
string job_id = 1;
}
// Ingestor → Flink: unable to handle this job (type 0x23)
message WorkReject {
string job_id = 1;
string reason = 2;
}
// Flink → Ingestor: dispatch a job — wraps DataRequest (type 0x24)
// DataRequest.job_id is populated by IngestorBroker
// (DataRequest itself is type 0x01; this is the framing type for broker dispatch)
// Flink → Ingestor: stop a realtime job (type 0x25)
message WorkStop {
string job_id = 1;
}
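For quick reference, the type-id bytes named in the comments above, spelled out as a constant map (a sketch; the authoritative MessageTypeId values live in the workers' proto/messages.js).

// Type-id bytes for the Ingestor Broker protocol, per the comments above.
const BrokerMessageTypeId = {
  WORKER_READY:   0x20, // Ingestor → Flink: register as available
  WORK_COMPLETE:  0x21, // Ingestor → Flink: historical job finished
  WORK_HEARTBEAT: 0x22, // Ingestor → Flink: realtime job alive (every 5s)
  WORK_REJECT:    0x23, // Ingestor → Flink: cannot handle the job
  WORK_ASSIGN:    0x24, // Flink → Ingestor: dispatch (wraps DataRequest)
  WORK_STOP:      0x25, // Flink → Ingestor: stop a realtime job
} as const;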

View File

@@ -58,4 +58,8 @@ message OHLCBatchMetadata {
  // Error message if status is ERROR
  optional string error_message = 8;

  // True on the final page of a historical query (including error/not-found markers).
  // Flink publishes HistoryReadyNotification only when this is true.
  bool is_last_page = 9;
}

View File

@@ -16,14 +16,15 @@ pub struct Config {
    #[serde(default = "default_market_data_pub_port")]
    pub market_data_pub_port: u16,

    /// Flink market data endpoint (XSUB - relay subscribes to Flink XPUB)
    #[serde(default = "default_flink_market_data_endpoint")]
    pub flink_market_data_endpoint: String,

    /// Flink request endpoint (PUSH - relay forwards client requests to Flink PULL)
    /// Flink's IngestorBroker binds a PULL socket on port 5566
    #[serde(default = "default_flink_request_endpoint")]
    pub flink_request_endpoint: String,

    /// Request timeout in seconds
    #[serde(default = "default_request_timeout_secs")]
    pub request_timeout_secs: u64,
@@ -45,14 +46,14 @@ fn default_market_data_pub_port() -> u16 {
    5558
}

fn default_flink_market_data_endpoint() -> String {
    "tcp://flink-jobmanager:5558".to_string()
}

fn default_flink_request_endpoint() -> String {
    "tcp://flink-jobmanager:5566".to_string()
}

fn default_request_timeout_secs() -> u64 {
    30
}
@@ -67,8 +68,8 @@ impl Default for Config {
            bind_address: default_bind_address(),
            client_request_port: default_client_request_port(),
            market_data_pub_port: default_market_data_pub_port(),
            flink_market_data_endpoint: default_flink_market_data_endpoint(),
            flink_request_endpoint: default_flink_request_endpoint(),
            request_timeout_secs: default_request_timeout_secs(),
            high_water_mark: default_hwm(),
        }

View File

@@ -7,8 +7,6 @@ use tracing::{debug, error, info, warn};
const PROTOCOL_VERSION: u8 = 0x01;
const MSG_TYPE_SUBMIT_REQUEST: u8 = 0x10;
const MSG_TYPE_SUBMIT_RESPONSE: u8 = 0x11;

pub struct Relay {
    config: Config,
@@ -26,24 +24,21 @@ impl Relay {
    }

    pub async fn run(self) -> Result<()> {
        info!("Initializing ZMQ Relay");

        let client_request_socket = self.create_client_request_socket()?;
        let market_data_frontend = self.create_market_data_frontend()?;
        let market_data_backend = self.create_market_data_backend()?;
        let flink_request_socket = self.create_flink_request_socket()?;

        info!("All sockets initialized — relay forwards requests to Flink");

        tokio::task::spawn_blocking(move || {
            Self::proxy_loop(
                client_request_socket,
                market_data_frontend,
                market_data_backend,
                flink_request_socket,
            )
        })
        .await?
@@ -58,7 +53,6 @@ impl Relay {
        let endpoint = format!("{}:{}", self.config.bind_address, self.config.client_request_port);
        socket.bind(&endpoint)?;
        info!("Client request socket (ROUTER) bound to {}", endpoint);
        Ok(socket)
    }
@@ -71,7 +65,7 @@ impl Relay {
        let endpoint = format!("{}:{}", self.config.bind_address, self.config.market_data_pub_port);
        socket.bind(&endpoint)?;
        info!("Market data frontend (XPUB) bound to {}", endpoint);
        info!("  → Clients subscribe here; subscription events forwarded to Flink for realtime activation");
        Ok(socket)
    }
@@ -82,20 +76,19 @@ impl Relay {
        socket.connect(&self.config.flink_market_data_endpoint)?;
        info!("Market data backend (XSUB) connected to {}", self.config.flink_market_data_endpoint);
        info!("  → Receives market data and notifications from Flink");
        Ok(socket)
    }

    fn create_flink_request_socket(&self) -> Result<zmq::Socket> {
        let socket = self.context.socket(zmq::PUSH)?;
        socket.set_sndhwm(self.config.high_water_mark)?;
        socket.set_linger(1000)?;
        socket.connect(&self.config.flink_request_endpoint)?;
        info!("Flink request socket (PUSH) connected to {}", self.config.flink_request_endpoint);
        info!("  → Forwards SubmitHistoricalRequest to Flink for dispatch to ingestors");
        Ok(socket)
    }
@@ -104,7 +97,7 @@ impl Relay {
        client_request_socket: zmq::Socket,
        market_data_frontend: zmq::Socket,
        market_data_backend: zmq::Socket,
        flink_request_socket: zmq::Socket,
    ) -> Result<()> {
        let mut items = [
            client_request_socket.as_poll_item(zmq::POLLIN),
@@ -112,10 +105,9 @@ impl Relay {
            market_data_backend.as_poll_item(zmq::POLLIN),
        ];

        info!("Entering relay proxy loop");

        loop {
            zmq::poll(&mut items, 100)
                .context("Failed to poll sockets")?;
@@ -123,21 +115,20 @@ impl Relay {
            if items[0].is_readable() {
                if let Err(e) = Self::handle_client_submission(
                    &client_request_socket,
                    &flink_request_socket,
                ) {
                    error!("Error handling client submission: {}", e);
                }
            }

            // Proxy client subscription events → Flink (XPUB → XSUB)
            if items[1].is_readable() {
                if let Err(e) = Self::proxy_subscription(&market_data_frontend, &market_data_backend) {
                    error!("Error proxying subscription: {}", e);
                }
            }

            // Proxy market data from Flink → clients (XSUB → XPUB)
            if items[2].is_readable() {
                if let Err(e) = Self::proxy_market_data(&market_data_backend, &market_data_frontend) {
                    error!("Error proxying market data: {}", e);
@@ -148,7 +139,7 @@ impl Relay {
    fn handle_client_submission(
        client_socket: &zmq::Socket,
        flink_socket: &zmq::Socket,
    ) -> Result<()> {
        // Receive from client: [identity][empty][version][message]
        let identity = client_socket.recv_bytes(0)?;
@@ -177,7 +168,7 @@ impl Relay {
                    identity,
                    payload,
                    client_socket,
                    flink_socket,
                )?;
            }
            _ => {
@@ -192,61 +183,27 @@ impl Relay {
        client_identity: Vec<u8>,
        payload: &[u8],
        client_socket: &zmq::Socket,
        flink_socket: &zmq::Socket,
    ) -> Result<()> {
        // Parse just enough to build the SubmitResponse — relay stays thin
        let request = proto::SubmitHistoricalRequest::decode(payload)
            .context("Failed to parse SubmitHistoricalRequest")?;

        let request_id = request.request_id.clone();
        let client_id = request.client_id.clone();

        info!("Forwarding request to Flink: request_id={}, ticker={}", request_id, request.ticker);

        // Forward the raw request to Flink via PUSH
        // Flink builds DataRequest and dispatches to ingestors via IngestorBroker
        let version_frame = vec![PROTOCOL_VERSION];
        let mut message_frame = vec![MSG_TYPE_SUBMIT_REQUEST];
        message_frame.extend_from_slice(payload);

        flink_socket.send(&version_frame, zmq::SNDMORE)?;
        flink_socket.send(&message_frame, 0)?;

        // Build SubmitResponse — relay still acks the client immediately
// Build SubmitResponse protobuf
// NOTE: This topic is DETERMINISTIC based on client-generated values.
// Client should have already subscribed to this topic BEFORE sending the request
// to prevent race condition where notification arrives before client subscribes.
let notification_topic = if let Some(cid) = &client_id { let notification_topic = if let Some(cid) = &client_id {
format!("RESPONSE:{}", cid) format!("RESPONSE:{}", cid)
} else { } else {
@@ -263,20 +220,16 @@ impl Relay {
let mut response_bytes = Vec::new(); let mut response_bytes = Vec::new();
response.encode(&mut response_bytes)?; response.encode(&mut response_bytes)?;
// Send immediate response to client
let version_frame = vec![PROTOCOL_VERSION]; let version_frame = vec![PROTOCOL_VERSION];
let mut message_frame = vec![MSG_TYPE_SUBMIT_RESPONSE]; let mut resp_message_frame = vec![MSG_TYPE_SUBMIT_RESPONSE];
message_frame.extend_from_slice(&response_bytes); resp_message_frame.extend_from_slice(&response_bytes);
client_socket.send(&client_identity, zmq::SNDMORE)?; client_socket.send(&client_identity, zmq::SNDMORE)?;
client_socket.send(&[] as &[u8], zmq::SNDMORE)?; client_socket.send(&[] as &[u8], zmq::SNDMORE)?;
client_socket.send(&version_frame, zmq::SNDMORE)?; client_socket.send(&version_frame, zmq::SNDMORE)?;
client_socket.send(&message_frame, 0)?; client_socket.send(&resp_message_frame, 0)?;
info!("Sent SubmitResponse to client: request_id={}, topic={}", request_id, notification_topic); info!("Acked client and forwarded to Flink: request_id={}, notification_topic={}", request_id, notification_topic);
// Relay is now DONE with this request - completely stateless!
// Client will receive notification via pub/sub when Flink publishes HistoryReadyNotification
Ok(()) Ok(())
} }
@@ -285,7 +238,7 @@ impl Relay {
frontend: &zmq::Socket, frontend: &zmq::Socket,
backend: &zmq::Socket, backend: &zmq::Socket,
) -> Result<()> { ) -> Result<()> {
// Forward subscription message from XPUB to XSUB // Forward subscription event from XPUB to XSUB so Flink can detect realtime interest
let msg = frontend.recv_bytes(0)?; let msg = frontend.recv_bytes(0)?;
backend.send(&msg, 0)?; backend.send(&msg, 0)?;
@@ -302,10 +255,7 @@ impl Relay {
backend: &zmq::Socket, backend: &zmq::Socket,
frontend: &zmq::Socket, frontend: &zmq::Socket,
) -> Result<()> { ) -> Result<()> {
// Forward all messages from XSUB to XPUB (zero-copy proxy) // Zero-copy proxy: XSUB (Flink) → XPUB (clients)
// This includes:
// - Regular market data (ticks, OHLC)
// - HistoryReadyNotification from Flink
loop { loop {
let msg = backend.recv_bytes(0)?; let msg = backend.recv_bytes(0)?;
let more = backend.get_rcvmore()?; let more = backend.get_rcvmore()?;
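The removed comments spell out a client contract that still holds after this refactor: the RESPONSE:{client_id} notification topic is deterministic, so a client should subscribe before it submits or it can miss the HistoryReadyNotification. A rough pyzmq sketch of that ordering, assuming nothing beyond what the relay code above shows — the endpoints, the protocol-version byte, and the submit message-type byte are placeholders that must match the relay's real constants:

    import uuid
    import zmq

    PROTOCOL_VERSION = 1          # placeholder; must match the relay's constant
    MSG_TYPE_SUBMIT = 0x01        # placeholder; the client->relay submit type byte is not shown in this diff

    ctx = zmq.Context.instance()
    client_id = str(uuid.uuid4())

    # 1. Subscribe to the deterministic notification topic BEFORE submitting,
    #    so HistoryReadyNotification cannot race past the subscription.
    sub = ctx.socket(zmq.SUB)
    sub.connect("tcp://relay:5556")                    # market-data XPUB endpoint (placeholder)
    sub.setsockopt_string(zmq.SUBSCRIBE, f"RESPONSE:{client_id}")

    # 2. Submit over DEALER; the relay acks with SubmitResponse immediately and
    #    forwards the raw SubmitHistoricalRequest to Flink via PUSH.
    req = ctx.socket(zmq.DEALER)
    req.connect("tcp://relay:5555")                    # client request endpoint (placeholder)
    payload = b"..."                                   # serialized SubmitHistoricalRequest proto
    req.send_multipart([b"", bytes([PROTOCOL_VERSION]), bytes([MSG_TYPE_SUBMIT]) + payload])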

View File

@@ -11,7 +11,6 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
&& rm -rf /var/lib/apt/lists/* && rm -rf /var/lib/apt/lists/*
# Copy dependency specifications # Copy dependency specifications
COPY setup.py .
COPY environment.yml . COPY environment.yml .
COPY dexorder/ dexorder/ COPY dexorder/ dexorder/
@@ -27,9 +26,6 @@ RUN mkdir -p dexorder/generated && \
RUN conda env create -f environment.yml -p /build/env && \ RUN conda env create -f environment.yml -p /build/env && \
conda clean -afy conda clean -afy
# Install the local package into the conda environment
RUN /build/env/bin/pip install --no-cache-dir .
# ============================================================================= # =============================================================================
# Runtime stage # Runtime stage
# ============================================================================= # =============================================================================
@@ -75,7 +71,8 @@ RUN chmod 755 /app/entrypoint.sh && chown root:root /app/entrypoint.sh
USER dexorder USER dexorder
# Environment variables (can be overridden in k8s) # Environment variables (can be overridden in k8s)
ENV PYTHONUNBUFFERED=1 \ ENV PYTHONPATH=/app \
PYTHONUNBUFFERED=1 \
MPLCONFIGDIR=/tmp \ MPLCONFIGDIR=/tmp \
NUMBA_CACHE_DIR=/tmp/numba_cache \ NUMBA_CACHE_DIR=/tmp/numba_cache \
LOG_LEVEL=INFO \ LOG_LEVEL=INFO \

View File

@@ -12,6 +12,7 @@ For research scripts, import and use get_api() to access the API:
""" """
import logging import logging
import threading
from typing import Optional from typing import Optional
from dexorder.api.api import API from dexorder.api.api import API
@@ -23,10 +24,13 @@ log = logging.getLogger(__name__)
# Global API instance - managed by main.py # Global API instance - managed by main.py
_global_api: Optional[API] = None _global_api: Optional[API] = None
# Thread-local API — used by harness threads so they don't overwrite the global
_thread_local = threading.local()
def get_api() -> API: def get_api() -> API:
""" """
Get the global API instance for accessing market data and charts. Get the API instance for accessing market data and charts.
Use this in research scripts to access the data and charting APIs. Use this in research scripts to access the data and charting APIs.
@@ -53,15 +57,27 @@ def get_api() -> API:
# Create chart # Create chart
fig, ax = api.charting.plot_ohlc(df, title="BTC/USDT") fig, ax = api.charting.plot_ohlc(df, title="BTC/USDT")
""" """
# Thread-local takes priority (set by harness threads)
api = getattr(_thread_local, 'api', None)
if api is not None:
return api
if _global_api is None: if _global_api is None:
raise RuntimeError("API not initialized") raise RuntimeError("API not initialized")
return _global_api return _global_api
def set_api(api: API) -> None: def set_api(api: API) -> None:
"""Set the global API instance. Internal use only.""" """Set the API instance.
global _global_api
_global_api = api When called from the main thread, sets the global API used by all threads.
When called from a non-main thread (e.g. harness threads), sets a thread-local
API so the global is not overwritten.
"""
if threading.current_thread() is threading.main_thread():
global _global_api
_global_api = api
else:
_thread_local.api = api
__all__ = ['API', 'ChartingAPI', 'DataAPI', 'get_api', 'set_api'] __all__ = ['API', 'ChartingAPI', 'DataAPI', 'get_api', 'set_api']
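A small usage sketch of the split behaviour, assuming the exports shown in __all__ and an already-constructed API instance for the harness thread (the harness_api name is illustrative):

    import threading
    from dexorder.api import get_api, set_api   # import path assumed from this module's __all__

    def harness_worker(api_instance):
        # Non-main thread: set_api() writes the thread-local slot, not the global.
        set_api(api_instance)
        assert get_api() is api_instance         # this thread now sees its own API
        # ... run the harness workload against get_api() ...

    # `harness_api` stands in for a separately constructed API(...) instance.
    threading.Thread(target=harness_worker, args=(harness_api,), daemon=True).start()
    # Meanwhile get_api() on the main thread still returns the global set by main.py.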

View File

@@ -3,6 +3,12 @@ Conda Package Manager
Manages dynamic installation and cleanup of conda packages for user components. Manages dynamic installation and cleanup of conda packages for user components.
Scans metadata files to determine required packages and syncs the conda environment. Scans metadata files to determine required packages and syncs the conda environment.
Extra packages (user-installed beyond the base container) are tracked in
``extra_packages.json`` under ``data_dir`` so they can be removed when no
script references them. Packages that are later promoted into the base image
(i.e. appear in ``environment.yml``) are silently evicted from tracking
rather than uninstalled.
""" """
import json import json
@@ -12,6 +18,10 @@ import sys
from pathlib import Path from pathlib import Path
from typing import Optional, Set from typing import Optional, Set
# Filename (stored under data_dir, outside the git repo) for tracking
# user-installed extra packages.
EXTRA_PACKAGES_FILENAME = "extra_packages.json"
log = logging.getLogger(__name__) log = logging.getLogger(__name__)
@@ -102,12 +112,35 @@ def get_installed_packages() -> Set[str]:
return set() return set()
def install_packages(packages: list[str]) -> dict: def load_extra_packages(data_dir: Path) -> Set[str]:
"""Load the set of user-installed extra packages (beyond the base container)."""
path = data_dir / EXTRA_PACKAGES_FILENAME
if path.exists():
try:
return set(json.loads(path.read_text()))
except Exception as e:
log.error(f"Failed to load extra packages: {e}")
return set()
def save_extra_packages(data_dir: Path, packages: Set[str]) -> None:
"""Persist the set of user-installed extra packages."""
path = data_dir / EXTRA_PACKAGES_FILENAME
try:
path.write_text(json.dumps(sorted(packages)))
except Exception as e:
log.error(f"Failed to save extra packages: {e}")
def install_packages(packages: list[str], data_dir: Optional[Path] = None) -> dict:
""" """
Install conda packages if not already installed. Install conda packages if not already installed.
Args: Args:
packages: List of package names to install packages: List of package names to install
data_dir: If provided, newly installed packages are added to the extra
package tracking file (``extra_packages.json``) so they can
be cleaned up when no longer needed.
Returns: Returns:
dict with: dict with:
@@ -154,6 +187,10 @@ def install_packages(packages: list[str]) -> dict:
if result.returncode == 0: if result.returncode == 0:
log.info(f"Successfully installed packages: {to_install}") log.info(f"Successfully installed packages: {to_install}")
if data_dir:
extras = load_extra_packages(data_dir)
extras.update(to_install)
save_extra_packages(data_dir, extras)
return { return {
"success": True, "success": True,
"installed": to_install, "installed": to_install,
@@ -324,9 +361,59 @@ def get_base_packages(environment_yml: Path) -> Set[str]:
# ============================================================================= # =============================================================================
# Sync Operation # Cleanup and Sync Operations
# ============================================================================= # =============================================================================
def cleanup_extra_packages(data_dir: Path, environment_yml: Optional[Path] = None) -> dict:
"""
Remove tracked extra packages that are no longer needed by any script.
Only packages previously recorded in ``extra_packages.json`` are ever
considered for removal — base container packages are never touched.
Packages that have since been promoted into the base container image
(i.e. now appear in ``environment.yml``) are quietly evicted from the
tracking file without being uninstalled.
Args:
data_dir: Base data directory (tracking file lives here)
environment_yml: Path to environment.yml for base package reconciliation
Returns:
dict with:
- success: bool
- to_remove: list[str] - packages identified for removal
- removed: list[str] - packages actually removed
- error: str (if any)
"""
src_dir = data_dir / "src"
required = scan_metadata_packages(src_dir)
base = get_base_packages(environment_yml) if environment_yml and environment_yml.exists() else set()
extras = load_extra_packages(data_dir)
# Packages promoted into the base image are no longer "extra" — evict from tracking
now_base = extras & base
if now_base:
log.info(f"Packages promoted to base image, evicting from extra tracking: {now_base}")
extras -= now_base
# Only remove packages that are tracked as extras and no longer referenced by any script
to_remove = sorted(extras - required)
result: dict = {"success": True, "to_remove": to_remove, "removed": []}
if to_remove:
remove_result = remove_packages(to_remove)
result["success"] = remove_result["success"]
result["removed"] = remove_result.get("removed", [])
if remove_result["success"]:
extras -= set(to_remove)
else:
result["error"] = remove_result.get("error")
save_extra_packages(data_dir, extras)
return result
def sync_packages(data_dir: Path, environment_yml: Optional[Path] = None) -> dict: def sync_packages(data_dir: Path, environment_yml: Optional[Path] = None) -> dict:
""" """
Sync conda packages with metadata requirements. Sync conda packages with metadata requirements.
@@ -350,8 +437,8 @@ def sync_packages(data_dir: Path, environment_yml: Optional[Path] = None) -> dic
""" """
log.info("Starting conda package sync") log.info("Starting conda package sync")
# Get required packages from metadata # Metadata lives under data_dir/src/category/item/metadata.json
required_packages = scan_metadata_packages(data_dir) required_packages = scan_metadata_packages(data_dir / "src")
log.info(f"Required packages from metadata: {required_packages}") log.info(f"Required packages from metadata: {required_packages}")
# Get base packages from environment.yml # Get base packages from environment.yml

View File

@@ -42,6 +42,7 @@ class IcebergClient:
s3_endpoint: Optional[str] = None, s3_endpoint: Optional[str] = None,
s3_access_key: Optional[str] = None, s3_access_key: Optional[str] = None,
s3_secret_key: Optional[str] = None, s3_secret_key: Optional[str] = None,
s3_region: Optional[str] = None,
): ):
""" """
Initialize Iceberg client. Initialize Iceberg client.
@@ -52,6 +53,7 @@ class IcebergClient:
s3_endpoint: S3/MinIO endpoint URL (e.g., "http://localhost:9000") s3_endpoint: S3/MinIO endpoint URL (e.g., "http://localhost:9000")
s3_access_key: S3/MinIO access key s3_access_key: S3/MinIO access key
s3_secret_key: S3/MinIO secret key s3_secret_key: S3/MinIO secret key
s3_region: S3/MinIO region (e.g., "us-east-1")
""" """
self.catalog_uri = catalog_uri self.catalog_uri = catalog_uri
self.namespace = namespace self.namespace = namespace
@@ -64,6 +66,8 @@ class IcebergClient:
catalog_props["s3.access-key-id"] = s3_access_key catalog_props["s3.access-key-id"] = s3_access_key
if s3_secret_key: if s3_secret_key:
catalog_props["s3.secret-access-key"] = s3_secret_key catalog_props["s3.secret-access-key"] = s3_secret_key
if s3_region:
catalog_props["s3.region"] = s3_region
self.catalog = load_catalog("trading", **catalog_props) self.catalog = load_catalog("trading", **catalog_props)
self.table = self.catalog.load_table(f"{namespace}.ohlc") self.table = self.catalog.load_table(f"{namespace}.ohlc")
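A minimal construction sketch with the new parameter; endpoint, credential, and namespace values are placeholders, and s3_region simply flows into the catalog's "s3.region" property:

    client = IcebergClient(
        "http://iceberg-rest:8181",      # catalog_uri (placeholder)
        "market_data",                   # namespace; tables resolve as "<namespace>.ohlc"
        s3_endpoint="http://minio:9000",
        s3_access_key="minio",
        s3_secret_key="minio123",
        s3_region="us-east-1",           # new: sets catalog property "s3.region"
    )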

View File

@@ -15,6 +15,13 @@ log = logging.getLogger(__name__)
# Standard OHLC columns always returned # Standard OHLC columns always returned
STANDARD_COLUMNS = ["timestamp", "open", "high", "low", "close"] STANDARD_COLUMNS = ["timestamp", "open", "high", "low", "close"]
# All optional columns from the OHLC proto spec, returned by default when extra_columns=None
OHLC_OPTIONAL_COLUMNS = [
"volume", "buy_vol", "sell_vol",
"open_time", "high_time", "low_time", "close_time",
"open_interest",
]
# All valid extra columns available in the Iceberg schema # All valid extra columns available in the Iceberg schema
VALID_EXTRA_COLUMNS = { VALID_EXTRA_COLUMNS = {
"volume", "buy_vol", "sell_vol", "volume", "buy_vol", "sell_vol",
@@ -43,6 +50,7 @@ class DataAPIImpl(DataAPI):
s3_endpoint: Optional[str] = None, s3_endpoint: Optional[str] = None,
s3_access_key: Optional[str] = None, s3_access_key: Optional[str] = None,
s3_secret_key: Optional[str] = None, s3_secret_key: Optional[str] = None,
s3_region: Optional[str] = None,
request_timeout: float = 30.0, request_timeout: float = 30.0,
): ):
""" """
@@ -56,6 +64,7 @@ class DataAPIImpl(DataAPI):
s3_endpoint: S3/MinIO endpoint URL (e.g., "http://minio:9000") s3_endpoint: S3/MinIO endpoint URL (e.g., "http://minio:9000")
s3_access_key: S3/MinIO access key s3_access_key: S3/MinIO access key
s3_secret_key: S3/MinIO secret key s3_secret_key: S3/MinIO secret key
s3_region: S3/MinIO region (e.g., "us-east-1")
request_timeout: Default timeout for historical data requests in seconds (default: 30) request_timeout: Default timeout for historical data requests in seconds (default: 30)
""" """
self.ohlc_client = OHLCClient( self.ohlc_client = OHLCClient(
@@ -66,6 +75,7 @@ class DataAPIImpl(DataAPI):
s3_endpoint=s3_endpoint, s3_endpoint=s3_endpoint,
s3_access_key=s3_access_key, s3_access_key=s3_access_key,
s3_secret_key=s3_secret_key, s3_secret_key=s3_secret_key,
s3_region=s3_region,
) )
self.request_timeout = request_timeout self.request_timeout = request_timeout
self._started = False self._started = False
@@ -120,7 +130,9 @@ class DataAPIImpl(DataAPI):
# Determine which columns to fetch # Determine which columns to fetch
columns_to_fetch = STANDARD_COLUMNS.copy() columns_to_fetch = STANDARD_COLUMNS.copy()
if extra_columns: if extra_columns is None:
columns_to_fetch.extend(OHLC_OPTIONAL_COLUMNS)
elif extra_columns:
columns_to_fetch.extend(extra_columns) columns_to_fetch.extend(extra_columns)
# Use OHLCClient which handles smart caching: # Use OHLCClient which handles smart caching:
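The practical effect of the default change, sketched against an assumed fetch method (the method name, ticker, and time arguments are illustrative — only the column-selection behaviour comes from this hunk):

    # `api` stands for a started DataAPIImpl instance.
    start, end = 1_700_000_000, 1_700_086_400      # illustrative Unix-second range

    # extra_columns=None (the default) now pulls every optional OHLC column:
    # volume, buy_vol, sell_vol, open/high/low/close_time, open_interest.
    df_full = api.get_ohlc("BTC/USDT.BINANCE", start, end, period_seconds=3600)

    # An explicit list still narrows the fetch ...
    df_vol = api.get_ohlc("BTC/USDT.BINANCE", start, end, period_seconds=3600,
                          extra_columns=["volume"])

    # ... and an empty list keeps only the standard timestamp/open/high/low/close.
    df_bare = api.get_ohlc("BTC/USDT.BINANCE", start, end, period_seconds=3600,
                           extra_columns=[])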

View File

@@ -93,6 +93,82 @@ def _load_strategy_class(impl_path: Path) -> type:
# Metrics extraction # Metrics extraction
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
def _money_to_float(val) -> float | None:
"""Convert a Nautilus Money object or string like '15.32 USDT' to float."""
if val is None:
return None
try:
if hasattr(val, "as_decimal"):
return float(val.as_decimal())
s = str(val).strip()
if s and s.lower() not in ("none", "nan"):
return float(s.split()[0])
except (ValueError, TypeError, IndexError):
pass
return None
def _ts_to_s(raw) -> int | None:
"""Convert a Nautilus nanosecond timestamp to Unix seconds."""
try:
return int(raw) // 1_000_000_000
except (TypeError, ValueError):
return None
def _extract_fills(engine) -> pd.DataFrame:
"""Return a sorted fills DataFrame from BacktestEngine, or empty DataFrame."""
try:
df = engine.trader.generate_order_fills_report()
if df is not None and len(df) > 0:
if "ts_event" in df.columns:
df = df.sort_values("ts_event")
return df
except Exception as exc:
log.debug("generate_order_fills_report() failed: %s", exc)
return pd.DataFrame()
def _extract_trades(fills_df: pd.DataFrame, initial_capital: float) -> list[dict]:
"""
Pair fills into round-trip trades: buy → sell or sell → buy.
Returns a list of trade dicts (capped at 500 for large backtests).
"""
if fills_df.empty:
return []
trades: list[dict] = []
open_positions: dict[str, dict] = {} # instrument_id -> pending entry
for _, fill in fills_df.iterrows():
instrument = str(fill.get("instrument_id", ""))
side = str(fill.get("order_side", "")).upper()
qty = _money_to_float(fill.get("last_qty")) or 0.0
price = _money_to_float(fill.get("last_px")) or 0.0
ts_s = _ts_to_s(fill.get("ts_event"))
rpnl = _money_to_float(fill.get("realized_pnl"))
if rpnl is not None and rpnl != 0.0:
# This fill closes a position — record as a completed trade
entry = open_positions.pop(instrument, None)
trade = {
"instrument": instrument,
"side": side,
"quantity": round(qty, 8),
"entry_price": round(entry["price"], 8) if entry else None,
"exit_price": round(price, 8),
"entry_time": entry["ts_s"] if entry else None,
"exit_time": ts_s,
"pnl": round(rpnl, 6),
}
trades.append(trade)
else:
# Opening fill — store for pairing
open_positions[instrument] = {"price": price, "ts_s": ts_s, "side": side}
return trades[:500] # cap for large backtests
def _compute_metrics( def _compute_metrics(
engine, engine,
venue_strs: list[str], venue_strs: list[str],
@@ -100,17 +176,18 @@ def _compute_metrics(
all_bars: list, all_bars: list,
) -> dict[str, Any]: ) -> dict[str, Any]:
""" """
Extract performance metrics from a completed BacktestEngine. Extract structured performance metrics from a completed BacktestEngine.
Returns dict with: Returns dict with:
total_return float — fractional (0.15 = +15%) summary dict — core metrics (total_return, sharpe, drawdown, etc.)
sharpe_ratio float — annualized; 0.0 if no trades or constant equity statistics dict — extended stats (sortino, calmar, profit_factor, etc.)
max_drawdown float — max peak-to-trough as fraction (0.10 = 10% drawdown) trades list — individual round-trip trades (capped at 500)
win_rate float — fraction of trades with positive realized PnL equity_curve list[{timestamp: int_unix_s, equity: float}]
trade_count int
equity_curve list[{timestamp: int_unix_s, equity: float}]
""" """
# Reconstruct equity curve from fills fills_df = _extract_fills(engine)
trades = _extract_trades(fills_df, initial_capital)
# --- Equity curve reconstruction ---
equity_points: list[dict] = [] equity_points: list[dict] = []
if all_bars: if all_bars:
equity_points.append({ equity_points.append({
@@ -121,51 +198,24 @@ def _compute_metrics(
running_equity = initial_capital running_equity = initial_capital
trade_count = 0 trade_count = 0
winning_trades = 0 winning_trades = 0
total_profit = 0.0
total_loss = 0.0
try: if not fills_df.empty:
fills_df = engine.trader.generate_order_fills_report()
except Exception as exc:
log.debug("generate_order_fills_report() failed: %s", exc)
fills_df = None
if fills_df is not None and len(fills_df) > 0:
# Sort by event time
if "ts_event" in fills_df.columns:
fills_df = fills_df.sort_values("ts_event")
for _, fill in fills_df.iterrows(): for _, fill in fills_df.iterrows():
rpnl = fill.get("realized_pnl") if hasattr(fill, "get") else None rpnl = _money_to_float(fill.get("realized_pnl"))
if rpnl is None: if rpnl is None or rpnl == 0.0:
continue continue
ts_s = _ts_to_s(fill.get("ts_event"))
# Nautilus Money objects: str form is "15.32 USDT" running_equity += rpnl
rpnl_float: float | None = None trade_count += 1
try: if rpnl > 0:
if hasattr(rpnl, "as_decimal"): winning_trades += 1
rpnl_float = float(rpnl.as_decimal()) total_profit += rpnl
elif rpnl is not None: else:
rpnl_str = str(rpnl).strip() total_loss += abs(rpnl)
if rpnl_str and rpnl_str.lower() not in ("none", "nan"): if ts_s is not None:
rpnl_float = float(rpnl_str.split()[0]) equity_points.append({"timestamp": ts_s, "equity": running_equity})
except (ValueError, TypeError, IndexError):
pass
if rpnl_float is not None and rpnl_float != 0.0:
ts_s: int | None = None
raw_ts = fill.get("ts_event") if hasattr(fill, "get") else None
if raw_ts is not None:
try:
ts_s = int(raw_ts) // 1_000_000_000
except (TypeError, ValueError):
pass
running_equity += rpnl_float
trade_count += 1
if rpnl_float > 0:
winning_trades += 1
if ts_s is not None:
equity_points.append({"timestamp": ts_s, "equity": running_equity})
if all_bars: if all_bars:
equity_points.append({ equity_points.append({
@@ -173,19 +223,16 @@ def _compute_metrics(
"equity": running_equity, "equity": running_equity,
}) })
# Try to get actual final balance from the account (more accurate than fill reconstruction) # Prefer definitive final balance from account cache
try: try:
from nautilus_trader.model.identifiers import Venue from nautilus_trader.model.identifiers import Venue
for venue_str in venue_strs: for venue_str in venue_strs:
account = engine.cache.account_for_venue(Venue(venue_str)) account = engine.cache.account_for_venue(Venue(venue_str))
if account is None: if account is None:
continue continue
# Sum all balances (quote currency is what we started with)
for bal in account.balances().values(): for bal in account.balances().values():
total = getattr(bal, "total", None) final_val = _money_to_float(getattr(bal, "total", None))
if total is not None: if final_val is not None:
final_val = float(str(total).split()[0]) if not hasattr(total, "as_decimal") else float(total.as_decimal())
# Use the account balance as the definitive final equity
running_equity = final_val running_equity = final_val
if equity_points: if equity_points:
equity_points[-1]["equity"] = running_equity equity_points[-1]["equity"] = running_equity
@@ -193,36 +240,71 @@ def _compute_metrics(
except Exception as exc: except Exception as exc:
log.debug("Account balance extraction failed: %s", exc) log.debug("Account balance extraction failed: %s", exc)
# Core metrics # --- Core metrics ---
total_return = (running_equity - initial_capital) / initial_capital if initial_capital else 0.0 total_return = (running_equity - initial_capital) / initial_capital if initial_capital else 0.0
win_rate = winning_trades / trade_count if trade_count > 0 else 0.0 win_rate = winning_trades / trade_count if trade_count > 0 else 0.0
profit_factor = (total_profit / total_loss) if total_loss > 0 else (float("inf") if total_profit > 0 else 0.0)
# Determine bar duration for annualisation
bar_duration_ns = 0.0
if all_bars and len(all_bars) > 1:
bar_duration_ns = (all_bars[-1].ts_event - all_bars[0].ts_event) / max(len(all_bars) - 1, 1)
bars_per_year = (365 * 24 * 3600 * 1e9) / bar_duration_ns if bar_duration_ns > 0 else 0.0
equity_series = pd.Series([p["equity"] for p in equity_points]) if len(equity_points) > 2 else pd.Series([initial_capital, running_equity])
returns = equity_series.pct_change().dropna()
# Sharpe ratio (annualized) from equity curve returns
sharpe = 0.0 sharpe = 0.0
if len(equity_points) > 2 and all_bars and len(all_bars) > 1: sortino = 0.0
equity_series = pd.Series([p["equity"] for p in equity_points]) if len(returns) > 1 and bars_per_year > 0:
returns = equity_series.pct_change().dropna() mean_r = returns.mean()
if len(returns) > 1 and returns.std() > 0: std_r = returns.std()
bar_duration_ns = (all_bars[-1].ts_event - all_bars[0].ts_event) / max(len(all_bars) - 1, 1) if std_r > 0:
if bar_duration_ns > 0: sharpe = float((mean_r / std_r) * (bars_per_year ** 0.5))
bars_per_year = (365 * 24 * 3600 * 1e9) / bar_duration_ns downside = returns[returns < 0]
sharpe = float((returns.mean() / returns.std()) * (bars_per_year ** 0.5)) downside_std = downside.std() if len(downside) > 1 else 0.0
if downside_std > 0:
sortino = float((mean_r / downside_std) * (bars_per_year ** 0.5))
# Max drawdown # Max drawdown
max_drawdown = 0.0 max_drawdown = 0.0
if len(equity_points) > 1: if len(equity_series) > 1:
equity_arr = pd.Series([p["equity"] for p in equity_points]) rolling_max = equity_series.cummax()
rolling_max = equity_arr.cummax() drawdowns = (equity_series - rolling_max) / rolling_max.replace(0, float("nan"))
drawdowns = (equity_arr - rolling_max) / rolling_max.replace(0, float("nan"))
max_drawdown = float(abs(drawdowns.min())) if len(drawdowns) > 0 else 0.0 max_drawdown = float(abs(drawdowns.min())) if len(drawdowns) > 0 else 0.0
# Calmar ratio
annualized_return = 0.0
if bars_per_year > 0 and len(all_bars) > 1:
years = (all_bars[-1].ts_event - all_bars[0].ts_event) / (365 * 24 * 3600 * 1e9)
if years > 0:
annualized_return = (running_equity / initial_capital) ** (1.0 / years) - 1 if initial_capital else 0.0
calmar = annualized_return / max_drawdown if max_drawdown > 0 else 0.0
# Average win / average loss
avg_win = total_profit / winning_trades if winning_trades > 0 else 0.0
avg_loss = total_loss / (trade_count - winning_trades) if (trade_count - winning_trades) > 0 else 0.0
return { return {
"total_return": round(total_return, 6), "summary": {
"sharpe_ratio": round(sharpe, 4), "total_return": round(total_return, 6),
"max_drawdown": round(max_drawdown, 6), "sharpe_ratio": round(sharpe, 4),
"win_rate": round(win_rate, 4), "max_drawdown": round(max_drawdown, 6),
"trade_count": trade_count, "win_rate": round(win_rate, 4),
"equity_curve": equity_points, "trade_count": trade_count,
"total_trades": len(trades),
},
"statistics": {
"sortino_ratio": round(sortino, 4),
"calmar_ratio": round(calmar, 4),
"profit_factor": round(profit_factor, 4) if profit_factor != float("inf") else None,
"avg_win": round(avg_win, 4),
"avg_loss": round(avg_loss, 4),
"total_profit": round(total_profit, 4),
"total_loss": round(total_loss, 4),
},
"trades": trades,
"equity_curve": equity_points,
} }
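A short sketch of consuming the restructured result (the engine/venue/capital/bars inputs are whatever the surrounding runner already has in scope; only the key layout comes from the return value above):

    metrics = _compute_metrics(engine, venue_strs, initial_capital, all_bars)  # argument order assumed

    summary, stats = metrics["summary"], metrics["statistics"]
    print(f"return={summary['total_return']:+.2%}  sharpe={summary['sharpe_ratio']}  "
          f"maxDD={summary['max_drawdown']:.2%}  trades={summary['trade_count']}")
    print(f"sortino={stats['sortino_ratio']}  calmar={stats['calmar_ratio']}  "
          f"profit_factor={stats['profit_factor']}")

    # Round-trip trades (capped at 500) and the equity curve are now returned directly:
    for trade in metrics["trades"][:5]:
        print(trade["instrument"], trade["side"], trade["pnl"])
    curve = [(p["timestamp"], p["equity"]) for p in metrics["equity_curve"]]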

View File

@@ -13,6 +13,7 @@ make_instrument_from_metadata — instrument with best-effort precision
from __future__ import annotations from __future__ import annotations
import logging import logging
from decimal import Decimal
from typing import Optional from typing import Optional
import pandas as pd import pandas as pd
@@ -71,8 +72,8 @@ def make_instrument(
size_precision: int = 8, size_precision: int = 8,
tick_size: Optional[float] = None, tick_size: Optional[float] = None,
lot_size: Optional[float] = None, lot_size: Optional[float] = None,
maker_fee: float = 0.001, maker_fee: float = 0.0,
taker_fee: float = 0.001, taker_fee: float = 0.0,
margin_init: float = 0.0, margin_init: float = 0.0,
margin_maint: float = 0.0, margin_maint: float = 0.0,
) -> CurrencyPair: ) -> CurrencyPair:
@@ -118,8 +119,8 @@ def make_instrument(
min_price=None, min_price=None,
margin_init=margin_init, margin_init=margin_init,
margin_maint=margin_maint, margin_maint=margin_maint,
maker_fee=maker_fee, maker_fee=Decimal(str(maker_fee)),
taker_fee=taker_fee, taker_fee=Decimal(str(taker_fee)),
ts_event=ts_now, ts_event=ts_now,
ts_init=ts_now, ts_init=ts_now,
) )
@@ -154,8 +155,8 @@ def make_instrument_from_metadata(ticker: str) -> tuple[CurrencyPair, int, int]:
size_precision=sp, size_precision=sp,
tick_size=meta.tick_size, tick_size=meta.tick_size,
lot_size=meta.lot_size, lot_size=meta.lot_size,
maker_fee=meta.maker_fee or 0.001, maker_fee=meta.maker_fee or 0.0,
taker_fee=meta.taker_fee or 0.001, taker_fee=meta.taker_fee or 0.0,
margin_init=meta.margin_init or 0.0, margin_init=meta.margin_init or 0.0,
margin_maint=meta.margin_maint or 0.0, margin_maint=meta.margin_maint or 0.0,
) )

View File

@@ -39,6 +39,7 @@ class OHLCClient:
s3_endpoint: str = None, s3_endpoint: str = None,
s3_access_key: str = None, s3_access_key: str = None,
s3_secret_key: str = None, s3_secret_key: str = None,
s3_region: str = None,
): ):
""" """
Initialize OHLC client. Initialize OHLC client.
@@ -51,12 +52,14 @@ class OHLCClient:
s3_endpoint: S3/MinIO endpoint URL (e.g., "http://localhost:9000") s3_endpoint: S3/MinIO endpoint URL (e.g., "http://localhost:9000")
s3_access_key: S3/MinIO access key s3_access_key: S3/MinIO access key
s3_secret_key: S3/MinIO secret key s3_secret_key: S3/MinIO secret key
s3_region: S3/MinIO region (e.g., "us-east-1")
""" """
self.iceberg = IcebergClient( self.iceberg = IcebergClient(
iceberg_catalog_uri, namespace, iceberg_catalog_uri, namespace,
s3_endpoint=s3_endpoint, s3_endpoint=s3_endpoint,
s3_access_key=s3_access_key, s3_access_key=s3_access_key,
s3_secret_key=s3_secret_key, s3_secret_key=s3_secret_key,
s3_region=s3_region,
) )
self.history = HistoryClient(relay_endpoint, notification_endpoint) self.history = HistoryClient(relay_endpoint, notification_endpoint)
log.info("OHLCClient initialized") log.info("OHLCClient initialized")
@@ -122,7 +125,7 @@ class OHLCClient:
if not missing_ranges: if not missing_ranges:
# All data exists in Iceberg # All data exists in Iceberg
return self._forward_fill_gaps(df, period_seconds) return df
# Step 3: Request missing data for each range # Step 3: Request missing data for each range
# For simplicity, request entire range (relay can merge adjacent requests) # For simplicity, request entire range (relay can merge adjacent requests)

View File

@@ -0,0 +1 @@
# Strategy runtime package

View File

@@ -0,0 +1,361 @@
"""
SQLite database for strategy execution state, trade logs, and backtest history.
All data is stored under DATA_DIR/dexorder.db.
Uses aiosqlite for async compatibility with the MCP server's event loop.
"""
from __future__ import annotations
import json
import logging
import time
from pathlib import Path
from typing import Any, Optional
log = logging.getLogger(__name__)
_SCHEMA = """
CREATE TABLE IF NOT EXISTS strategies (
name TEXT PRIMARY KEY,
status TEXT NOT NULL DEFAULT 'stopped',
git_rev TEXT,
worktree_path TEXT,
started_at REAL,
stopped_at REAL,
allocation REAL NOT NULL DEFAULT 0,
paper INTEGER NOT NULL DEFAULT 1,
feeds_json TEXT,
config_json TEXT
);
CREATE TABLE IF NOT EXISTS strategy_state (
name TEXT PRIMARY KEY,
realized_pnl REAL NOT NULL DEFAULT 0,
unrealized_pnl REAL NOT NULL DEFAULT 0,
trade_count INTEGER NOT NULL DEFAULT 0,
positions_json TEXT,
updated_at REAL NOT NULL DEFAULT 0
);
CREATE TABLE IF NOT EXISTS trades (
id INTEGER PRIMARY KEY AUTOINCREMENT,
strategy_name TEXT NOT NULL,
instrument TEXT NOT NULL,
side TEXT NOT NULL,
quantity REAL NOT NULL,
entry_price REAL,
exit_price REAL NOT NULL,
entry_time REAL,
exit_time REAL NOT NULL,
pnl REAL NOT NULL,
recorded_at REAL NOT NULL DEFAULT (unixepoch())
);
CREATE TABLE IF NOT EXISTS backtest_runs (
id INTEGER PRIMARY KEY AUTOINCREMENT,
strategy_name TEXT NOT NULL,
ran_at REAL NOT NULL DEFAULT (unixepoch()),
from_time REAL,
to_time REAL,
initial_capital REAL,
feeds_json TEXT,
summary_json TEXT,
statistics_json TEXT,
trades_json TEXT,
equity_curve_json TEXT
);
CREATE TABLE IF NOT EXISTS strategy_events (
id INTEGER PRIMARY KEY AUTOINCREMENT,
strategy_name TEXT NOT NULL,
event_type TEXT NOT NULL,
payload_json TEXT,
recorded_at REAL NOT NULL DEFAULT (unixepoch())
);
CREATE INDEX IF NOT EXISTS idx_trades_strategy ON trades(strategy_name);
CREATE INDEX IF NOT EXISTS idx_backtest_strategy ON backtest_runs(strategy_name);
CREATE INDEX IF NOT EXISTS idx_events_strategy ON strategy_events(strategy_name);
"""
class StrategyDB:
"""Async SQLite interface for strategy persistence."""
def __init__(self, db_path: Path):
self.db_path = db_path
async def initialize(self) -> None:
"""Create tables if they don't exist."""
import aiosqlite
self.db_path.parent.mkdir(parents=True, exist_ok=True)
async with aiosqlite.connect(self.db_path) as db:
await db.executescript(_SCHEMA)
await db.commit()
log.info("StrategyDB initialized at %s", self.db_path)
# ------------------------------------------------------------------
# Strategy lifecycle
# ------------------------------------------------------------------
async def upsert_strategy(
self,
name: str,
status: str,
allocation: float,
paper: bool,
feeds: list[dict],
git_rev: Optional[str] = None,
worktree_path: Optional[str] = None,
config: Optional[dict] = None,
) -> None:
import aiosqlite
now = time.time()
async with aiosqlite.connect(self.db_path) as db:
await db.execute("""
INSERT INTO strategies
(name, status, git_rev, worktree_path, started_at, allocation, paper, feeds_json, config_json)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
ON CONFLICT(name) DO UPDATE SET
status=excluded.status,
git_rev=excluded.git_rev,
worktree_path=excluded.worktree_path,
started_at=excluded.started_at,
allocation=excluded.allocation,
paper=excluded.paper,
feeds_json=excluded.feeds_json,
config_json=excluded.config_json
""", (
name, status, git_rev, worktree_path, now,
allocation, int(paper),
json.dumps(feeds),
json.dumps(config or {}),
))
await db.commit()
async def update_strategy_status(self, name: str, status: str) -> None:
import aiosqlite
async with aiosqlite.connect(self.db_path) as db:
if status == "stopped":
await db.execute(
"UPDATE strategies SET status=?, stopped_at=? WHERE name=?",
(status, time.time(), name)
)
else:
await db.execute("UPDATE strategies SET status=? WHERE name=?", (status, name))
await db.commit()
async def get_strategy(self, name: str) -> Optional[dict]:
import aiosqlite
async with aiosqlite.connect(self.db_path) as db:
db.row_factory = aiosqlite.Row
async with db.execute("SELECT * FROM strategies WHERE name=?", (name,)) as cur:
row = await cur.fetchone()
return dict(row) if row else None
async def get_all_strategies(self) -> list[dict]:
import aiosqlite
async with aiosqlite.connect(self.db_path) as db:
db.row_factory = aiosqlite.Row
async with db.execute("SELECT * FROM strategies ORDER BY started_at DESC") as cur:
rows = await cur.fetchall()
return [dict(r) for r in rows]
async def get_running_strategies(self) -> list[dict]:
import aiosqlite
async with aiosqlite.connect(self.db_path) as db:
db.row_factory = aiosqlite.Row
async with db.execute(
"SELECT * FROM strategies WHERE status='running' OR status='starting'",
) as cur:
rows = await cur.fetchall()
return [dict(r) for r in rows]
# ------------------------------------------------------------------
# PnL state
# ------------------------------------------------------------------
async def update_pnl_state(
self,
name: str,
realized_pnl: float,
unrealized_pnl: float,
trade_count: int,
positions: Optional[dict] = None,
) -> None:
import aiosqlite
async with aiosqlite.connect(self.db_path) as db:
await db.execute("""
INSERT INTO strategy_state
(name, realized_pnl, unrealized_pnl, trade_count, positions_json, updated_at)
VALUES (?, ?, ?, ?, ?, ?)
ON CONFLICT(name) DO UPDATE SET
realized_pnl=excluded.realized_pnl,
unrealized_pnl=excluded.unrealized_pnl,
trade_count=excluded.trade_count,
positions_json=excluded.positions_json,
updated_at=excluded.updated_at
""", (name, realized_pnl, unrealized_pnl, trade_count,
json.dumps(positions or {}), time.time()))
await db.commit()
async def get_pnl_state(self, name: str) -> Optional[dict]:
import aiosqlite
async with aiosqlite.connect(self.db_path) as db:
db.row_factory = aiosqlite.Row
async with db.execute("SELECT * FROM strategy_state WHERE name=?", (name,)) as cur:
row = await cur.fetchone()
return dict(row) if row else None
# ------------------------------------------------------------------
# Trades
# ------------------------------------------------------------------
async def insert_trade(self, strategy_name: str, trade: dict) -> None:
import aiosqlite
async with aiosqlite.connect(self.db_path) as db:
await db.execute("""
INSERT INTO trades
(strategy_name, instrument, side, quantity, entry_price,
exit_price, entry_time, exit_time, pnl)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
""", (
strategy_name,
trade.get("instrument", ""),
trade.get("side", ""),
trade.get("quantity", 0),
trade.get("entry_price"),
trade.get("exit_price", 0),
trade.get("entry_time"),
trade.get("exit_time", time.time()),
trade.get("pnl", 0),
))
await db.commit()
async def get_trades(self, strategy_name: str, limit: int = 200) -> list[dict]:
import aiosqlite
async with aiosqlite.connect(self.db_path) as db:
db.row_factory = aiosqlite.Row
async with db.execute(
"SELECT * FROM trades WHERE strategy_name=? ORDER BY exit_time DESC LIMIT ?",
(strategy_name, limit),
) as cur:
rows = await cur.fetchall()
return [dict(r) for r in rows]
# ------------------------------------------------------------------
# Backtest runs
# ------------------------------------------------------------------
async def insert_backtest(
self,
strategy_name: str,
from_time: Any,
to_time: Any,
initial_capital: float,
feeds: list[dict],
summary: dict,
statistics: dict,
trades: list[dict],
equity_curve: list[dict],
) -> int:
import aiosqlite
async with aiosqlite.connect(self.db_path) as db:
cur = await db.execute("""
INSERT INTO backtest_runs
(strategy_name, from_time, to_time, initial_capital, feeds_json,
summary_json, statistics_json, trades_json, equity_curve_json)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
""", (
strategy_name,
float(from_time) if from_time else None,
float(to_time) if to_time else None,
initial_capital,
json.dumps(feeds),
json.dumps(summary),
json.dumps(statistics),
json.dumps(trades[:500]), # cap
json.dumps(equity_curve),
))
await db.commit()
return cur.lastrowid
async def get_backtests(self, strategy_name: str, limit: int = 10) -> list[dict]:
import aiosqlite
async with aiosqlite.connect(self.db_path) as db:
db.row_factory = aiosqlite.Row
async with db.execute(
"SELECT * FROM backtest_runs WHERE strategy_name=? ORDER BY ran_at DESC LIMIT ?",
(strategy_name, limit),
) as cur:
rows = await cur.fetchall()
result = []
for r in rows:
d = dict(r)
for key in ("feeds_json", "summary_json", "statistics_json",
"trades_json", "equity_curve_json"):
if d.get(key):
plain = key.replace("_json", "")
d[plain] = json.loads(d.pop(key))
else:
d.pop(key, None)
result.append(d)
return result
# ------------------------------------------------------------------
# Events
# ------------------------------------------------------------------
async def insert_event(self, strategy_name: str, event_type: str, payload: dict) -> None:
import aiosqlite
async with aiosqlite.connect(self.db_path) as db:
await db.execute(
"INSERT INTO strategy_events (strategy_name, event_type, payload_json) VALUES (?, ?, ?)",
(strategy_name, event_type, json.dumps(payload)),
)
await db.commit()
async def get_events(
self,
strategy_name: str,
event_type: Optional[str] = None,
limit: int = 100,
) -> list[dict]:
import aiosqlite
async with aiosqlite.connect(self.db_path) as db:
db.row_factory = aiosqlite.Row
if event_type:
async with db.execute(
"SELECT * FROM strategy_events WHERE strategy_name=? AND event_type=? "
"ORDER BY recorded_at DESC LIMIT ?",
(strategy_name, event_type, limit),
) as cur:
rows = await cur.fetchall()
else:
async with db.execute(
"SELECT * FROM strategy_events WHERE strategy_name=? "
"ORDER BY recorded_at DESC LIMIT ?",
(strategy_name, limit),
) as cur:
rows = await cur.fetchall()
result = []
for r in rows:
d = dict(r)
if d.get("payload_json"):
d["payload"] = json.loads(d.pop("payload_json"))
result.append(d)
return result
# Singleton
_db: Optional[StrategyDB] = None
def get_strategy_db(data_dir: Optional[Path] = None) -> StrategyDB:
global _db
if _db is None:
if data_dir is None:
import os
data_dir = Path(os.environ.get("DATA_DIR", "/app/data"))
_db = StrategyDB(data_dir / "dexorder.db")
return _db
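A minimal async sketch of the persistence layer above; paths and values are illustrative:

    import asyncio
    from pathlib import Path

    async def demo():
        db = get_strategy_db(Path("/app/data"))          # singleton accessor defined above
        await db.initialize()

        await db.upsert_strategy(
            name="mean_reversion",
            status="running",
            allocation=10_000.0,
            paper=True,
            feeds=[{"symbol": "BTC/USDT.BINANCE", "period_seconds": 3600}],
        )
        await db.update_pnl_state("mean_reversion",
                                  realized_pnl=12.5, unrealized_pnl=-3.1, trade_count=4)

        print(await db.get_strategy("mean_reversion"))
        print(await db.get_trades("mean_reversion", limit=10))

    asyncio.run(demo())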

View File

@@ -0,0 +1,152 @@
"""
StrategyEventBridge — receives internal strategy events from subprocesses
and forwards them to the user-facing EventPublisher.
Architecture:
Strategy subprocess ──PUSH──> [IPC socket] ──PULL──> StrategyEventBridge
└─> EventPublisher
├── XPUB (informational)
└── DEALER (critical)
"""
from __future__ import annotations
import asyncio
import logging
import time
from typing import Optional
import zmq
import zmq.asyncio
from .events import StrategyEvent, StrategyEventType, IPC_ENDPOINT
log = logging.getLogger(__name__)
# How long without a heartbeat before a strategy is considered dead (seconds)
HEARTBEAT_TIMEOUT = 60.0
class StrategyEventBridge:
"""
Binds a ZMQ PULL socket and relays strategy events to EventPublisher.
Also monitors heartbeats to detect crashed strategy subprocesses.
"""
def __init__(self, event_publisher, strategy_lifecycle=None):
"""
Args:
event_publisher: dexorder.events.publisher.EventPublisher instance
strategy_lifecycle: StrategyLifecycleManager (optional) for marking crashed strategies
"""
self._publisher = event_publisher
self._lifecycle = strategy_lifecycle
self._ctx: Optional[zmq.asyncio.Context] = None
self._socket: Optional[zmq.asyncio.Socket] = None
self._task: Optional[asyncio.Task] = None
self._heartbeat_task: Optional[asyncio.Task] = None
self._last_heartbeat: dict[str, float] = {} # strategy_name -> timestamp
self._running = False
async def start(self) -> None:
"""Bind PULL socket and start receive loop."""
self._ctx = zmq.asyncio.Context.instance()
self._socket = self._ctx.socket(zmq.PULL)
self._socket.bind(IPC_ENDPOINT)
self._running = True
self._task = asyncio.create_task(self._receive_loop())
self._heartbeat_task = asyncio.create_task(self._heartbeat_monitor())
log.info("StrategyEventBridge started on %s", IPC_ENDPOINT)
async def stop(self) -> None:
"""Stop receive loop and close socket."""
self._running = False
for task in [self._task, self._heartbeat_task]:
if task:
task.cancel()
try:
await task
except asyncio.CancelledError:
pass
if self._socket:
self._socket.close()
log.info("StrategyEventBridge stopped")
def notify_strategy_started(self, strategy_name: str) -> None:
"""Called by lifecycle manager when a strategy subprocess starts."""
self._last_heartbeat[strategy_name] = time.time()
def notify_strategy_stopped(self, strategy_name: str) -> None:
"""Called by lifecycle manager when a strategy is deactivated."""
self._last_heartbeat.pop(strategy_name, None)
async def _receive_loop(self) -> None:
while self._running:
try:
raw = await asyncio.wait_for(self._socket.recv(), timeout=1.0)
event = StrategyEvent.deserialize(raw)
await self._handle_event(event)
except asyncio.TimeoutError:
continue
except asyncio.CancelledError:
raise
except Exception as e:
log.error("Error receiving strategy event: %s", e)
async def _handle_event(self, event: StrategyEvent) -> None:
"""Translate internal StrategyEvent to UserEvent and publish."""
from dexorder.events.types import EventType, Priority
from dexorder.events.publisher import DeliverySpec
name = event.strategy_name
if event.event_type == StrategyEventType.HEARTBEAT:
self._last_heartbeat[name] = time.time()
return # heartbeats are not forwarded to the user
# Map to UserEvent types
type_map = {
StrategyEventType.STARTED: (EventType.STRATEGY_STARTED, Priority.INFORMATIONAL),
StrategyEventType.STOPPED: (EventType.STRATEGY_STOPPED, Priority.INFORMATIONAL),
StrategyEventType.ORDER_SUBMITTED: (EventType.ORDER_PLACED, Priority.NORMAL),
StrategyEventType.ORDER_FILLED: (EventType.ORDER_FILLED, Priority.CRITICAL),
StrategyEventType.POSITION_UPDATE: (EventType.POSITION_UPDATED, Priority.INFORMATIONAL),
StrategyEventType.PNL_UPDATE: (EventType.STRATEGY_LOG, Priority.INFORMATIONAL),
StrategyEventType.ERROR: (EventType.STRATEGY_ERROR, Priority.CRITICAL),
StrategyEventType.LOG: (EventType.STRATEGY_LOG, Priority.INFORMATIONAL),
}
et, priority = type_map.get(event.event_type, (EventType.STRATEGY_LOG, Priority.INFORMATIONAL))
payload = {"strategy_name": name, **event.payload}
delivery = (
DeliverySpec.critical() if priority == Priority.CRITICAL
else DeliverySpec.informational()
)
try:
from dexorder.events.types import UserEvent as UE
await self._publisher.publish(UE(
event_type=et,
payload=payload,
delivery=delivery,
))
except Exception as e:
log.error("Failed to publish strategy event %s: %s", event.event_type, e)
async def _heartbeat_monitor(self) -> None:
"""Periodically check for strategies that stopped sending heartbeats."""
while self._running:
try:
await asyncio.sleep(30)
now = time.time()
for name, last_seen in list(self._last_heartbeat.items()):
if now - last_seen > HEARTBEAT_TIMEOUT:
log.warning("Strategy '%s' missed heartbeat, marking as crashed", name)
self._last_heartbeat.pop(name, None)
if self._lifecycle:
await self._lifecycle.mark_crashed(name)
except asyncio.CancelledError:
raise
except Exception as e:
log.error("Heartbeat monitor error: %s", e)

View File

@@ -0,0 +1,61 @@
"""
Internal strategy event types for subprocess → main-process communication.
Strategy subprocesses push StrategyEvents via ZMQ PUSH socket.
The main process's StrategyEventBridge receives them via PULL and forwards
them to the user-facing EventPublisher (dexorder/events/publisher.py).
"""
from __future__ import annotations
import json
import time
import uuid
from dataclasses import dataclass, field
from enum import IntEnum
class StrategyEventType(IntEnum):
"""Internal event types produced by strategy subprocesses."""
STARTED = 1
STOPPED = 2
HEARTBEAT = 3
ORDER_SUBMITTED = 10
ORDER_FILLED = 11
POSITION_UPDATE = 20
PNL_UPDATE = 21
ERROR = 30
LOG = 31
@dataclass
class StrategyEvent:
"""Internal event envelope sent from strategy subprocess to main process."""
event_type: StrategyEventType
strategy_name: str
payload: dict
timestamp: float = field(default_factory=time.time)
event_id: str = field(default_factory=lambda: str(uuid.uuid4())[:8])
def serialize(self) -> bytes:
return json.dumps({
"event_type": int(self.event_type),
"strategy_name": self.strategy_name,
"payload": self.payload,
"timestamp": self.timestamp,
"event_id": self.event_id,
}).encode()
@classmethod
def deserialize(cls, data: bytes) -> "StrategyEvent":
d = json.loads(data.decode())
return cls(
event_type=StrategyEventType(d["event_type"]),
strategy_name=d["strategy_name"],
payload=d.get("payload", {}),
timestamp=d.get("timestamp", time.time()),
event_id=d.get("event_id", ""),
)
# IPC endpoint used for strategy subprocess → main process communication
IPC_ENDPOINT = "ipc:///tmp/dexorder-strategy-events.sock"

View File

@@ -0,0 +1,322 @@
"""
StrategyLifecycleManager — manages running strategy subprocesses.
Responsibilities:
- Starting strategy subprocesses from git worktrees
- Stopping subprocesses on deactivation
- Persisting state to SQLite for crash recovery
- Registering strategies as LifecycleManager triggers (prevents idle shutdown)
- Enforcing max concurrent strategy limit
"""
from __future__ import annotations
import asyncio
import json
import logging
import threading
import time
from pathlib import Path
from typing import Optional
log = logging.getLogger(__name__)
MAX_CONCURRENT_STRATEGIES = 5
DEFAULT_POLL_INTERVAL = 60 # seconds between bar checks
class StrategyLifecycleManager:
def __init__(self, data_dir: Path, event_bridge=None, lifecycle_manager=None):
self.data_dir = data_dir
self.worktrees_dir = data_dir / "worktrees"
self.configs_dir = data_dir / "strategy_configs"
self._bridge = event_bridge
self._lifecycle = lifecycle_manager # dexorder LifecycleManager
self._runners: dict[str, tuple[threading.Thread, threading.Event]] = {} # name -> (thread, stop_event)
self._db: Optional["StrategyDB"] = None
async def initialize(self) -> None:
"""Initialize DB and prune stale worktrees."""
from dexorder.strategy.db import get_strategy_db
from dexorder.tools.python_tools import get_category_manager
self._db = get_strategy_db(self.data_dir)
await self._db.initialize()
self.worktrees_dir.mkdir(parents=True, exist_ok=True)
self.configs_dir.mkdir(parents=True, exist_ok=True)
# Prune any git worktrees that are no longer registered
try:
mgr = get_category_manager(self.data_dir)
mgr.git.prune_worktrees()
except Exception as e:
log.warning("git worktree prune failed: %s", e)
async def resume_running(self) -> None:
"""On container restart, re-launch strategies that were 'running' at shutdown."""
if self._db is None:
return
try:
running = await self._db.get_running_strategies()
for row in running:
name = row["name"]
log.info("Resuming strategy '%s' after container restart", name)
feeds = json.loads(row.get("feeds_json") or "[]")
await self.activate(
strategy_name=name,
feeds=feeds,
allocation=row.get("allocation", 10_000.0),
paper=bool(row.get("paper", 1)),
_resume=True,
)
except Exception as e:
log.error("Failed to resume strategies: %s", e)
# ------------------------------------------------------------------
# Activate / Deactivate
# ------------------------------------------------------------------
async def activate(
self,
strategy_name: str,
feeds: list[dict],
allocation: float,
paper: bool = True,
git_revision: str = "HEAD",
_resume: bool = False,
) -> dict:
"""
Activate a strategy.
Creates a git worktree at the given revision, writes a config file,
and spawns a subprocess running runner.py.
Returns a dict with status and details.
"""
if strategy_name in self._runners:
return {"error": f"Strategy '{strategy_name}' is already running"}
if len(self._runners) >= MAX_CONCURRENT_STRATEGIES:
return {
"error": f"Maximum concurrent strategies ({MAX_CONCURRENT_STRATEGIES}) reached. "
"Deactivate a running strategy first."
}
# Build worktree
from dexorder.tools.python_tools import get_category_manager, sanitize_name
mgr = get_category_manager(self.data_dir)
safe_name = sanitize_name(strategy_name)
impl_path = self.data_dir / "src" / "strategy" / safe_name / "implementation.py"
if not impl_path.exists():
return {"error": f"Strategy '{strategy_name}' not found at {impl_path}"}
try:
short_hash = mgr.git.head_short_hash() if git_revision == "HEAD" else git_revision[:7]
worktree_name = f"{safe_name}_{short_hash}"
worktree_path = self.worktrees_dir / worktree_name
if not worktree_path.exists():
actual_hash = mgr.git.create_worktree(worktree_path, git_revision)
else:
actual_hash = short_hash
except Exception as e:
return {"error": f"Failed to create git worktree: {e}"}
worktree_impl = worktree_path / "src" / "strategy" / safe_name / "implementation.py"
if not worktree_impl.exists():
# Fall back to live impl (worktree may not include subdirs on first use)
worktree_impl = impl_path
# Feed configs as list of [ticker, period_seconds]
feed_configs = [[f.get("symbol", ""), int(f.get("period_seconds", 3600))] for f in feeds]
# Write runner config to a temp file under DATA_DIR
runner_config = {
"strategy_name": strategy_name,
"impl_path": str(worktree_impl),
"feed_configs": feed_configs,
"allocation": allocation,
"ipc_endpoint": "ipc:///tmp/dexorder-strategy-events.sock",
"data_dir": str(self.data_dir),
"poll_interval": DEFAULT_POLL_INTERVAL,
}
config_file = self.configs_dir / f"{safe_name}.json"
config_file.write_text(json.dumps(runner_config, indent=2))
# Launch strategy in a daemon thread
try:
from dexorder.strategy.runner import run_thread
stop_event = threading.Event()
thread = threading.Thread(
target=run_thread,
args=(runner_config, stop_event),
daemon=True,
name=f"strategy-{safe_name}",
)
thread.start()
except Exception as e:
return {"error": f"Failed to start strategy thread: {e}"}
self._runners[strategy_name] = (thread, stop_event)
# Register as lifecycle trigger
if self._lifecycle:
self._lifecycle.add_trigger(f"strategy:{strategy_name}")
# Notify event bridge
if self._bridge:
self._bridge.notify_strategy_started(strategy_name)
# Persist to DB
if self._db:
await self._db.upsert_strategy(
name=strategy_name,
status="running",
allocation=allocation,
paper=paper,
feeds=feeds,
git_rev=actual_hash,
worktree_path=str(worktree_path),
config=runner_config,
)
log.info("Strategy '%s' activated (thread=%d, rev=%s)", strategy_name, thread.ident, actual_hash)
return {
"status": "activated",
"strategy_name": strategy_name,
"paper": paper,
"allocation": allocation,
"git_revision": actual_hash,
"thread_id": thread.ident,
}
async def deactivate(self, strategy_name: str) -> dict:
"""Stop a running strategy and clean up its worktree."""
entry = self._runners.pop(strategy_name, None)
if entry is None:
return {"error": f"Strategy '{strategy_name}' is not running"}
thread, stop_event = entry
# Signal the runner to stop and wait for the thread to exit
stop_event.set()
await asyncio.get_running_loop().run_in_executor(
None, lambda: thread.join(timeout=15)
)
if thread.is_alive():
log.warning("Strategy '%s' thread did not exit within timeout", strategy_name)
# Remove lifecycle trigger
if self._lifecycle:
self._lifecycle.remove_trigger(f"strategy:{strategy_name}")
# Notify bridge
if self._bridge:
self._bridge.notify_strategy_stopped(strategy_name)
# Get final PnL from DB
final_pnl = 0.0
if self._db:
state = await self._db.get_pnl_state(strategy_name)
if state:
final_pnl = state.get("realized_pnl", 0.0)
await self._db.update_strategy_status(strategy_name, "stopped")
# Clean up worktree
await self._cleanup_worktree(strategy_name)
log.info("Strategy '%s' deactivated, final_pnl=%.4f", strategy_name, final_pnl)
return {
"status": "deactivated",
"strategy_name": strategy_name,
"final_pnl": final_pnl,
}
async def mark_crashed(self, strategy_name: str) -> None:
"""Mark a strategy as crashed (called by heartbeat monitor)."""
entry = self._runners.pop(strategy_name, None)
if entry:
    # Signal the (possibly hung) runner thread to stop, in case it is still alive
    entry[1].set()
if self._lifecycle:
self._lifecycle.remove_trigger(f"strategy:{strategy_name}")
if self._db:
await self._db.update_strategy_status(strategy_name, "error")
log.error("Strategy '%s' marked as crashed (heartbeat timeout)", strategy_name)
async def update_pnl(self, strategy_name: str, payload: dict) -> None:
"""Called by event bridge when a PNL_UPDATE event arrives."""
if self._db:
await self._db.update_pnl_state(
name=strategy_name,
realized_pnl=payload.get("realized_pnl", 0.0),
unrealized_pnl=payload.get("unrealized_pnl", 0.0),
trade_count=payload.get("trade_count", 0),
)
# ------------------------------------------------------------------
# Listing
# ------------------------------------------------------------------
async def list_active(self) -> list[dict]:
"""Return currently running strategies with PnL state."""
if self._db is None:
return []
strategies = await self._db.get_running_strategies()
result = []
for s in strategies:
name = s["name"]
state = await self._db.get_pnl_state(name)
entry = {
"strategy_name": name,
"status": s.get("status", "unknown"),
"paper": bool(s.get("paper", 1)),
"allocation": s.get("allocation", 0),
"git_revision": s.get("git_rev"),
"started_at": s.get("started_at"),
"feeds": json.loads(s.get("feeds_json") or "[]"),
"realized_pnl": state.get("realized_pnl", 0.0) if state else 0.0,
"unrealized_pnl": state.get("unrealized_pnl", 0.0) if state else 0.0,
"trade_count": state.get("trade_count", 0) if state else 0,
}
result.append(entry)
return result
# ------------------------------------------------------------------
# Shutdown
# ------------------------------------------------------------------
async def shutdown(self) -> None:
"""Stop all running strategies on container shutdown."""
names = list(self._runners.keys())
for name in names:
await self.deactivate(name)
# ------------------------------------------------------------------
# Internal
# ------------------------------------------------------------------
async def _cleanup_worktree(self, strategy_name: str) -> None:
if self._db is None:
return
try:
row = await self._db.get_strategy(strategy_name)
wt = row.get("worktree_path") if row else None
if wt:
from dexorder.tools.python_tools import get_category_manager
mgr = get_category_manager(self.data_dir)
mgr.git.remove_worktree(Path(wt))
except Exception as e:
log.warning("Worktree cleanup failed for '%s': %s", strategy_name, e)
# Singleton
_lifecycle_manager: Optional[StrategyLifecycleManager] = None
def get_strategy_lifecycle(data_dir: Optional[Path] = None) -> StrategyLifecycleManager:
global _lifecycle_manager
if _lifecycle_manager is None:
if data_dir is None:
import os
data_dir = Path(os.environ.get("DATA_DIR", "/app/data"))
_lifecycle_manager = StrategyLifecycleManager(data_dir)
return _lifecycle_manager
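# ---------------------------------------------------------------------------
# Illustrative usage sketch (assumption: not part of the original module).
# Shows how a caller could drive the singleton to activate and then deactivate
# a paper strategy. The strategy name, feed, and data_dir are hypothetical.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    async def _demo() -> None:
        lifecycle = get_strategy_lifecycle(Path("/tmp/dexorder-demo"))
        result = await lifecycle.activate(
            strategy_name="Example RSI Strategy",  # hypothetical name
            feeds=[{"symbol": "BTC/USDT.BINANCE", "period_seconds": 3600}],
            allocation=5_000.0,
            paper=True,
        )
        print(result)
        if result.get("status") == "activated":
            print(await lifecycle.deactivate("Example RSI Strategy"))

    asyncio.run(_demo())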

View File

@@ -0,0 +1,196 @@
"""
Lightweight paper trading account for strategy subprocesses.
Simulates order execution at bar-close prices without requiring a Nautilus TradingNode.
Tracks positions, PnL, and trade history. All amounts are in the quote currency.
"""
from __future__ import annotations
import logging
import time
from dataclasses import dataclass, field
from typing import Optional
log = logging.getLogger(__name__)
@dataclass
class Position:
"""An open position."""
instrument: str
side: str # "long" or "short"
quantity: float
entry_price: float
entry_time: float # Unix timestamp
@dataclass
class Trade:
"""A completed round-trip trade."""
instrument: str
side: str # direction of the entry
quantity: float
entry_price: float
exit_price: float
entry_time: float
exit_time: float
pnl: float
class PaperAccount:
"""
Simulates a cash paper account for a single strategy.
Positions are opened/closed by calling buy(), sell(), and flatten().
Fills execute at the provided price (e.g. bar close).
"""
def __init__(self, initial_capital: float, feed_key: Optional[str] = None):
self.initial_capital = initial_capital
self.balance = initial_capital
self._positions: dict[str, Position] = {} # feed_key → Position
self._trades: list[Trade] = []
self._default_feed_key = feed_key
# ------------------------------------------------------------------
# Order API (mirrors PandasStrategy's order API)
# ------------------------------------------------------------------
def buy(self, quantity: float, price: float, feed_key: Optional[str] = None) -> None:
"""Open a long or close a short at price."""
fk = feed_key or self._default_feed_key or "default"
existing = self._positions.get(fk)
if existing and existing.side == "short":
# Close short
pnl = (existing.entry_price - price) * existing.quantity
self._close_position(fk, price, pnl)
elif not existing:
# Open long
cost = price * quantity
if cost > self.balance:
quantity = self.balance / price # size down to available capital
if quantity > 0:
self._positions[fk] = Position(
instrument=fk, side="long", quantity=quantity,
entry_price=price, entry_time=time.time(),
)
log.debug("Paper BUY %.6f @ %.2f (%s)", quantity, price, fk)
def sell(self, quantity: float, price: float, feed_key: Optional[str] = None) -> None:
"""Open a short or close a long at price."""
fk = feed_key or self._default_feed_key or "default"
existing = self._positions.get(fk)
if existing and existing.side == "long":
# Close long
pnl = (price - existing.entry_price) * existing.quantity
self._close_position(fk, price, pnl)
elif not existing:
# Open short (using margin — simplified: require 2x capital)
cost = price * quantity * 2
if cost > self.balance:
quantity = self.balance / (price * 2)
if quantity > 0:
self._positions[fk] = Position(
instrument=fk, side="short", quantity=quantity,
entry_price=price, entry_time=time.time(),
)
log.debug("Paper SELL %.6f @ %.2f (%s)", quantity, price, fk)
def flatten(self, price: float, feed_key: Optional[str] = None) -> None:
"""Close any open position at price."""
if feed_key:
keys = [feed_key]
else:
keys = list(self._positions.keys())
for fk in keys:
pos = self._positions.get(fk)
if pos is None:
continue
if pos.side == "long":
pnl = (price - pos.entry_price) * pos.quantity
else:
pnl = (pos.entry_price - price) * pos.quantity
self._close_position(fk, price, pnl)
# ------------------------------------------------------------------
# Reporting
# ------------------------------------------------------------------
def unrealized_pnl(self, current_prices: dict[str, float]) -> float:
"""Compute unrealized PnL using current prices."""
total = 0.0
for fk, pos in self._positions.items():
price = current_prices.get(fk)
if price is None:
continue
if pos.side == "long":
total += (price - pos.entry_price) * pos.quantity
else:
total += (pos.entry_price - price) * pos.quantity
return total
def realized_pnl(self) -> float:
return sum(t.pnl for t in self._trades)
def total_pnl(self, current_prices: dict[str, float] | None = None) -> float:
rpnl = self.realized_pnl()
upnl = self.unrealized_pnl(current_prices) if current_prices else 0.0
return rpnl + upnl
def trade_count(self) -> int:
return len(self._trades)
def win_rate(self) -> float:
if not self._trades:
return 0.0
wins = sum(1 for t in self._trades if t.pnl > 0)
return wins / len(self._trades)
def positions(self) -> dict[str, dict]:
return {
fk: {
"side": p.side,
"quantity": p.quantity,
"entry_price": p.entry_price,
}
for fk, p in self._positions.items()
}
def recent_trades(self, n: int = 50) -> list[dict]:
return [
{
"instrument": t.instrument,
"side": t.side,
"quantity": round(t.quantity, 8),
"entry_price": round(t.entry_price, 8),
"exit_price": round(t.exit_price, 8),
"entry_time": t.entry_time,
"exit_time": t.exit_time,
"pnl": round(t.pnl, 6),
}
for t in self._trades[-n:]
]
# ------------------------------------------------------------------
# Internal
# ------------------------------------------------------------------
def _close_position(self, fk: str, price: float, pnl: float) -> None:
pos = self._positions.pop(fk, None)
if pos is None:
return
self.balance += pnl
self._trades.append(Trade(
instrument=fk,
side=pos.side,
quantity=pos.quantity,
entry_price=pos.entry_price,
exit_price=price,
entry_time=pos.entry_time,
exit_time=time.time(),
pnl=pnl,
))
log.debug("Paper trade closed: pnl=%.4f balance=%.2f (%s)", pnl, self.balance, fk)

View File

@@ -0,0 +1,395 @@
#!/usr/bin/env python3
"""
Strategy runner.
Loads a PandasStrategy from a git worktree path, subscribes to live bar data
(polling DataAPI), runs the paper trading loop, and pushes events to the main
MCP process via ZMQ PUSH. Runs in a daemon thread via run_thread() (preferred)
or as a standalone subprocess (see main() below).
Usage:
python -m dexorder.strategy.runner --config <json_config_path>
Config JSON:
{
"strategy_name": "My RSI Strategy",
"impl_path": "/app/data/worktrees/my_rsi_strategy_abc1234/strategy/my_rsi_strategy/implementation.py",
"feed_configs": [["BTC/USDT.BINANCE", 3600]],
"allocation": 5000.0,
"ipc_endpoint": "ipc:///tmp/dexorder-strategy-events.sock",
"data_dir": "/app/data",
"poll_interval": 60
}
"""
from __future__ import annotations
import argparse
import asyncio
import json
import logging
import os
import sys
import threading
import time
from pathlib import Path
# Ensure the directory containing the dexorder package (repo root or worktree
# root, depending on where this file was loaded from) is importable.
sys.path.insert(0, str(Path(__file__).resolve().parent.parent.parent))
log = logging.getLogger(__name__)
class StrategyRunner:
"""Runs a PandasStrategy in paper trading mode using DataAPI polling."""
def __init__(self, config: dict, stop_event: threading.Event | None = None):
self.strategy_name = config["strategy_name"]
self.impl_path = Path(config["impl_path"])
self.feed_configs: list[tuple[str, int]] = [
(f[0], int(f[1])) for f in config["feed_configs"]
]
self.allocation = float(config.get("allocation", 10_000.0))
self.ipc_endpoint = config.get("ipc_endpoint", "ipc:///tmp/dexorder-strategy-events.sock")
self.data_dir = Path(config.get("data_dir", "/app/data"))
self.poll_interval = int(config.get("poll_interval", 60)) # seconds
self._stop_event = stop_event or threading.Event()
self._running = False
self._push_socket = None
self._strategy = None
self._paper: "PaperAccount | None" = None
self._last_timestamps: dict[str, int] = {} # feed_key -> last seen timestamp_ns
async def run(self) -> None:
"""Main async entry point."""
self._setup_zmq()
await self._push_event("STARTED", {})
try:
await self._setup_strategy()
await self._trading_loop()
except asyncio.CancelledError:
pass
except Exception as e:
log.exception("Strategy runner fatal error")
await self._push_event("ERROR", {"message": str(e)})
finally:
await self._push_event("STOPPED", {
"pnl": self._paper.realized_pnl() if self._paper else 0.0,
"trade_count": self._paper.trade_count() if self._paper else 0,
})
self._cleanup_zmq()
# ------------------------------------------------------------------
# Setup
# ------------------------------------------------------------------
def _setup_zmq(self) -> None:
import zmq
ctx = zmq.Context.instance()
self._push_socket = ctx.socket(zmq.PUSH)
self._push_socket.connect(self.ipc_endpoint)
log.info("Connected PUSH socket to %s", self.ipc_endpoint)
def _cleanup_zmq(self) -> None:
if self._push_socket:
self._push_socket.close()
async def _setup_strategy(self) -> None:
from dexorder.nautilus.backtest_runner import _load_strategy_class, _setup_custom_indicators
from dexorder.nautilus.pandas_strategy import PandasStrategyConfig, make_feed_key
from dexorder.strategy.paper_account import PaperAccount
# Register custom indicators
try:
_setup_custom_indicators(self.data_dir)
except Exception as e:
log.warning("Custom indicator setup failed: %s", e)
# Load strategy class from worktree impl path
strategy_class = _load_strategy_class(self.impl_path)
log.info("Loaded strategy class: %s", strategy_class.__name__)
feed_keys = tuple(make_feed_key(t, p) for t, p in self.feed_configs)
config = PandasStrategyConfig(
strategy_id=f"{strategy_class.__name__}-PAPER",
feed_keys=feed_keys,
initial_capital=self.allocation,
)
self._strategy = strategy_class(config=config)
self._paper = PaperAccount(self.allocation, feed_keys[0] if feed_keys else None)
# Wire paper account into strategy's order methods
self._wire_paper_account(feed_keys)
log.info("Strategy '%s' initialized with %d feed(s)", self.strategy_name, len(feed_keys))
def _wire_paper_account(self, feed_keys: tuple) -> None:
"""Replace strategy's order methods with paper account calls."""
paper = self._paper
from dexorder.nautilus.pandas_strategy import make_feed_key
def paper_buy(quantity, feed_key=None):
fk = feed_key or (feed_keys[0] if feed_keys else "default")
# Get current close price from last seen bars
price = self._current_price(fk)
if price:
paper.buy(quantity, price, fk)
asyncio.create_task(self._push_event("ORDER_FILLED", {
"side": "buy", "quantity": quantity,
"price": price, "feed_key": fk,
"pnl": paper.realized_pnl(),
}))
def paper_sell(quantity, feed_key=None):
fk = feed_key or (feed_keys[0] if feed_keys else "default")
price = self._current_price(fk)
if price:
paper.sell(quantity, price, fk)
asyncio.create_task(self._push_event("ORDER_FILLED", {
"side": "sell", "quantity": quantity,
"price": price, "feed_key": fk,
"pnl": paper.realized_pnl(),
}))
def paper_flatten(feed_key=None):
if feed_key:
fk_list = [feed_key]
else:
fk_list = list(feed_keys)
for fk in fk_list:
price = self._current_price(fk)
if price:
paper.flatten(price, fk)
self._strategy.buy = paper_buy
self._strategy.sell = paper_sell
self._strategy.flatten = paper_flatten
# ------------------------------------------------------------------
# Trading loop
# ------------------------------------------------------------------
async def _trading_loop(self) -> None:
"""Poll DataAPI for new bars and call strategy.evaluate() on each update."""
import pandas as pd
from dexorder.api import get_api
from dexorder.nautilus.pandas_strategy import make_feed_key
api = get_api()
accumulated: dict[str, list[dict]] = {
make_feed_key(t, p): [] for t, p in self.feed_configs
}
self._current_prices: dict[str, float] = {}
heartbeat_task = asyncio.create_task(self._heartbeat_loop())
self._running = True
try:
while self._running and not self._stop_event.is_set():
now = int(time.time())
updated_any = False
for ticker, period_seconds in self.feed_configs:
fk = make_feed_key(ticker, period_seconds)
last_ts_ns = self._last_timestamps.get(fk, 0)
# First poll looks back 7 days of history; later polls resume from the last seen bar
from_time = (now - 7 * 24 * 3600) if last_ts_ns == 0 else (last_ts_ns // 1_000_000_000)
try:
df = await api.data.historical_ohlc(
ticker=ticker,
period_seconds=period_seconds,
start_time=from_time,
end_time=now,
extra_columns=["volume", "buy_vol", "sell_vol",
"open_time", "high_time", "low_time", "close_time",
"open_interest"],
)
except Exception as e:
log.warning("OHLC fetch failed for %s: %s", fk, e)
continue
if df.empty:
continue
# Find new bars
ts_col = "timestamp" if "timestamp" in df.columns else df.columns[0]
new_bars = df[df[ts_col] > last_ts_ns] if last_ts_ns else df
for _, row in new_bars.iterrows():
ts_ns = int(row.get(ts_col, 0))
entry = {
"timestamp": ts_ns,
"open": float(row.get("open", 0)),
"high": float(row.get("high", 0)),
"low": float(row.get("low", 0)),
"close": float(row.get("close", 0)),
"volume": float(row.get("volume", 0)),
"buy_vol": float(row.get("buy_vol", 0)) if "buy_vol" in row else None,
"sell_vol": float(row.get("sell_vol", 0)) if "sell_vol" in row else None,
"open_interest": float(row.get("open_interest", 0)) if "open_interest" in row else None,
}
accumulated[fk].append(entry)
self._last_timestamps[fk] = max(self._last_timestamps.get(fk, 0), ts_ns)
self._current_prices[fk] = entry["close"]
updated_any = True
if updated_any:
# Build DataFrames and call evaluate
dfs = {fk: pd.DataFrame(rows) for fk, rows in accumulated.items() if rows}
try:
self._strategy.evaluate(dfs)
except Exception as e:
log.error("evaluate() error: %s", e)
await self._push_event("ERROR", {"message": f"evaluate() error: {e}"})
# Push PnL update
rpnl = self._paper.realized_pnl() if self._paper else 0.0
upnl = self._paper.unrealized_pnl(self._current_prices) if self._paper else 0.0
await self._push_event("PNL_UPDATE", {
"realized_pnl": rpnl,
"unrealized_pnl": upnl,
"total_pnl": rpnl + upnl,
"trade_count": self._paper.trade_count() if self._paper else 0,
})
# Sleep in 1s increments so stop_event is checked promptly
for _ in range(self.poll_interval):
if self._stop_event.is_set():
self._running = False
break
await asyncio.sleep(1)
finally:
heartbeat_task.cancel()
try:
await heartbeat_task
except asyncio.CancelledError:
pass
async def _heartbeat_loop(self) -> None:
while True:
await asyncio.sleep(10)
await self._push_event("HEARTBEAT", {})
def _current_price(self, feed_key: str) -> float | None:
return getattr(self, "_current_prices", {}).get(feed_key)
# ------------------------------------------------------------------
# Event publishing
# ------------------------------------------------------------------
async def _push_event(self, event_type: str, payload: dict) -> None:
from dexorder.strategy.events import StrategyEvent, StrategyEventType
type_map = {
"STARTED": StrategyEventType.STARTED,
"STOPPED": StrategyEventType.STOPPED,
"HEARTBEAT": StrategyEventType.HEARTBEAT,
"ORDER_FILLED": StrategyEventType.ORDER_FILLED,
"POSITION_UPDATE": StrategyEventType.POSITION_UPDATE,
"PNL_UPDATE": StrategyEventType.PNL_UPDATE,
"ERROR": StrategyEventType.ERROR,
"LOG": StrategyEventType.LOG,
}
et = type_map.get(event_type, StrategyEventType.LOG)
event = StrategyEvent(
event_type=et,
strategy_name=self.strategy_name,
payload=payload,
)
try:
if self._push_socket:
self._push_socket.send(event.serialize(), flags=1) # NOBLOCK
except Exception as e:
log.debug("Failed to push event %s: %s", event_type, e)
def _init_api() -> None:
"""Initialize thread-local API from environment config. Non-fatal on error."""
try:
import yaml
config_path = os.environ.get("CONFIG_PATH", "/app/config/config.yaml")
secrets_path = os.environ.get("SECRETS_PATH", "/app/config/secrets.yaml")
config_data, secrets_data = {}, {}
if Path(config_path).exists():
with open(config_path) as f:
config_data = yaml.safe_load(f) or {}
if Path(secrets_path).exists():
with open(secrets_path) as f:
secrets_data = yaml.safe_load(f) or {}
data_cfg = config_data.get("data", {})
iceberg_cfg = data_cfg.get("iceberg", {})
relay_cfg = data_cfg.get("relay", {})
from dexorder.api import set_api, API
from dexorder.impl.charting_api_impl import ChartingAPIImpl
from dexorder.impl.data_api_impl import DataAPIImpl
data_api = DataAPIImpl(
iceberg_catalog_uri=iceberg_cfg.get("catalog_uri", "http://iceberg-catalog:8181"),
relay_endpoint=relay_cfg.get("endpoint", "tcp://relay:5559"),
notification_endpoint=relay_cfg.get("notification_endpoint", "tcp://relay:5558"),
namespace=iceberg_cfg.get("namespace", "trading"),
s3_endpoint=iceberg_cfg.get("s3_endpoint") or secrets_data.get("s3_endpoint"),
s3_access_key=iceberg_cfg.get("s3_access_key") or secrets_data.get("s3_access_key"),
s3_secret_key=iceberg_cfg.get("s3_secret_key") or secrets_data.get("s3_secret_key"),
)
set_api(API(charting=ChartingAPIImpl(), data=data_api))
except Exception as e:
log.warning("API initialization failed: %s", e)
def run_thread(config: dict, stop_event: threading.Event) -> None:
"""
Entry point for running a strategy in a daemon thread.
Initializes a thread-local API, creates a StrategyRunner with the given
stop_event, and runs the async trading loop until stop_event is set.
"""
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s %(levelname)s [%(name)s] %(message)s",
)
_init_api()
runner = StrategyRunner(config, stop_event=stop_event)
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
try:
loop.run_until_complete(runner.run())
finally:
loop.close()
def main():
"""Subprocess entry point (backward compatibility)."""
import signal
parser = argparse.ArgumentParser(description="Dexorder strategy subprocess runner")
parser.add_argument("--config", required=True, help="Path to JSON config file")
args = parser.parse_args()
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s %(levelname)s [%(name)s] %(message)s",
)
with open(args.config) as f:
config = json.load(f)
stop_event = threading.Event()
def _shutdown(signum, frame):
log.info("Received signal %d, stopping runner", signum)
stop_event.set()
signal.signal(signal.SIGTERM, _shutdown)
signal.signal(signal.SIGINT, _shutdown)
run_thread(config, stop_event)
if __name__ == "__main__":
main()

View File

@@ -1,15 +1,14 @@
""" """
activate_strategy / deactivate_strategy — start and stop live or paper trading. activate_strategy / deactivate_strategy / list_active_strategies
paper=True (default): forward paper trading — strategy runs on live data with paper=True (default): forward paper trading — strategy runs on live data with
simulated fills. No API keys required. simulated fills via PaperAccount.
paper=False: live trading — real order execution via user's exchange API keys, paper=False: live trading — not yet implemented (requires secrets vault).
retrieved from the user secrets vault. Currently raises
NotImplementedError until the vault is implemented.
Full live-data feed streaming for forward testing is TBD (requires a live bar Each activated strategy runs in its own subprocess from a git worktree,
source). This module establishes the interface and stubs the runtime loop. ensuring the production version is isolated from edits in the working tree.
Events (fills, PnL updates, errors) flow via ZMQ PUSH/PULL to EventPublisher.
""" """
import json import json
@@ -18,10 +17,6 @@ from typing import Any
log = logging.getLogger(__name__) log = logging.getLogger(__name__)
# Registry of active strategies: {strategy_name → runtime state dict}
# In a future implementation this will hold live strategy runners.
_active_strategies: dict[str, dict] = {}
async def activate_strategy( async def activate_strategy(
strategy_name: str, strategy_name: str,
@@ -34,16 +29,14 @@ async def activate_strategy(
Args: Args:
strategy_name: Display name as saved via python_write("strategy", ...) strategy_name: Display name as saved via python_write("strategy", ...)
feeds: List of feed dicts, e.g. [{"symbol": "BTC/USDT.BINANCE", "period_seconds": 3600}] feeds: List of feed dicts: [{"symbol": "BTC/USDT.BINANCE", "period_seconds": 3600}]
allocation: Capital allocated in quote currency (e.g. 5000.0 USDT) allocation: Capital allocated in quote currency (e.g. 5000.0 USDT)
paper: True = paper/simulated fills (default); False = live execution paper: True = paper/simulated fills (default); False = live (not yet implemented)
Returns: Returns:
list[TextContent] with JSON: list[TextContent] with JSON:
{"status": "activated", "strategy_name": str, "paper": bool, "allocation": float} {"status": "activated", "strategy_name": str, "paper": bool, "allocation": float,
"git_revision": str, "pid": int}
On error:
{"error": str}
""" """
from mcp.types import TextContent from mcp.types import TextContent
@@ -51,87 +44,45 @@ async def activate_strategy(
log.error("activate_strategy '%s': %s", strategy_name, msg) log.error("activate_strategy '%s': %s", strategy_name, msg)
return [TextContent(type="text", text=json.dumps({"error": msg}))] return [TextContent(type="text", text=json.dumps({"error": msg}))]
if strategy_name in _active_strategies: if not paper:
return _err( return _err(
f"Strategy '{strategy_name}' is already active. " "Live trading (paper=False) requires the user secrets vault, "
"Call deactivate_strategy first." "which is not yet implemented. Use paper=True for paper forward testing."
) )
if not paper:
# Live execution requires the user secrets vault for API keys.
# The vault is not yet implemented.
try:
from dexorder.secrets_vault import SecretsVault
_vault = SecretsVault()
_vault.get_secret("__probe__") # will raise NotImplementedError
except NotImplementedError:
return _err(
"Live trading (paper=False) requires the user secrets vault, "
"which is not yet implemented. Use paper=True for paper forward testing."
)
# Validate feeds
if not feeds: if not feeds:
return _err("feeds list is empty") return _err("feeds list is empty")
parsed_feeds: list[tuple[str, int]] = []
for f in feeds: for f in feeds:
sym = f.get("symbol", "") if not f.get("symbol"):
ps = f.get("period_seconds", 3600)
if not sym:
return _err(f"Feed entry missing 'symbol': {f}") return _err(f"Feed entry missing 'symbol': {f}")
parsed_feeds.append((sym, int(ps)))
# TODO: Full implementation — start a live/paper trading loop: try:
# 1. Load strategy class from category files from dexorder.strategy.lifecycle import get_strategy_lifecycle
# 2. Set up custom indicators via _setup_custom_indicators() lifecycle = get_strategy_lifecycle()
# 3. Subscribe to live bar stream for each feed result = await lifecycle.activate(
# 4. Initialize paper account (Nautilus SimulatedExchange) or live account strategy_name=strategy_name,
# 5. Run strategy event loop (on_bar → evaluate → submit orders) feeds=feeds,
# This requires a live data feed adapter (TBD). allocation=allocation,
paper=paper,
)
except Exception as exc:
log.exception("activate_strategy: lifecycle activation failed")
return _err(f"Activation failed: {exc}")
log.info( if "error" in result:
"activate_strategy: registering '%s' (paper=%s, allocation=%.2f) — " return _err(result["error"])
"live feed loop is TBD",
strategy_name, paper, allocation,
)
_active_strategies[strategy_name] = { return [TextContent(type="text", text=json.dumps(result))]
"strategy_name": strategy_name,
"feeds": [{"symbol": t, "period_seconds": p} for t, p in parsed_feeds],
"allocation": allocation,
"paper": paper,
"status": "registered",
"pnl": 0.0,
}
payload = {
"status": "activated",
"strategy_name": strategy_name,
"paper": paper,
"allocation": allocation,
"feeds": [{"symbol": t, "period_seconds": p} for t, p in parsed_feeds],
"note": (
"Strategy registered. Live data feed streaming is not yet implemented — "
"forward trading will begin when the live feed adapter is available."
),
}
return [TextContent(type="text", text=json.dumps(payload))]
async def deactivate_strategy(strategy_name: str) -> list: async def deactivate_strategy(strategy_name: str) -> list:
""" """
Deactivate a running strategy and return its final P&L summary. Deactivate a running strategy and return its final P&L summary.
Args:
strategy_name: Display name of the active strategy
Returns: Returns:
list[TextContent] with JSON: list[TextContent] with JSON:
{"status": "deactivated", "strategy_name": str, "final_pnl": float} {"status": "deactivated", "strategy_name": str, "final_pnl": float}
On error:
{"error": str}
""" """
from mcp.types import TextContent from mcp.types import TextContent
@@ -139,35 +90,36 @@ async def deactivate_strategy(strategy_name: str) -> list:
log.error("deactivate_strategy '%s': %s", strategy_name, msg) log.error("deactivate_strategy '%s': %s", strategy_name, msg)
return [TextContent(type="text", text=json.dumps({"error": msg}))] return [TextContent(type="text", text=json.dumps({"error": msg}))]
if strategy_name not in _active_strategies: try:
return _err(f"Strategy '{strategy_name}' is not active") from dexorder.strategy.lifecycle import get_strategy_lifecycle
lifecycle = get_strategy_lifecycle()
result = await lifecycle.deactivate(strategy_name)
except Exception as exc:
log.exception("deactivate_strategy: failed")
return _err(f"Deactivation failed: {exc}")
state = _active_strategies.pop(strategy_name) if "error" in result:
return _err(result["error"])
# TODO: Stop the live feed loop and collect final P&L from the running engine. return [TextContent(type="text", text=json.dumps(result))]
final_pnl = state.get("pnl", 0.0)
log.info("deactivate_strategy: stopped '%s', final_pnl=%.4f", strategy_name, final_pnl)
payload = {
"status": "deactivated",
"strategy_name": strategy_name,
"final_pnl": final_pnl,
}
return [TextContent(type="text", text=json.dumps(payload))]
async def list_active_strategies() -> list: async def list_active_strategies() -> list:
""" """
Return a list of currently active strategies and their status. Return a list of currently active strategies with PnL state.
Returns: Returns:
list[TextContent] with JSON: list[TextContent] with JSON:
{"active_strategies": [{strategy_name, paper, allocation, feeds, pnl}, ...]} {"active_strategies": [{strategy_name, paper, allocation, feeds, realized_pnl, ...}]}
""" """
from mcp.types import TextContent from mcp.types import TextContent
payload = { try:
"active_strategies": list(_active_strategies.values()), from dexorder.strategy.lifecycle import get_strategy_lifecycle
} lifecycle = get_strategy_lifecycle()
return [TextContent(type="text", text=json.dumps(payload))] active = await lifecycle.list_active()
except Exception as exc:
log.exception("list_active_strategies: failed")
active = []
return [TextContent(type="text", text=json.dumps({"active_strategies": active}))]

View File

@@ -15,7 +15,11 @@ from typing import Any
 log = logging.getLogger(__name__)
 # All OHLC+ columns to request from the DataAPI
-_OHLC_EXTRA_COLUMNS = ["volume", "buy_vol", "sell_vol", "open_interest"]
+_OHLC_EXTRA_COLUMNS = [
+    "volume", "buy_vol", "sell_vol",
+    "open_time", "high_time", "low_time", "close_time",
+    "open_interest",
+]
 async def backtest_strategy(
@@ -153,11 +157,11 @@ async def backtest_strategy(
     # --- 7. Return results ---
     payload = {
         "strategy_name": strategy_name,
         "feeds": [{"symbol": t, "period_seconds": p} for t, p in parsed_feeds],
         "initial_capital": initial_capital,
         "paper": paper,
         "total_candles": total_candles,
-        **metrics,
+        **metrics,  # keys: summary, statistics, trades, equity_curve
     }
     return [TextContent(type="text", text=json.dumps(payload))]

View File

@@ -2,9 +2,10 @@
""" """
Indicator harness — tests a custom indicator against synthetic OHLC data. Indicator harness — tests a custom indicator against synthetic OHLC data.
Runs in a subprocess so the indicator code is isolated from the MCP server process. Can be called in-process (preferred) via run() or as a subprocess for backward
compatibility.
Usage: python indicator_harness.py <impl_path> <metadata_path> Usage (subprocess): python indicator_harness.py <impl_path> <metadata_path>
Outputs JSON to stdout: Outputs JSON to stdout:
{ {
@@ -21,7 +22,7 @@ import traceback
import types import types
from pathlib import Path from pathlib import Path
# Ensure dexorder package is importable (same as research_harness.py) # Ensure dexorder package is importable when run as a subprocess
sys.path.insert(0, str(Path(__file__).resolve().parent.parent.parent)) sys.path.insert(0, str(Path(__file__).resolve().parent.parent.parent))
@@ -84,13 +85,15 @@ def summarize(result, n: int) -> str:
return f"Unexpected return type: {type(result).__name__}" return f"Unexpected return type: {type(result).__name__}"
def main(): def run(impl_path: Path, metadata_path: Path) -> dict:
if len(sys.argv) < 3: """
print(json.dumps({"success": False, "error": "Usage: indicator_harness.py <impl_path> <metadata_path>"})) Run an indicator against synthetic OHLC data and return results.
sys.exit(1)
impl_path = sys.argv[1] Returns:
metadata_path = sys.argv[2] dict with success, output, error fields
"""
impl_path = Path(impl_path)
metadata_path = Path(metadata_path)
# --- Load metadata --- # --- Load metadata ---
input_series = ["close"] input_series = ["close"]
@@ -107,34 +110,32 @@ def main():
# bare value (legacy) # bare value (legacy)
parameters[pname] = pinfo parameters[pname] = pinfo
except Exception as e: except Exception as e:
print(json.dumps({"success": False, "error": f"Failed to read metadata: {e}"})) return {"success": False, "error": f"Failed to read metadata: {e}"}
sys.exit(0)
# --- Generate synthetic data --- # --- Generate synthetic data ---
try: try:
import numpy # noqa: F401 — verify numpy available import numpy # noqa: F401 — verify numpy available
import pandas as pd import pandas as pd
except ImportError as e: except ImportError as e:
print(json.dumps({"success": False, "error": f"Missing required package: {e}"})) return {"success": False, "error": f"Missing required package: {e}"}
sys.exit(0)
df = make_synthetic_ohlcv(n=200) df = make_synthetic_ohlcv(n=200)
n = len(df) n = len(df)
# --- Load implementation --- # --- Load implementation ---
# Clear from sys.modules first so edits are picked up
module_name = f"_dexorder_indicator_{impl_path.parent.name}"
sys.modules.pop(module_name, None)
try: try:
spec = importlib.util.spec_from_file_location("_indicator_impl", impl_path) spec = importlib.util.spec_from_file_location(module_name, impl_path)
module = importlib.util.module_from_spec(spec) module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(module) # type: ignore[union-attr] spec.loader.exec_module(module) # type: ignore[union-attr]
except Exception: except Exception:
tb = traceback.format_exc() tb = traceback.format_exc()
print(json.dumps({"success": False, "error": f"Import failed:\n{tb}"})) return {"success": False, "error": f"Import failed:\n{tb}"}
sys.exit(0)
# --- Find the indicator function --- # --- Find the indicator function ---
# Prefer a function whose name matches the sanitized directory name, fn_name = impl_path.parent.name.lower()
# fall back to the first public function in the module.
fn_name = os.path.basename(os.path.dirname(impl_path)).lower()
fn = getattr(module, fn_name, None) fn = getattr(module, fn_name, None)
if fn is None: if fn is None:
candidates = [ candidates = [
@@ -144,15 +145,13 @@ def main():
fn = candidates[0] if candidates else None fn = candidates[0] if candidates else None
if fn is None: if fn is None:
print(json.dumps({"success": False, "error": "No callable function found in implementation.py"})) return {"success": False, "error": "No callable function found in implementation.py"}
sys.exit(0)
# --- Build positional args from input_series --- # --- Build positional args from input_series ---
args = [] args = []
for col in input_series: for col in input_series:
if col not in df.columns: if col not in df.columns:
print(json.dumps({"success": False, "error": f"input_series '{col}' not in synthetic df columns {list(df.columns)}"})) return {"success": False, "error": f"input_series '{col}' not in synthetic df columns {list(df.columns)}"}
sys.exit(0)
args.append(df[col]) args.append(df[col])
# --- Execute --- # --- Execute ---
@@ -160,22 +159,29 @@ def main():
result = fn(*args, **parameters) result = fn(*args, **parameters)
except Exception: except Exception:
tb = traceback.format_exc() tb = traceback.format_exc()
print(json.dumps({"success": False, "error": f"Execution failed:\n{tb}"})) return {"success": False, "error": f"Execution failed:\n{tb}"}
sys.exit(0)
# --- Validate output type --- # --- Validate output type ---
if not isinstance(result, (pd.Series, pd.DataFrame)): if not isinstance(result, (pd.Series, pd.DataFrame)):
print(json.dumps({ return {
"success": False, "success": False,
"error": ( "error": (
f"Indicator must return pd.Series or pd.DataFrame, " f"Indicator must return pd.Series or pd.DataFrame, "
f"got {type(result).__name__}. " f"got {type(result).__name__}. "
"Wrap the output if using pandas-ta internally." "Wrap the output if using pandas-ta internally."
), ),
})) }
sys.exit(0)
print(json.dumps({"success": True, "output": summarize(result, n)})) return {"success": True, "output": summarize(result, n)}
def main():
if len(sys.argv) < 3:
print(json.dumps({"success": False, "error": "Usage: indicator_harness.py <impl_path> <metadata_path>"}))
sys.exit(1)
result = run(Path(sys.argv[1]), Path(sys.argv[2]))
print(json.dumps(result))
if __name__ == "__main__": if __name__ == "__main__":

View File

@@ -18,11 +18,13 @@ After write/edit operations, a category-specific test harness runs to validate
the code and capture errors/output for agent feedback. the code and capture errors/output for agent feedback.
""" """
import concurrent.futures
import json import json
import logging import logging
import re import re
import subprocess import subprocess
import sys import sys
import traceback
from dataclasses import dataclass, asdict from dataclasses import dataclass, asdict
from enum import Enum from enum import Enum
from pathlib import Path from pathlib import Path
@@ -30,16 +32,37 @@ from typing import Any, Optional
log = logging.getLogger(__name__) log = logging.getLogger(__name__)
# Path to the harness scripts (written to disk, not inline)
_RESEARCH_HARNESS = Path(__file__).parent / "research_harness.py"
_INDICATOR_HARNESS = Path(__file__).parent / "indicator_harness.py"
# Import conda manager for package installation def _run_inprocess(fn, *args, timeout: int) -> dict:
"""
Run fn(*args) in a one-shot thread and return its result dict.
Uses a thread so the calling coroutine is not blocked and the calling
process does not fork a new Python interpreter. All already-loaded
libraries (numpy, pandas, matplotlib, etc.) are shared with the thread.
On timeout returns a dict with _timeout=True. On unexpected exception
returns a dict with error=True and the traceback in stderr.
"""
with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
future = executor.submit(fn, *args)
try:
return future.result(timeout=timeout)
except concurrent.futures.TimeoutError:
return {"_timeout": True, "error": True,
"stdout": "", "stderr": "", "images": []}
except Exception:
return {"error": True, "stdout": "",
"stderr": traceback.format_exc(), "images": []}
# Import conda manager for package installation and tracking
try: try:
from dexorder.conda_manager import install_packages from dexorder.conda_manager import install_packages, cleanup_extra_packages
except ImportError: except ImportError:
log.warning("conda_manager not available - package installation disabled") log.warning("conda_manager not available - package installation disabled")
install_packages = None install_packages = None
cleanup_extra_packages = None
# ============================================================================= # =============================================================================
@@ -289,6 +312,49 @@ class GitManager:
except subprocess.CalledProcessError as e: except subprocess.CalledProcessError as e:
raise RuntimeError(e.stderr.strip()) from e raise RuntimeError(e.stderr.strip()) from e
def head_short_hash(self) -> str:
"""Return the short hash of HEAD, or 'unknown' on error."""
try:
result = self._run("rev-parse", "--short", "HEAD")
return result.stdout.strip()
except Exception:
return "unknown"
def create_worktree(self, worktree_path: Path, revision: str = "HEAD") -> str:
"""
Create a git worktree at worktree_path pinned to revision.
Returns the short hash of the checked-out commit.
"""
worktree_path.parent.mkdir(parents=True, exist_ok=True)
try:
self._run("worktree", "add", "--detach", str(worktree_path), revision)
# Get short hash of the worktree's HEAD
result = subprocess.run(
["git", "rev-parse", "--short", "HEAD"],
cwd=str(worktree_path),
capture_output=True,
text=True,
check=True,
)
return result.stdout.strip()
except subprocess.CalledProcessError as e:
raise RuntimeError(f"git worktree add failed: {e.stderr.strip()}") from e
def remove_worktree(self, worktree_path: Path) -> None:
"""Remove a git worktree, silently ignoring errors if it no longer exists."""
try:
self._run("worktree", "remove", "--force", str(worktree_path), check=False)
except Exception as e:
log.warning("git worktree remove failed (non-fatal): %s", e)
def prune_worktrees(self) -> None:
"""Prune stale worktree references."""
try:
self._run("worktree", "prune", check=False)
except Exception:
pass
# ============================================================================= # =============================================================================
# Custom Indicator Setup # Custom Indicator Setup
@@ -733,7 +799,7 @@ class CategoryFileManager:
conda_packages = metadata.get("conda_packages", []) conda_packages = metadata.get("conda_packages", [])
if conda_packages: if conda_packages:
log.info(f"Installing packages for validation: {conda_packages}") log.info(f"Installing packages for validation: {conda_packages}")
install_result = install_packages(conda_packages) install_result = install_packages(conda_packages, data_dir=self.data_dir)
if install_result.get("success"): if install_result.get("success"):
packages_installed = install_result.get("installed", []) packages_installed = install_result.get("installed", [])
if packages_installed: if packages_installed:
@@ -761,48 +827,49 @@ class CategoryFileManager:
def _validate_strategy(self, impl_path: Path) -> dict[str, Any]: def _validate_strategy(self, impl_path: Path) -> dict[str, Any]:
""" """
Validate a strategy implementation. Validate a strategy by running it against synthetic OHLC data.
Runs basic syntax check and imports. Runs strategy_harness.py in-process via a thread. Catches import errors,
runtime errors in evaluate(), and wrong class hierarchy — not just syntax.
""" """
try: meta_path = impl_path.parent / "metadata.json"
result = subprocess.run( return self._execute_strategy(impl_path.parent, timeout=45)
[sys.executable, "-m", "py_compile", str(impl_path)],
capture_output=True,
text=True,
timeout=10,
)
if result.returncode == 0: def _execute_strategy(self, item_dir: Path, timeout: int = 45) -> dict[str, Any]:
return { """
"success": True, Run a strategy against synthetic OHLC data in-process via a thread.
"output": "Strategy syntax valid",
} Returns:
else: dict with success, output (human-readable summary), trade_count, error
return { """
"success": False, impl_path = item_dir / "implementation.py"
"output": result.stderr, meta_path = item_dir / "metadata.json"
"error": "Syntax error in strategy",
} if not impl_path.exists():
except subprocess.TimeoutExpired: return {"success": False, "error": "implementation.py not found"}
return {"success": False, "error": "Validation timeout"} if not meta_path.exists():
except Exception as e: return {"success": False, "error": "metadata.json not found"}
return {"success": False, "error": f"Validation failed: {e}"}
from dexorder.tools.strategy_harness import run as _strategy_run
result = _run_inprocess(_strategy_run, impl_path, meta_path, timeout=timeout)
if result.get("_timeout"):
return {"success": False, "error": f"Strategy test timed out after {timeout}s"}
return result
def _validate_indicator(self, impl_path: Path) -> dict[str, Any]: def _validate_indicator(self, impl_path: Path) -> dict[str, Any]:
""" """
Validate an indicator by running it against synthetic OHLC data. Validate an indicator by running it against synthetic OHLC data.
Uses indicator_harness.py in a subprocess so the indicator code is Runs indicator_harness.py in-process via a thread. Catches import errors,
isolated from the MCP server process. Catches import errors, runtime runtime errors, and wrong return types — not just syntax.
errors, and wrong return types — not just syntax.
""" """
meta_path = impl_path.parent / "metadata.json" meta_path = impl_path.parent / "metadata.json"
return self._execute_indicator(impl_path.parent, timeout=30) return self._execute_indicator(impl_path.parent, timeout=30)
def _execute_indicator(self, item_dir: Path, timeout: int = 30) -> dict[str, Any]: def _execute_indicator(self, item_dir: Path, timeout: int = 30) -> dict[str, Any]:
""" """
Run an indicator against synthetic OHLC data via indicator_harness.py. Run an indicator against synthetic OHLC data in-process via a thread.
Returns: Returns:
dict with success, output (human-readable summary), error dict with success, output (human-readable summary), error
@@ -815,77 +882,22 @@ class CategoryFileManager:
if not meta_path.exists(): if not meta_path.exists():
return {"success": False, "error": "metadata.json not found"} return {"success": False, "error": "metadata.json not found"}
try: from dexorder.tools.indicator_harness import run as _indicator_run
result = subprocess.run( result = _run_inprocess(_indicator_run, impl_path, meta_path, timeout=timeout)
[sys.executable, str(_INDICATOR_HARNESS), str(impl_path), str(meta_path)],
capture_output=True, if result.get("_timeout"):
text=True,
timeout=timeout,
cwd=str(item_dir),
)
except subprocess.TimeoutExpired:
return {"success": False, "error": f"Indicator test timed out after {timeout}s"} return {"success": False, "error": f"Indicator test timed out after {timeout}s"}
except Exception as e: return result
return {"success": False, "error": f"Harness launch failed: {e}"}
if result.returncode != 0: def _run_research_harness(self, impl_path: Path, item_dir: Path, timeout: int = 300) -> dict[str, Any]:
return {
"success": False,
"error": f"Harness process failed:\n{result.stderr}",
}
try:
data = json.loads(result.stdout)
except json.JSONDecodeError:
return {
"success": False,
"error": f"Harness produced invalid JSON:\n{result.stdout[:500]}",
}
return data
def _run_research_harness(self, impl_path: Path, item_dir: Path, timeout: int = 30) -> dict[str, Any]:
""" """
Run a research script via the on-disk harness and return parsed results. Run a research script in-process via a thread and return captured results.
The harness (research_harness.py) handles API initialization, stdout/stderr
capture, matplotlib figure capture, and outputs JSON to stdout.
Returns: Returns:
dict with stdout, stderr, images, error fields — or an error dict. dict with stdout, stderr, images, error fields — or an error dict.
""" """
try: from dexorder.tools.research_harness import run as _research_run
result = subprocess.run( return _run_inprocess(_research_run, impl_path, item_dir, timeout=timeout)
[sys.executable, str(_RESEARCH_HARNESS), str(impl_path)],
capture_output=True,
text=True,
timeout=timeout,
cwd=str(item_dir),
)
if result.returncode == 0:
try:
return json.loads(result.stdout)
except json.JSONDecodeError:
return {
"stdout": result.stdout,
"stderr": result.stderr,
"images": [],
"error": True,
}
else:
# Harness itself failed (import error, bad args, etc.)
return {
"stdout": "",
"stderr": result.stderr,
"images": [],
"error": True,
}
except subprocess.TimeoutExpired:
return {"stdout": "", "stderr": "", "images": [], "error": True,
"_timeout": True}
except Exception as e:
return {"stdout": "", "stderr": str(e), "images": [], "error": True}
def _validate_research(self, impl_path: Path, item_dir: Path) -> dict[str, Any]: def _validate_research(self, impl_path: Path, item_dir: Path) -> dict[str, Any]:
""" """
@@ -893,7 +905,7 @@ class CategoryFileManager:
Runs the script via the harness and captures output + pyplot images. Runs the script via the harness and captures output + pyplot images.
""" """
data = self._run_research_harness(impl_path, item_dir, timeout=30) data = self._run_research_harness(impl_path, item_dir, timeout=300)
if data.get("_timeout"): if data.get("_timeout"):
return {"success": False, "error": "Research script timeout"} return {"success": False, "error": "Research script timeout"}
@@ -983,6 +995,48 @@ class CategoryFileManager:
return {"content": content} return {"content": content}
def delete(self, category: str, name: str) -> dict[str, Any]:
"""
Delete a category script directory and commit the removal to git.
Args:
category: Category name (strategy, indicator, research)
name: Display name of the item to delete
Returns:
dict with:
- success: bool
- category: str
- name: str
- revision: str - git commit hash of the deletion commit
- error: str (if any)
"""
import shutil
try:
cat = Category(category)
except ValueError:
return {
"success": False,
"error": f"Invalid category '{category}'. Must be one of: {', '.join(c.value for c in Category)}"
}
item_dir = get_category_path(self.src_dir, cat, name)
if not item_dir.exists():
return {"success": False, "error": f"{category} '{name}' not found"}
try:
shutil.rmtree(item_dir)
log.info(f"Deleted {cat.value}: {item_dir}")
except Exception as e:
return {"success": False, "error": f"Failed to delete: {e}"}
commit_hash = self.git.commit(f"delete({category}): {name}")
result: dict[str, Any] = {"success": True, "category": category, "name": name}
if commit_hash:
result["revision"] = commit_hash
return result
def git_log( def git_log(
self, self,
category: Optional[str] = None, category: Optional[str] = None,

View File

@@ -1,13 +1,12 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
""" """
Research script harness - runs implementation.py in a subprocess with API Research script harness - runs implementation.py with API initialization,
initialization, stdout/stderr capture, and matplotlib figure capture. stdout/stderr capture, and matplotlib figure capture.
This file is written to disk and invoked by python_tools.py rather than Can be called in-process (preferred) via run() or as a subprocess for backward
being passed inline via `python -c`, so the harness code is inspectable and compatibility.
not regenerated on every call.
Usage: Usage (subprocess):
python -m dexorder.tools.research_harness <implementation_path> python -m dexorder.tools.research_harness <implementation_path>
Output (JSON to stdout): Output (JSON to stdout):
@@ -19,73 +18,148 @@ Output (JSON to stdout):
} }
""" """
import sys
import io import io
import os import os
import base64 import base64
import json import json
import sys
import traceback
from pathlib import Path from pathlib import Path
# Non-interactive matplotlib backend (must be set before importing pyplot) # Non-interactive matplotlib backend (must be set before importing pyplot).
# Idempotent — safe to call multiple times.
import matplotlib import matplotlib
matplotlib.use('Agg') matplotlib.use('Agg')
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
# Ensure dexorder package is importable # Ensure dexorder package is importable when run as a subprocess
sys.path.insert(0, str(Path(__file__).resolve().parent.parent.parent)) sys.path.insert(0, str(Path(__file__).resolve().parent.parent.parent))
# ---------------------------------------------------------------------------
# Initialize API from config files so research scripts can call get_api()
# ---------------------------------------------------------------------------
try:
import yaml
config_path = os.environ.get("CONFIG_PATH", "/app/config/config.yaml") def run(impl_path: Path, item_dir: Path) -> dict:
secrets_path = os.environ.get("SECRETS_PATH", "/app/config/secrets.yaml") """
Run a research script in-process and return captured results.
config_data = {} Creates a fresh DataAPIImpl per call (thread-safe: API stored in thread-local
secrets_data = {} via set_api() so the global API is not overwritten).
if Path(config_path).exists():
with open(config_path) as f:
config_data = yaml.safe_load(f) or {}
if Path(secrets_path).exists():
with open(secrets_path) as f:
secrets_data = yaml.safe_load(f) or {}
data_cfg = config_data.get("data", {}) Returns:
iceberg_cfg = data_cfg.get("iceberg", {}) dict with stdout, stderr, images, error fields
relay_cfg = data_cfg.get("relay", {}) """
impl_path = Path(impl_path)
from dexorder.api import set_api, API if not impl_path.exists():
from dexorder.impl.charting_api_impl import ChartingAPIImpl return {
from dexorder.impl.data_api_impl import DataAPIImpl "stdout": "",
"stderr": f"Implementation file not found: {impl_path}",
"images": [],
"error": True,
}
_data_api = DataAPIImpl( # ---------------------------------------------------------------------------
iceberg_catalog_uri=iceberg_cfg.get("catalog_uri", "http://iceberg-catalog:8181"), # Initialize a fresh API instance for this execution (thread-local)
relay_endpoint=relay_cfg.get("endpoint", "tcp://relay:5559"), # ---------------------------------------------------------------------------
notification_endpoint=relay_cfg.get("notification_endpoint", "tcp://relay:5558"), try:
namespace=iceberg_cfg.get("namespace", "trading"), import yaml
s3_endpoint=iceberg_cfg.get("s3_endpoint") or secrets_data.get("s3_endpoint"),
s3_access_key=iceberg_cfg.get("s3_access_key") or secrets_data.get("s3_access_key"),
s3_secret_key=iceberg_cfg.get("s3_secret_key") or secrets_data.get("s3_secret_key"),
)
# NOTE: We intentionally do NOT call asyncio.run(_data_api.start()) here.
# DataAPIImpl.historical_ohlc() auto-starts on first use, which ensures the
# ZMQ context and notification listener are created inside the user's own
# asyncio.run() event loop — avoiding cross-loop lifecycle issues.
set_api(API(charting=ChartingAPIImpl(), data=_data_api))
except Exception as e:
print(f"WARNING: API initialization failed: {e}", file=sys.stderr)
# --------------------------------------------------------------------------- config_path = os.environ.get("CONFIG_PATH", "/app/config/config.yaml")
# Register custom indicators so research scripts can use df.ta.my_indicator() secrets_path = os.environ.get("SECRETS_PATH", "/app/config/secrets.yaml")
# ---------------------------------------------------------------------------
try: config_data = {}
from dexorder.tools.python_tools import setup_custom_indicators secrets_data = {}
_data_dir = Path(os.environ.get("DATA_DIR", "/app/data")) if Path(config_path).exists():
setup_custom_indicators(_data_dir) with open(config_path) as f:
except Exception as e: config_data = yaml.safe_load(f) or {}
print(f"WARNING: Custom indicator registration failed: {e}", file=sys.stderr) if Path(secrets_path).exists():
with open(secrets_path) as f:
secrets_data = yaml.safe_load(f) or {}
data_cfg = config_data.get("data", {})
iceberg_cfg = data_cfg.get("iceberg", {})
relay_cfg = data_cfg.get("relay", {})
from dexorder.api import set_api, API
from dexorder.impl.charting_api_impl import ChartingAPIImpl
from dexorder.impl.data_api_impl import DataAPIImpl
_data_api = DataAPIImpl(
iceberg_catalog_uri=iceberg_cfg.get("catalog_uri", "http://iceberg-catalog:8181"),
relay_endpoint=relay_cfg.get("endpoint", "tcp://relay:5559"),
notification_endpoint=relay_cfg.get("notification_endpoint", "tcp://relay:5558"),
namespace=iceberg_cfg.get("namespace", "trading"),
s3_endpoint=iceberg_cfg.get("s3_endpoint") or secrets_data.get("s3_endpoint"),
s3_access_key=iceberg_cfg.get("s3_access_key") or secrets_data.get("s3_access_key"),
s3_secret_key=iceberg_cfg.get("s3_secret_key") or secrets_data.get("s3_secret_key"),
s3_region=iceberg_cfg.get("s3_region") or secrets_data.get("s3_region"),
request_timeout=240.0,
)
# NOTE: We intentionally do NOT call asyncio.run(_data_api.start()) here.
# DataAPIImpl.historical_ohlc() auto-starts on first use, which ensures the
# ZMQ context and notification listener are created inside the user's own
# asyncio.run() event loop — avoiding cross-loop lifecycle issues.
# In a harness thread, set_api() stores to thread-local (not the global).
set_api(API(charting=ChartingAPIImpl(), data=_data_api))
except Exception as e:
# Non-fatal — script may not use the API
sys.stderr.write(f"WARNING: API initialization failed: {e}\n")
# ---------------------------------------------------------------------------
# Register custom indicators
# ---------------------------------------------------------------------------
try:
from dexorder.tools.python_tools import setup_custom_indicators
_data_dir = Path(os.environ.get("DATA_DIR", "/app/data"))
setup_custom_indicators(_data_dir)
except Exception as e:
sys.stderr.write(f"WARNING: Custom indicator registration failed: {e}\n")
# ---------------------------------------------------------------------------
# Execute user script with captured stdout/stderr
# ---------------------------------------------------------------------------
stdout_buf = io.StringIO()
stderr_buf = io.StringIO()
error_occurred = False
old_stdout, old_stderr = sys.stdout, sys.stderr
old_cwd = os.getcwd()
sys.stdout = stdout_buf
sys.stderr = stderr_buf
try:
os.chdir(impl_path.parent)
exec(compile(impl_path.read_text(), str(impl_path), 'exec'), {})
except Exception as e:
print(f"ERROR: {e}", file=sys.stderr)
traceback.print_exc(file=sys.stderr)
error_occurred = True
finally:
sys.stdout = old_stdout
sys.stderr = old_stderr
os.chdir(old_cwd)
stdout_output = stdout_buf.getvalue()
stderr_output = stderr_buf.getvalue()
# ---------------------------------------------------------------------------
# Capture matplotlib figures
# ---------------------------------------------------------------------------
images = []
if not error_occurred:
for fig_num in plt.get_fignums():
fig = plt.figure(fig_num)
buf = io.BytesIO()
fig.savefig(buf, format='png', dpi=100, bbox_inches='tight')
buf.seek(0)
images.append({"format": "png", "data": base64.b64encode(buf.read()).decode('utf-8')})
buf.close()
plt.close('all')
return {
"stdout": stdout_output,
"stderr": stderr_output,
"images": images,
"error": error_occurred,
}
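# Example of calling run() in-process (a sketch — the paths are illustrative):
#
#     result = run(Path("/app/data/research/my_script.py"), Path("/app/data/research"))
#     if result["error"]:
#         print(result["stderr"])
#     for img in result["images"]:  # each img is {"format": "png", "data": <base64 str>}
#         ...  # decode img["data"] and save or display it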
def main():
@@ -94,55 +168,8 @@ def main():
sys.exit(2)
impl_path = Path(sys.argv[1])
if not impl_path.exists():
print(json.dumps({
"stdout": "",
"stderr": f"Implementation file not found: {impl_path}",
"images": [],
"error": True,
}))
sys.exit(0)
# Capture stdout and stderr
old_stdout = sys.stdout
old_stderr = sys.stderr
sys.stdout = io.StringIO()
sys.stderr = io.StringIO()
error_occurred = False
try:
exec(compile(impl_path.read_text(), str(impl_path), 'exec'), {})
except Exception as e:
print(f"ERROR: {e}", file=sys.stderr)
import traceback
traceback.print_exc(file=sys.stderr)
error_occurred = True
# Restore stdout/stderr
stdout_output = sys.stdout.getvalue()
stderr_output = sys.stderr.getvalue()
sys.stdout = old_stdout
sys.stderr = old_stderr
# Capture all matplotlib figures as base64 PNGs
images = []
for fig_num in plt.get_fignums():
fig = plt.figure(fig_num)
buf = io.BytesIO()
fig.savefig(buf, format='png', dpi=100, bbox_inches='tight')
buf.seek(0)
img_b64 = base64.b64encode(buf.read()).decode('utf-8')
images.append({"format": "png", "data": img_b64})
buf.close()
plt.close('all')
# Output results as JSON to real stdout
result = {
"stdout": stdout_output,
"stderr": stderr_output,
"images": images,
"error": error_occurred,
}
item_dir = impl_path.parent
result = run(impl_path, item_dir)
print(json.dumps(result))

View File

@@ -0,0 +1,228 @@
#!/usr/bin/env python3
"""
Strategy harness — validates a PandasStrategy against synthetic OHLC data.
Can be called in-process (preferred) via run() or as a subprocess for backward
compatibility.
Usage (subprocess): python strategy_harness.py <impl_path> <metadata_path>
Outputs JSON to stdout:
{
"success": bool,
"output": str, # human-readable summary on success
"trade_count": int, # number of trades executed in the mini-backtest
"error": str | null # error message / traceback if failed
}
"""
import json
import os
import sys
import traceback
from pathlib import Path
# Ensure dexorder package is importable when run as a subprocess
sys.path.insert(0, str(Path(__file__).resolve().parent.parent.parent))
# ---------------------------------------------------------------------------
# Synthetic OHLCV data — 100 deterministic bars, no network required
# ---------------------------------------------------------------------------
def make_synthetic_ohlcv(n: int = 100):
import numpy as np
import pandas as pd
rng = np.random.default_rng(42)
returns = rng.normal(0, 0.015, n)
closes = 40_000.0 * np.cumprod(1.0 + returns)
opens = np.empty(n)
opens[0] = closes[0]
opens[1:] = closes[:-1]
noise = np.abs(rng.normal(0, 0.005, n))
highs = np.maximum(opens, closes) * (1.0 + noise)
lows = np.minimum(opens, closes) * (1.0 - noise)
volumes = rng.uniform(1e6, 1e8, n)
buy_vols = volumes * rng.uniform(0.4, 0.6, n)
now_ns = 1_700_000_000_000_000_000 # arbitrary epoch in nanoseconds
step_ns = 3_600_000_000_000 # 1 hour in nanoseconds
timestamps = [now_ns + i * step_ns for i in range(n)]
return pd.DataFrame({
"timestamp": timestamps,
"open": opens,
"high": highs,
"low": lows,
"close": closes,
"volume": volumes,
"buy_vol": buy_vols,
"sell_vol": volumes - buy_vols,
"open_interest": rng.uniform(1e8, 1e9, n),
})
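# Quick sanity check of the synthetic frame (illustrative only):
#
#     df = make_synthetic_ohlcv(100)
#     assert len(df) == 100
#     assert (df["high"] >= df[["open", "close"]].max(axis=1)).all()
#     assert (df["low"] <= df[["open", "close"]].min(axis=1)).all()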
def run(impl_path: Path, metadata_path: Path) -> dict:
"""
Validate a strategy against synthetic OHLC data and return results.
Returns:
dict with success, output, trade_count, error fields
"""
impl_path = Path(impl_path)
metadata_path = Path(metadata_path)
# --- Load metadata (feeds, parameters) ---
data_feeds: list[dict] = []
parameters: dict = {}
try:
with open(metadata_path) as f:
meta = json.load(f)
data_feeds = meta.get("data_feeds") or []
param_schema = meta.get("parameters") or {}
for pname, pinfo in param_schema.items():
if isinstance(pinfo, dict) and "default" in pinfo:
parameters[pname] = pinfo["default"]
elif not isinstance(pinfo, dict):
parameters[pname] = pinfo
except Exception as e:
return {"success": False, "output": "", "trade_count": 0, "error": f"Failed to read metadata: {e}"}
# --- Build synthetic feed keys ---
if data_feeds:
feed_configs = [(f.get("symbol", "BTC/USDT.SYNTH"), int(f.get("period_seconds", 3600)))
for f in data_feeds]
else:
feed_configs = [("BTC/USDT.SYNTH", 3600)]
# --- Register custom indicators ---
try:
from dexorder.tools.python_tools import setup_custom_indicators
data_dir = Path(os.environ.get("DATA_DIR", "/app/data"))
setup_custom_indicators(data_dir)
except Exception:
pass
# --- Load strategy class ---
try:
from dexorder.nautilus.backtest_runner import _load_strategy_class
strategy_class = _load_strategy_class(impl_path)
except Exception:
tb = traceback.format_exc()
return {"success": False, "output": "", "trade_count": 0, "error": f"Strategy load failed:\n{tb}"}
# --- Run a minimal backtest with synthetic data ---
try:
import pandas as pd
from dexorder.nautilus.pandas_strategy import PandasStrategyConfig, make_feed_key
from dexorder.nautilus.backtest_runner import _setup_custom_indicators
try:
data_dir = Path(os.environ.get("DATA_DIR", "/app/data"))
_setup_custom_indicators(data_dir)
except Exception:
pass
# Build one synthetic DataFrame per feed
feed_dfs: dict[str, pd.DataFrame] = {}
for ticker, period_seconds in feed_configs:
fk = make_feed_key(ticker, period_seconds)
feed_dfs[fk] = make_synthetic_ohlcv(100)
feed_keys = tuple(make_feed_key(t, p) for t, p in feed_configs)
config = PandasStrategyConfig(
strategy_id=f"{strategy_class.__name__}-HARNESS",
feed_keys=feed_keys,
initial_capital=10_000.0,
)
strat = strategy_class(config=config)
for pname, pval in parameters.items():
if hasattr(strat, pname):
setattr(strat, pname, pval)
# Replay bars: accumulate rows and call evaluate()
buy_count = 0
sell_count = 0
evaluate_errors: list[str] = []
rows_by_feed: dict[str, list] = {fk: [] for fk in feed_keys}
for i in range(len(next(iter(feed_dfs.values())))):
for fk, df in feed_dfs.items():
row = df.iloc[i].to_dict()
rows_by_feed[fk].append(row)
current_dfs = {k: pd.DataFrame(v) for k, v in rows_by_feed.items()}
_orig_buy = strat.buy
_orig_sell = strat.sell
_orig_flatten = strat.flatten
class _BuyCounter:
def __call__(inner_self, *a, **kw):
nonlocal buy_count
buy_count += 1
class _SellCounter:
def __call__(inner_self, *a, **kw):
nonlocal sell_count
sell_count += 1
strat.buy = _BuyCounter()
strat.sell = _SellCounter()
strat.flatten = lambda *a, **kw: None
try:
strat.evaluate(current_dfs)
except Exception as e:
evaluate_errors.append(f"Bar {i}: {e}")
if len(evaluate_errors) > 3:
break
finally:
strat.buy = _orig_buy
strat.sell = _orig_sell
strat.flatten = _orig_flatten
if evaluate_errors and len(evaluate_errors) > 3:
break
if evaluate_errors:
return {
"success": False,
"output": "",
"trade_count": 0,
"error": "evaluate() raised errors:\n" + "\n".join(evaluate_errors[:3]),
}
trade_count = buy_count + sell_count
n_bars = len(next(iter(feed_dfs.values())))
n_feeds = len(feed_dfs)
output = (
f"Strategy validated OK: {n_bars} bars × {n_feeds} feed(s), "
f"buy_signals={buy_count}, sell_signals={sell_count}"
)
return {"success": True, "output": output, "trade_count": trade_count, "error": None}
except Exception:
tb = traceback.format_exc()
return {"success": False, "output": "", "trade_count": 0, "error": f"Harness execution failed:\n{tb}"}
def main():
if len(sys.argv) < 3:
print(json.dumps({
"success": False,
"output": "",
"trade_count": 0,
"error": "Usage: strategy_harness.py <impl_path> <metadata_path>",
}))
sys.exit(1)
result = run(Path(sys.argv[1]), Path(sys.argv[2]))
print(json.dumps(result))
if __name__ == "__main__":
main()
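# Example invocation in subprocess mode (a sketch — file names are illustrative):
#
#     python strategy_harness.py strategies/my_strategy.py strategies/my_strategy.json
#
# where the metadata JSON could look like (keys follow what run() reads above):
#
#     {"data_feeds": [{"symbol": "BTC/USDT", "period_seconds": 3600}],
#      "parameters": {"fast": {"default": 12}, "slow": {"default": 26}}}
#
# The process prints a single JSON object with success / output / trade_count / error.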

View File

@@ -51,3 +51,4 @@ dependencies:
- uvicorn>=0.27.0
- sse-starlette>=1.6.0
- nautilus_trader>=1.200.0
- aiosqlite>=0.19.0

View File

@@ -11,6 +11,7 @@ Brings together:
import asyncio
import contextlib
import json
import logging
import os
import signal
@@ -32,7 +33,7 @@ from starlette.routing import Route, Mount
from dexorder import EventPublisher, start_lifecycle_manager, get_lifecycle_manager
from dexorder.api import set_api, API
from dexorder.conda_manager import sync_packages, install_packages, cleanup_extra_packages
from dexorder.events import EventType, UserEvent, DeliverySpec
from dexorder.impl.charting_api_impl import ChartingAPIImpl
from dexorder.impl.data_api_impl import DataAPIImpl
@@ -41,6 +42,8 @@ from dexorder.tools.workspace_tools import get_workspace_store
from dexorder.tools.evaluate_indicator import evaluate_indicator
from dexorder.tools.backtest_strategy import backtest_strategy
from dexorder.tools.activate_strategy import activate_strategy, deactivate_strategy, list_active_strategies
from dexorder.strategy.event_bridge import StrategyEventBridge
from dexorder.strategy.lifecycle import get_strategy_lifecycle
# =============================================================================
# Global Data Directory
@@ -59,19 +62,34 @@ def get_data_dir() -> Path:
# =============================================================================
# Category Types Helpers
# =============================================================================
def _type_store_name(category: str) -> str:
return f"{category}_types"
def _type_store_key(category: str, name: str) -> str:
sanitized = sanitize_name(name).lower()
return f"custom_{sanitized}" if category == "indicator" else sanitized
def _build_type_entry(category: str, meta: dict) -> dict:
"""Build a {category}_types workspace entry from a metadata dict."""
name = meta.get('name', '')
key = _type_store_key(category, name)
now = int(time.time())
entry = {
'key': key,
'display_name': name,
'description': meta.get('description', ''),
'metadata': {},
'created_at': now,
'modified_at': now,
}
if category == "indicator":
entry['pandas_ta_name'] = key
entry['metadata'] = {
'display_name': name,
'parameters': meta.get('parameters') or {},
'input_series': meta.get('input_series') or ['close'],
@@ -79,31 +97,89 @@ def _build_indicator_type_entry(meta: dict) -> dict:
'pane': meta.get('pane', 'separate'),
'filled_areas': meta.get('filled_areas') or [],
'bands': meta.get('bands') or [],
}
elif category == "strategy":
entry['metadata'] = {
'data_feeds': meta.get('data_feeds') or [],
'parameters': meta.get('parameters') or {},
}
# research: metadata stays empty (no fields beyond base)
return entry
def _upsert_type(workspace_store, category_manager, category: str, name: str) -> None:
"""Read category metadata from disk and upsert into the {category}_types workspace store."""
read_result = category_manager.read(category, name)
if not read_result.get('exists') or not read_result.get('metadata'):
return
entry = _build_type_entry(category, read_result['metadata'])
key = entry['key']
store = _type_store_name(category)
# Preserve original created_at if already present
existing = workspace_store.read(store)
existing_types = (existing.get('data') or {}).get('types') or {}
if key in existing_types:
entry['created_at'] = existing_types[key].get('created_at', entry['created_at'])
workspace_store.patch(store, [{'op': 'add', 'path': f'/types/{key}', 'value': entry}])
logging.info(f"Upserted {store}/{key} for '{name}'")
def _remove_type(workspace_store, category: str, name: str) -> None:
"""Remove a category item from the {category}_types workspace store."""
key = _type_store_key(category, name)
store = _type_store_name(category)
try:
workspace_store.patch(store, [{'op': 'remove', 'path': f'/types/{key}'}])
logging.info(f"Removed {store}/{key} for '{name}'")
except Exception:
pass # entry may not exist; that's fine
if category == "indicator":
_remove_indicator_instances(workspace_store, key)
def _remove_indicator_instances(workspace_store, pandas_ta_name: str) -> None:
"""Remove all instances of a custom indicator from the indicators workspace store."""
existing = workspace_store.read('indicators')
instances = (existing.get('data') or {}).get('indicators') or {}
to_remove = [inst_id for inst_id, inst in instances.items()
if inst.get('pandas_ta_name') == pandas_ta_name]
if not to_remove:
return
patches = [{'op': 'remove', 'path': f'/indicators/{inst_id}'} for inst_id in to_remove]
try:
workspace_store.patch('indicators', patches)
logging.info(f"Removed {len(to_remove)} instance(s) of {pandas_ta_name} from indicators store")
except Exception:
logging.warning(f"Failed to remove indicator instances for {pandas_ta_name}", exc_info=True)
def _populate_types_from_disk(workspace_store, category_manager, category: str) -> None:
"""Scan existing category items and add any missing entries to the {category}_types store."""
store = _type_store_name(category)
existing = workspace_store.read(store)
existing_types = (existing.get('data') or {}).get('types') or {}
items = category_manager.list_items(category).get('items', [])
added = 0
for item in items:
item_name = item.get('name', '')
if not item_name:
continue
key = _type_store_key(category, item_name)
if key not in existing_types:
_upsert_type(workspace_store, category_manager, category, item_name)
added += 1
if added > 0:
logging.info(f"Populated {added} {category} type(s) from disk into {store}")
def _get_env_yml() -> Optional[Path]:
"""Return the path to environment.yml if it exists alongside main.py."""
p = Path(__file__).parent / "environment.yml"
return p if p.exists() else None
def _populate_indicator_types_from_disk(workspace_store, category_manager) -> None:
@@ -226,8 +302,9 @@ def create_mcp_server(config: Config, event_publisher: EventPublisher) -> Server
category_manager = get_category_manager(config.data_dir)
logging.info(f"Category manager initialized at {config.data_dir}")
# Populate {category}_types stores from existing items on disk (migration/startup sync)
for _cat in ("indicator", "strategy", "research"):
_populate_types_from_disk(workspace_store, category_manager, _cat)
@server.list_resources()
async def list_resources():
@@ -503,6 +580,25 @@ def create_mcp_server(config: Config, event_publisher: EventPublisher) -> Server
"required": ["revision", "category", "name"] "required": ["revision", "category", "name"]
} }
), ),
Tool(
name="python_delete",
description="Delete a category script permanently. Commits removal to git history and removes any conda packages that are no longer needed.",
inputSchema={
"type": "object",
"properties": {
"category": {
"type": "string",
"enum": ["strategy", "indicator", "research"],
"description": "Category of the script"
},
"name": {
"type": "string",
"description": "Display name of the item to delete"
}
},
"required": ["category", "name"]
}
),
Tool(
name="conda_sync",
description="Sync conda packages: scan all metadata, remove unused packages (excluding base environment)",
@@ -699,6 +795,77 @@ def create_mcp_server(config: Config, event_publisher: EventPublisher) -> Server
"required": [] "required": []
} }
), ),
Tool(
name="get_backtest_results",
description=(
"Retrieve stored backtest results for a strategy. "
"Returns the most recent backtest runs with summary stats, "
"extended statistics, trade list, and equity curve."
),
inputSchema={
"type": "object",
"properties": {
"strategy_name": {
"type": "string",
"description": "Display name of the strategy"
},
"limit": {
"type": "integer",
"description": "Maximum number of backtest runs to return (default 5)",
"default": 5
}
},
"required": ["strategy_name"]
}
),
Tool(
name="get_strategy_trades",
description=(
"Retrieve the trade log for a strategy (live/paper or backtest). "
"Returns individual round-trip trades with entry/exit prices and PnL."
),
inputSchema={
"type": "object",
"properties": {
"strategy_name": {
"type": "string",
"description": "Display name of the strategy"
},
"limit": {
"type": "integer",
"description": "Maximum number of trades to return (default 100)",
"default": 100
}
},
"required": ["strategy_name"]
}
),
Tool(
name="get_strategy_events",
description=(
"Retrieve the event log for a strategy "
"(PnL updates, fills, errors, status changes)."
),
inputSchema={
"type": "object",
"properties": {
"strategy_name": {
"type": "string",
"description": "Display name of the strategy"
},
"event_type": {
"type": "string",
"description": "Filter by event type (optional): PNL_UPDATE, ORDER_FILLED, ERROR, etc."
},
"limit": {
"type": "integer",
"description": "Maximum number of events to return (default 50)",
"default": 50
}
},
"required": ["strategy_name"]
}
),
]
@@ -734,7 +901,11 @@ def create_mcp_server(config: Config, event_publisher: EventPublisher) -> Server
metadata=arguments.get("metadata")
)
content = []
meta_parts = [f"success: {result['success']}"]
if result.get('path'):
meta_parts.append(f"path: {result['path']}")
if result.get('error'):
meta_parts.append(f"error: {result['error']}")
if result.get("revision"): if result.get("revision"):
meta_parts.append(f"revision: {result['revision']}") meta_parts.append(f"revision: {result['revision']}")
if result.get("validation") and not result["validation"].get("success"): if result.get("validation") and not result["validation"].get("success"):
@@ -747,8 +918,9 @@ def create_mcp_server(config: Config, event_publisher: EventPublisher) -> Server
logging.info(f"python_write '{arguments.get('name')}': returning {len(content)} items, {image_count} images") logging.info(f"python_write '{arguments.get('name')}': returning {len(content)} items, {image_count} images")
else: else:
logging.info(f"python_write '{arguments.get('name')}': no execution result (category={arguments.get('category')})") logging.info(f"python_write '{arguments.get('name')}': no execution result (category={arguments.get('category')})")
if result.get("success") and arguments.get("category") == "indicator": if result.get("success"):
_upsert_indicator_type(workspace_store, category_manager, arguments.get("name", "")) _upsert_type(workspace_store, category_manager, arguments.get("category", ""), arguments.get("name", ""))
cleanup_extra_packages(get_data_dir(), _get_env_yml())
return content
elif name == "python_edit":
result = category_manager.edit(
@@ -760,7 +932,11 @@ def create_mcp_server(config: Config, event_publisher: EventPublisher) -> Server
metadata=arguments.get("metadata")
)
content = []
meta_parts = [f"success: {result['success']}"]
if result.get('path'):
meta_parts.append(f"path: {result['path']}")
if result.get('error'):
meta_parts.append(f"error: {result['error']}")
if result.get("revision"): if result.get("revision"):
meta_parts.append(f"revision: {result['revision']}") meta_parts.append(f"revision: {result['revision']}")
if result.get("validation") and not result["validation"].get("success"): if result.get("validation") and not result["validation"].get("success"):
@@ -773,8 +949,9 @@ def create_mcp_server(config: Config, event_publisher: EventPublisher) -> Server
logging.info(f"python_edit '{arguments.get('name')}': returning {len(content)} items, {image_count} images") logging.info(f"python_edit '{arguments.get('name')}': returning {len(content)} items, {image_count} images")
else: else:
logging.info(f"python_edit '{arguments.get('name')}': no execution result") logging.info(f"python_edit '{arguments.get('name')}': no execution result")
if result.get("success") and arguments.get("category") == "indicator": if result.get("success"):
_upsert_indicator_type(workspace_store, category_manager, arguments.get("name", "")) _upsert_type(workspace_store, category_manager, arguments.get("category", ""), arguments.get("name", ""))
cleanup_extra_packages(get_data_dir(), _get_env_yml())
return content
elif name == "python_read":
return category_manager.read(
@@ -808,13 +985,28 @@ def create_mcp_server(config: Config, event_publisher: EventPublisher) -> Server
meta_parts.append(f"error: {result['error']}") meta_parts.append(f"error: {result['error']}")
if result.get("validation") and not result["validation"].get("success"): if result.get("validation") and not result["validation"].get("success"):
meta_parts.append(f"validation errors: {result['validation'].get('errors', [])}") meta_parts.append(f"validation errors: {result['validation'].get('errors', [])}")
if result.get("success"):
_upsert_type(workspace_store, category_manager, arguments.get("category", ""), arguments.get("name", ""))
return [TextContent(type="text", text="\n".join(meta_parts))]
elif name == "python_delete":
result = category_manager.delete(
category=arguments.get("category", ""),
name=arguments.get("name", "")
)
if result.get("success"):
_remove_type(workspace_store, arguments.get("category", ""), arguments.get("name", ""))
cleanup_result = cleanup_extra_packages(get_data_dir(), _get_env_yml())
if cleanup_result.get("removed"):
result["packages_removed"] = cleanup_result["removed"]
parts = [f"success: {result['success']}"]
for k in ("category", "name", "revision", "packages_removed", "error"):
if result.get(k):
parts.append(f"{k}: {result[k]}")
return [TextContent(type="text", text="\n".join(parts))]
elif name == "conda_sync": elif name == "conda_sync":
# Get environment.yml path relative to main.py
env_yml = Path(__file__).parent / "environment.yml"
return sync_packages(
data_dir=get_data_dir(),
environment_yml=_get_env_yml()
)
elif name == "conda_install":
return install_packages(arguments.get("packages", []))
@@ -837,7 +1029,7 @@ def create_mcp_server(config: Config, event_publisher: EventPublisher) -> Server
parameters=arguments.get("parameters") or {},
)
elif name == "backtest_strategy":
result = await backtest_strategy(
strategy_name=arguments.get("strategy_name", ""),
feeds=arguments.get("feeds", []),
from_time=arguments.get("from_time"),
@@ -845,6 +1037,26 @@ def create_mcp_server(config: Config, event_publisher: EventPublisher) -> Server
initial_capital=float(arguments.get("initial_capital", 10_000.0)),
paper=bool(arguments.get("paper", True)),
)
# Persist backtest to DB (non-fatal)
try:
payload = json.loads(result[0].text) if result and isinstance(result[0], TextContent) else {}
if payload and "summary" in payload:
from dexorder.strategy.db import get_strategy_db
db = get_strategy_db(get_data_dir())
await db.insert_backtest(
strategy_name=arguments.get("strategy_name", ""),
from_time=arguments.get("from_time"),
to_time=arguments.get("to_time"),
initial_capital=float(arguments.get("initial_capital", 10_000.0)),
feeds=arguments.get("feeds", []),
summary=payload.get("summary", {}),
statistics=payload.get("statistics", {}),
trades=payload.get("trades", []),
equity_curve=payload.get("equity_curve", []),
)
except Exception as _e:
logging.debug("Failed to persist backtest results: %s", _e)
return result
elif name == "activate_strategy": elif name == "activate_strategy":
return await activate_strategy( return await activate_strategy(
strategy_name=arguments.get("strategy_name", ""), strategy_name=arguments.get("strategy_name", ""),
@@ -858,6 +1070,31 @@ def create_mcp_server(config: Config, event_publisher: EventPublisher) -> Server
)
elif name == "list_active_strategies":
return await list_active_strategies()
elif name == "get_backtest_results":
from dexorder.strategy.db import get_strategy_db
db = get_strategy_db(get_data_dir())
results = await db.get_backtests(
strategy_name=arguments.get("strategy_name", ""),
limit=int(arguments.get("limit", 5)),
)
return [TextContent(type="text", text=json.dumps({"backtest_runs": results}))]
elif name == "get_strategy_trades":
from dexorder.strategy.db import get_strategy_db
db = get_strategy_db(get_data_dir())
trades = await db.get_trades(
strategy_name=arguments.get("strategy_name", ""),
limit=int(arguments.get("limit", 100)),
)
return [TextContent(type="text", text=json.dumps({"trades": trades}))]
elif name == "get_strategy_events":
from dexorder.strategy.db import get_strategy_db
db = get_strategy_db(get_data_dir())
events = await db.get_events(
strategy_name=arguments.get("strategy_name", ""),
event_type=arguments.get("event_type"),
limit=int(arguments.get("limit", 50)),
)
return [TextContent(type="text", text=json.dumps({"events": events}))]
else:
raise ValueError(f"Unknown tool: {name}")
@@ -909,6 +1146,7 @@ class UserContainer:
self.event_publisher: Optional[EventPublisher] = None
self.mcp_server: Optional[Server] = None
self.data_api: Optional[DataAPIImpl] = None
self.event_bridge: Optional[StrategyEventBridge] = None
self.running = False
async def start(self) -> None:
@@ -933,6 +1171,7 @@ class UserContainer:
s3_endpoint=s3_cfg.get("s3_endpoint") or secrets.get("s3_endpoint"),
s3_access_key=s3_cfg.get("s3_access_key") or secrets.get("s3_access_key"),
s3_secret_key=s3_cfg.get("s3_secret_key") or secrets.get("s3_secret_key"),
s3_region=s3_cfg.get("s3_region") or secrets.get("s3_region"),
)
await self.data_api.start()
set_api(API(charting=ChartingAPIImpl(), data=self.data_api))
@@ -965,6 +1204,23 @@ class UserContainer:
delivery=DeliverySpec.active_or_telegram(),
))
# Initialize strategy lifecycle manager (sets up DB + worktrees dir)
strategy_lifecycle = get_strategy_lifecycle(self.config.data_dir)
await strategy_lifecycle.initialize()
# Start strategy event bridge (PULL socket for subprocess events)
self.event_bridge = StrategyEventBridge(
event_publisher=self.event_publisher,
strategy_lifecycle=strategy_lifecycle,
)
await self.event_bridge.start()
strategy_lifecycle._bridge = self.event_bridge
strategy_lifecycle._lifecycle = get_lifecycle_manager()
logging.info("Strategy event bridge started")
# Resume any strategies that were running before container restart
await strategy_lifecycle.resume_running()
# Create MCP server
self.mcp_server = create_mcp_server(self.config, self.event_publisher)
@@ -998,6 +1254,20 @@ class UserContainer:
delivery=DeliverySpec.active_or_telegram(),
))
# Stop running strategies gracefully
try:
from dexorder.strategy.lifecycle import get_strategy_lifecycle
strategy_lifecycle = get_strategy_lifecycle()
await strategy_lifecycle.shutdown()
logging.info("Strategy lifecycle manager stopped")
except Exception as e:
logging.warning("Error stopping strategy lifecycle: %s", e)
# Stop event bridge
if self.event_bridge:
await self.event_bridge.stop()
logging.info("Strategy event bridge stopped")
# Stop subsystems
if self.data_api:
await self.data_api.stop()

View File

@@ -1,30 +0,0 @@
from setuptools import setup, find_packages
setup(
name="dexorder-sandbox",
version="0.1.0",
description="Dexorder Trading Platform Sandbox",
packages=find_packages(),
python_requires=">=3.9",
install_requires=[
"pyiceberg>=0.6.0",
"pyarrow>=14.0.0",
"pandas>=2.0.0",
"pyzmq>=25.0.0",
"protobuf>=4.25.0",
"pyyaml>=6.0",
"aiofiles>=23.0.0",
"mcp>=1.0.0",
"jsonpatch>=1.33",
"starlette>=0.27.0",
"uvicorn>=0.27.0",
"sse-starlette>=1.6.0",
"matplotlib>=3.7.0",
],
extras_require={
"dev": [
"pytest>=7.0.0",
"pytest-asyncio>=0.21.0",
]
},
)

View File

@@ -59,6 +59,18 @@ const addToolCallBubble = (label: string) => {
}]
}
const appendToolCallStatus = (status: string) => {
if (!toolCallMessageId) return
const idx = messages.value.findIndex(m => m._id === toolCallMessageId)
if (idx !== -1) {
messages.value[idx] = {
...messages.value[idx],
content: messages.value[idx].content + `\n↳ ${status}`
}
messages.value = [...messages.value]
}
}
const removeToolCallBubble = () => {
if (toolCallMessageId) {
messages.value = messages.value.filter(m => m._id !== toolCallMessageId)
@@ -76,11 +88,47 @@ const streamingImages = ref<any[]>([])
const handleMessage = (data: WebSocketMessage) => {
console.log('[ChatPanel] Received message:', data)
if (data.type === 'conversation_history') {
messages.value = (data.messages as any[]).map((m: any) => {
const ts = new Date(m.timestamp / 1000) // microseconds → ms
const files = (m.files ?? []).map((b: any) => ({
name: `image_${b.id}.png`,
size: 0,
type: b.mimeType.split('/')[1] ?? 'png',
url: `data:${b.mimeType};base64,${b.data}`,
preview: `data:${b.mimeType};base64,${b.data}`,
}))
return {
_id: m.id,
content: m.content,
senderId: m.role === 'user' ? CURRENT_USER_ID : AGENT_ID,
timestamp: ts.toTimeString().split(' ')[0].slice(0, 5),
date: ts.toLocaleDateString(),
saved: true,
distributed: true,
seen: true,
files,
}
})
messagesLoaded.value = true
return
}
if (data.type === 'agent_tool_call') {
addToolCallBubble(data.label ?? data.toolName ?? 'Tool call...')
return
}
if (data.type === 'subagent_tool_call') {
appendToolCallStatus(data.toolName ?? data.label ?? 'tool')
return
}
if (data.type === 'subagent_chunk') {
// Subagent final text — not shown separately; the main agent will incorporate it in its response
return
}
if (data.type === 'image') {
// Handle image message - attach to current streaming message or create standalone
console.log('[ChatPanel] Processing image message')

View File

@@ -3,6 +3,24 @@ import * as jsonpatch from 'fast-json-patch';
import type { BackendMessage, FrontendMessage, HelloMessage, PatchMessage } from '../types/sync';
import { wsManager } from './useWebSocket';
function deepReplace(target: Record<string, any>, source: Record<string, any>) {
for (const key of Object.keys(target)) {
if (!(key in source)) {
delete target[key]
}
}
for (const [key, value] of Object.entries(source)) {
if (
value !== null && typeof value === 'object' && !Array.isArray(value) &&
target[key] !== null && typeof target[key] === 'object' && !Array.isArray(target[key])
) {
deepReplace(target[key], value)
} else {
target[key] = value
}
}
}
export function useStateSync(stores: Record<string, Store>) {
console.log('[StateSync] Initializing with stores:', Object.keys(stores));
@@ -35,7 +53,7 @@ export function useStateSync(stores: Record<string, Store>) {
if (store) {
console.log('[StateSync] Applying snapshot state:', msg.state);
isApplyingBackendPatch[msg.store] = true;
store.$patch((state) => deepReplace(state as Record<string, any>, msg.state as Record<string, any>));
// Update previousState to stay in sync
previousStates[msg.store] = JSON.parse(JSON.stringify(store.$state));
isApplyingBackendPatch[msg.store] = false;
@@ -64,7 +82,7 @@ export function useStateSync(stores: Record<string, Store>) {
const newState = jsonpatch.applyPatch(currentState, msg.patch, false, false).newDocument;
console.log('[StateSync] New state after patch:', newState);
isApplyingBackendPatch[msg.store] = true;
store.$patch((state) => deepReplace(state as Record<string, any>, newState as Record<string, any>));
// Update previousState to stay in sync
previousStates[msg.store] = JSON.parse(JSON.stringify(store.$state));
isApplyingBackendPatch[msg.store] = false;

View File

@@ -123,8 +123,9 @@ class WebSocketManager {
this.statusMessage.value = ''
console.log('WebSocket disconnected:', event.code, event.reason)
// Attempt to reconnect if we have a token and it wasn't an intentional close.
// Check code instead of wasClean: code 1005 has wasClean=true but still needs retry.
if (this.token && event.code !== 1000 && event.code !== 1001) {
this.scheduleReconnect()
}
}