bugfixes; research subproc; higher sandbox limits

This commit is contained in:
2026-04-16 18:11:26 -04:00
parent f80c943dc3
commit 3153e89d4f
54 changed files with 1947 additions and 498 deletions

View File

@@ -204,32 +204,17 @@ if [ -z "$USER_ID" ]; then
fi
echo -e "${GREEN}User ID: $USER_ID${NC}"
# Build license JSON based on type
case "$LICENSE_TYPE" in
enterprise)
LICENSE_JSON='{"licenseType":"enterprise","features":{"maxIndicators":200,"maxStrategies":100,"maxBacktestDays":1825,"realtimeData":true,"customExecutors":true,"apiAccess":true},"resourceLimits":{"maxConcurrentSessions":20,"maxMessagesPerDay":10000,"maxTokensPerMessage":32768,"rateLimitPerMinute":300},"k8sResources":{"memoryRequest":"1Gi","memoryLimit":"4Gi","cpuRequest":"500m","cpuLimit":"4000m","storage":"50Gi","tmpSizeLimit":"1Gi","enableIdleShutdown":true,"idleTimeoutMinutes":120},"preferredModel":{"provider":"anthropic","model":"claude-opus-4-6","temperature":0.7}}'
;;
free)
LICENSE_JSON='{"licenseType":"free","features":{"maxIndicators":10,"maxStrategies":3,"maxBacktestDays":30,"realtimeData":false,"customExecutors":false,"apiAccess":false},"resourceLimits":{"maxConcurrentSessions":1,"maxMessagesPerDay":100,"maxTokensPerMessage":4096,"rateLimitPerMinute":20},"k8sResources":{"memoryRequest":"256Mi","memoryLimit":"512Mi","cpuRequest":"100m","cpuLimit":"500m","storage":"2Gi","tmpSizeLimit":"128Mi","enableIdleShutdown":true,"idleTimeoutMinutes":30},"preferredModel":{"provider":"anthropic","model":"claude-haiku-4-5-20251001","temperature":0.7}}'
;;
pro|*)
LICENSE_JSON='{"licenseType":"pro","features":{"maxIndicators":50,"maxStrategies":20,"maxBacktestDays":365,"realtimeData":true,"customExecutors":true,"apiAccess":true},"resourceLimits":{"maxConcurrentSessions":5,"maxMessagesPerDay":1000,"maxTokensPerMessage":8192,"rateLimitPerMinute":60},"k8sResources":{"memoryRequest":"512Mi","memoryLimit":"2Gi","cpuRequest":"250m","cpuLimit":"2000m","storage":"10Gi","tmpSizeLimit":"256Mi","enableIdleShutdown":true,"idleTimeoutMinutes":60},"preferredModel":{"provider":"anthropic","model":"claude-sonnet-4-6","temperature":0.7}}'
;;
esac
echo -e "${GREEN}→${NC} Creating $LICENSE_TYPE license..."
$KUBECTL exec "$PG_POD" -- psql -U postgres -d iceberg -c "
INSERT INTO user_licenses (user_id, email, license, mcp_server_url)
VALUES (
'$USER_ID',
'$USER_EMAIL',
'$LICENSE_JSON',
'$MCP_URL'
)
ON CONFLICT (user_id) DO UPDATE SET
license = EXCLUDED.license,
updated_at = NOW();
" > /dev/null
echo -e "${GREEN}→${NC} Setting $LICENSE_TYPE license..."
HTTP_CODE=$(curl -s -o /tmp/dexorder-set-tier-response.json -w "%{http_code}" \
-X POST "$BASE_URL/api/admin/users/$USER_ID/set-tier" \
-H "Content-Type: application/json" \
-d "{\"tier\": \"$LICENSE_TYPE\"}")
if [[ "$HTTP_CODE" != "200" ]]; then
echo -e "${RED}✗ Failed to set license tier (HTTP $HTTP_CODE)${NC}"
cat /tmp/dexorder-set-tier-response.json 2>/dev/null
exit 1
fi
rm -f /tmp/dexorder-set-tier-response.json
echo -e "${GREEN}✓ User ready: $USER_EMAIL ($LICENSE_TYPE)${NC}"
echo ""

View File

@@ -1,6 +1,6 @@
# RBAC for gateway to CREATE sandbox deployments only
# Principle of least privilege: gateway can ONLY create deployments/services/PVCs
# in the sandbox namespace. Deletion is handled by the lifecycle sidecar.
# RBAC for gateway to manage sandbox deployments
# Principle of least privilege: gateway can create/delete deployments in the
# sandbox namespace. PVC deletion is still handled by the lifecycle sidecar.
# No pods, secrets, exec, or cross-namespace access.
---
apiVersion: v1
@@ -15,10 +15,10 @@ metadata:
name: sandbox-creator
namespace: sandbox
rules:
# Deployments: create and read only (deletion handled by sidecar)
# Deployments: full management (delete used for license tier changes; PVC deletion still via sidecar)
- apiGroups: ["apps"]
resources: ["deployments"]
verbs: ["create", "get", "list", "watch", "patch", "update"]
verbs: ["create", "get", "list", "watch", "patch", "update", "delete"]
# PVCs: create and read (deletion handled by sidecar)
- apiGroups: [""]
@@ -41,7 +41,6 @@ rules:
verbs: ["get"]
# Explicitly NOT included:
# - deployments/delete - handled by lifecycle sidecar
# - pvc/delete - handled by lifecycle sidecar
# - services/delete - handled by lifecycle sidecar
# - pods (create/delete) - must go through deployments

View File

@@ -83,10 +83,10 @@ spec:
resources:
requests:
memory: "256Mi"
memory: "512Mi"
cpu: "100m"
limits:
memory: "512Mi"
memory: "2Gi"
cpu: "500m"
livenessProbe:

View File

@@ -19,8 +19,8 @@ spec:
cpu: "100m"
# Maximum any single container can request
max:
memory: "2Gi"
cpu: "2000m"
memory: "8Gi"
cpu: "4000m"
min:
memory: "32Mi"
cpu: "10m"

View File

@@ -4,18 +4,32 @@
flink_hostname: flink-jobmanager
ingestor_broker_port: 5567
# Supported exchanges (subscribe to these prefixes)
# Supported exchanges (used for symbol metadata generation)
supported_exchanges:
- BINANCE
- COINBASE
- KRAKEN
# Per-exchange work slot capacity.
# Each slot is one concurrent job. historical_slots limits parallel OHLC fetches;
# realtime_slots limits concurrent tick subscriptions. Set based on exchange rate
# limits and connection constraints — these are conservative starting values.
exchange_capacity:
BINANCE:
historical_slots: 1
realtime_slots: 5
COINBASE:
historical_slots: 1
realtime_slots: 4
KRAKEN:
historical_slots: 1
realtime_slots: 3
# Kafka configuration
kafka_brokers:
- kafka:9092
# Worker configuration
max_concurrent: 10
poll_interval_ms: 10000
# Logging

View File

@@ -46,6 +46,11 @@ data:
alerts:
max_active: 100
# Memory guard: soft RLIMIT_AS limit as a fraction of the cgroup memory.max.
# Set below 1.0 so Python raises MemoryError before the kernel OOM-kills the pod.
memory:
limit_fraction: 0.85
# Logging
logging:
level: "INFO"

10
doc/plan.md Normal file
View File

@@ -0,0 +1,10 @@
# Development Plan
* Realtime data
* Triggers
* Strategy UI
* Backtesting TV integration
* Paper Trading
* User secrets
* Live Execution
* Sandbox <=> Dexorder auth

139
doc/prod_deployment.md Normal file
View File

@@ -0,0 +1,139 @@
# Production Deployment Guide
This document describes the full process for deploying the AI platform to the production Kubernetes cluster, including the special steps required when the Iceberg schema has changed.
## Overview
The production cluster runs under `kubectl --context prod`, defaulting to the `ai` namespace. The `sandbox` namespace is shared between dev and prod.
Deployment consists of two parts:
1. **Standard deploy** — rebuild and push all images, apply k8s manifests, roll out services
2. **Iceberg schema wipe** *(when schema has changed)* — clear both the Iceberg REST catalog (postgres) and the MinIO data warehouse before deploying
---
## Standard Deployment (no schema changes)
```bash
bin/deploy-all --sandboxes
```
This script (hardcoded to `--context=prod`) performs:
1. Applies base kustomize manifests (`deploy/k8s/prod/`) — namespaces, RBAC, policies
2. Applies `deploy/k8s/prod/infrastructure.yaml` — statefulsets, deployments
3. Runs `bin/config-update prod` — updates ConfigMaps
4. Builds and pushes images for all 7 services: `gateway`, `web`, `sandbox`, `lifecycle-sidecar`, `flink`, `relay`, `ingestor`
5. *(with `--sandboxes`)* Deletes sandbox Deployments and Services in the `sandbox` namespace (PVCs are retained; the gateway recreates the Deployments and Services on each user's next login)
6. Waits for rollouts on all 6 main deployments
> **Secrets are NOT updated by this script.** Run `bin/secret-update prod` separately if secrets have changed.
---
## Full Deploy with Iceberg Schema Wipe
Use this when the Iceberg table schema has changed (e.g. protobuf/column changes in the `trading.ohlc` table).
### Architecture note
The Iceberg REST catalog uses **two storage layers** that must both be cleared:
| Layer | What it stores | How to clear |
|---|---|---|
| PostgreSQL `iceberg` database | Table/namespace metadata (catalog) | Drop and recreate the database |
| MinIO `warehouse` bucket | Parquet data files | `mc rm --recursive --force` |
**Important:** The gateway also uses the `iceberg` postgres database for its own auth tables (`user`, `user_licenses`, `session`, etc.). Wiping the database removes all user accounts. After the wipe, the schema must be re-applied and users recreated.
### Step-by-step
#### 1. Scale down Iceberg consumers
```bash
kubectl --context prod -n ai scale deployment iceberg-catalog flink-jobmanager flink-taskmanager --replicas=0
```
This prevents in-flight writes during the wipe.
#### 2. Wipe the Iceberg PostgreSQL catalog
```bash
kubectl --context prod -n ai exec postgres-0 -- psql -U postgres -c "DROP DATABASE iceberg;"
kubectl --context prod -n ai exec postgres-0 -- psql -U postgres -c "CREATE DATABASE iceberg;"
```
#### 3. Wipe the MinIO warehouse bucket
Get MinIO credentials from the cluster secret:
```bash
kubectl --context prod -n ai get secret minio-secret -o jsonpath='{.data.root-user}' | base64 -d
kubectl --context prod -n ai get secret minio-secret -o jsonpath='{.data.root-password}' | base64 -d
```
Configure the `mc` client inside the MinIO pod and remove all objects:
```bash
kubectl --context prod -n ai exec minio-0 -- mc alias set local http://localhost:9000 <user> <password>
kubectl --context prod -n ai exec minio-0 -- mc rm --recursive --force local/warehouse/
```
#### 4. Run the full deploy
```bash
bin/deploy-all --sandboxes
```
This rebuilds and redeploys all services, including `iceberg-catalog`, `flink-jobmanager`, and `flink-taskmanager` (which were scaled to zero above — `deploy-all` will restore them to their manifest replica counts).
#### 5. Re-apply the gateway database schema
The gateway does **not** auto-migrate. After the `iceberg` database is recreated, the schema must be applied manually:
```bash
kubectl --context prod -n ai exec -i postgres-0 -- psql -U postgres -d iceberg < gateway/schema.sql
```
This creates the `user`, `session`, `user_licenses`, and related tables.
#### 6. Recreate all users
```bash
bin/create-all-users prod
```
This registers all alpha test users via the gateway API and assigns their licenses. Users are defined in the script itself (`bin/create-all-users`).
To add or modify users, edit that file or run `bin/create-user prod` interactively.
---
## Verification
```bash
curl -I https://dexorder.ai/api/health
```
Check gateway logs for errors:
```bash
kubectl --context prod -n ai logs deployment/gateway --tail=100
```
---
## Common Issues
### Login fails after Iceberg wipe
**Symptom:** `Sign in failed` (401) or `User creation failed` (postgres error `42P01: undefined table`)
**Cause:** Dropping the `iceberg` database removes the gateway's auth tables along with the Iceberg catalog metadata — they share the same database.
**Fix:** Re-apply the schema and recreate users (steps 5 and 6 above).
### Gateway shows `42P01` errors but pod is running
The gateway does not auto-migrate on startup. The schema file must be applied manually after any database recreation. A gateway restart alone will not fix this.

View File

@@ -81,18 +81,29 @@ All sockets bind on **Relay** (well-known endpoint). Components connect to relay
- Relay publishes DataRequest to ingestor work queue
- No request tracking - relay is stateless
### 2. Ingestor Work Queue (Relay → Ingestors)
**Pattern**: PUB/SUB with exchange prefix filtering
- **Socket Type**: Relay uses PUB (bind), Ingestors use SUB (connect)
- **Endpoint**: `tcp://*:5555` (Relay binds)
- **Message Types**: `DataRequest` (historical or realtime)
- **Topic Prefix**: Market name (e.g., `BTC/USDT.`, `ETH/BTC.`)
- **Behavior**:
- Relay publishes work with exchange prefix from ticker
- Ingestors subscribe only to exchanges they support
- Multiple ingestors can compete for same exchange
- Ingestors write data to Kafka only (no direct response)
- Flink processes Kafka → Iceberg → notification
### 2. Ingestor Work Queue (Flink ↔ Ingestors)
**Pattern**: ROUTER/DEALER slot-based broker
- **Socket Type**: Flink `IngestorBroker` uses ROUTER (bind), Ingestors use DEALER (connect)
- **Endpoint**: `tcp://*:5567` (Flink binds)
- **Message Types**: `WorkerReady` (slot offer), `DataRequest` (work assignment), `WorkComplete`, `WorkHeartbeat`, `WorkReject`, `WorkStop`
- **Capacity model**:
- Each `WorkerReady` (0x20) is ONE slot offer for one exchange and one job type (`SlotType`: `HISTORICAL=1`, `REALTIME=2`, `ANY=0`)
- Ingestors send N `WorkerReady` messages at startup — one per available slot per exchange per type
- Flink dispatches a job by matching the slot's exchange and SlotType to the request
- The slot is consumed on dispatch; the ingestor re-offers it (new `WorkerReady`) when the job ends
- Rate-limit backoff: if the exchange returns a 429, the ingestor delays the re-offer by the `Retry-After` duration from the response header
- **Historical job lifecycle**:
- Flink dispatches `DataRequest` (HISTORICAL_OHLC) → ingestor fetches and writes to Kafka → sends `WorkComplete` (0x21) → sends new `WorkerReady` for that slot
- **Realtime job lifecycle**:
- Flink dispatches `DataRequest` (REALTIME_TICKS) → ingestor polls exchange and writes ticks to Kafka → sends `WorkHeartbeat` (0x22) every 5 s → on `WorkStop` (0x25) from Flink: cancels and sends new `WorkerReady`
- **Slot configuration** (per ingestor, per exchange):
```yaml
exchange_capacity:
BINANCE: { historical_slots: 3, realtime_slots: 5 }
KRAKEN: { historical_slots: 2, realtime_slots: 3 }
COINBASE: { historical_slots: 2, realtime_slots: 4 }
```
- **Flink restart**: when Flink restarts, its `freeSlots` deque is cleared; all in-flight jobs time out on the ingestor side, releasing their slots, which are then re-offered via `WorkerReady`
### 3. Market Data Fanout (Relay ↔ Flink ↔ Clients)
**Pattern**: XPUB/XSUB proxy

View File

@@ -1,4 +1 @@
what conclusions can you make by analyzing historical data on ETH price direction changes near market session overlaps and market session changes on Monday and Tuesday?
---

View File

@@ -3,6 +3,7 @@ package com.dexorder.flink.ingestor;
import com.dexorder.flink.zmq.ZmqChannelManager;
import com.dexorder.proto.DataRequest;
import com.dexorder.proto.RealtimeParams;
import com.dexorder.proto.SlotType;
import com.dexorder.proto.SubmitHistoricalRequest;
import com.dexorder.proto.WorkComplete;
import com.dexorder.proto.WorkHeartbeat;
@@ -17,27 +18,27 @@ import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Deque;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Queue;
import java.util.Set;
import java.util.UUID;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentLinkedQueue;
/**
* LRU-style work broker for ingestors.
* Slot-based work broker for ingestors.
*
* Ingestors connect via DEALER to the ROUTER socket on port 5567. They register with READY,
* are dispatched WORK messages, and respond with COMPLETE (historical) or HEARTBEAT (realtime).
* If a heartbeat times out the job is re-queued and dispatched to another available worker.
* Each WorkerReady message from an ingestor represents ONE available slot for a
* specific exchange and job type (HISTORICAL or REALTIME). Flink consumes the slot
* by dispatching a DataRequest to it. The ingestor re-offers the slot (sends another
* WorkerReady) once the job completes, subject to any rate-limit backoff.
*
* Also receives SubmitHistoricalRequest messages forwarded by the relay on the PULL socket (5566).
* Also receives SubmitHistoricalRequest messages forwarded by the relay on the PULL
* socket (5566), and realtime job requests from RealtimeSubscriptionManager.
*
* Message type IDs (ZMQ framing, not Kafka):
* 0x10 SubmitHistoricalRequest (relay → Flink via PULL, same as client wire type)
* 0x20 WorkerReady (ingestor → Flink)
* Message type IDs (ZMQ framing):
* 0x10 SubmitHistoricalRequest (relay → Flink via PULL)
* 0x20 WorkerReady (ingestor → Flink: one slot offer)
* 0x21 WorkComplete (ingestor → Flink)
* 0x22 WorkHeartbeat (ingestor → Flink)
* 0x23 WorkReject (ingestor → Flink)
@@ -53,7 +54,7 @@ public class IngestorBroker implements AutoCloseable {
private static final byte MSG_TYPE_WORK_COMPLETE = 0x21;
private static final byte MSG_TYPE_WORK_HEARTBEAT = 0x22;
private static final byte MSG_TYPE_WORK_REJECT = 0x23;
private static final byte MSG_TYPE_WORK_ASSIGN = 0x01; // DataRequest type on wire
private static final byte MSG_TYPE_WORK_ASSIGN = 0x01;
private static final byte MSG_TYPE_WORK_STOP = 0x25;
/** Re-queue realtime job if no heartbeat received within this window (ms) */
@@ -65,20 +66,20 @@ public class IngestorBroker implements AutoCloseable {
private volatile boolean running;
private Thread brokerThread;
// ── Worker tracking ──────────────────────────────────────────────────────
// ── Slot tracking ─────────────────────────────────────────────────────────
/** Workers ready to accept a job, in LRU order (head = least recently used) */
private final Deque<WorkerInfo> freeWorkers = new ArrayDeque<>();
/**
* Available slots, in LRU order (head = least recently used).
* Each entry is one WorkerReady slot offer from an ingestor.
*/
private final Deque<WorkerSlot> freeSlots = new ArrayDeque<>();
/** Jobs waiting for a compatible free worker */
/** Jobs waiting for a compatible free slot */
private final Queue<DataRequest> pendingJobs = new ArrayDeque<>();
/** Jobs currently executing on a worker */
/** Jobs currently executing on a slot */
private final Map<String, ActiveJob> activeJobs = new ConcurrentHashMap<>();
/** Worker identity → supported exchanges (set once on READY) */
private final Map<String, WorkerInfo> knownWorkers = new ConcurrentHashMap<>();
// ── Thread-safe inbound queue from RealtimeSubscriptionManager ───────────
private final Queue<DataRequest> externalSubmissions = new ConcurrentLinkedQueue<>();
@@ -134,8 +135,7 @@ public class IngestorBroker implements AutoCloseable {
/**
* Stop all realtime jobs for a ticker (called when last subscriber leaves).
* Thread-safe — posts a stop marker via externalSubmissions is complex; instead we
* directly find and stop active jobs. Protected by ConcurrentHashMap.
* Thread-safe via ConcurrentHashMap.
*/
public void stopRealtimeJobsForTicker(String ticker) {
List<String> toStop = new ArrayList<>();
@@ -154,7 +154,7 @@ public class IngestorBroker implements AutoCloseable {
}
}
// ── Broker loop ──────────────────────────────────────────────────────────
// ── Broker loop ──────────────────────────────────────────────────────────
private void brokerLoop() {
ZMQ.Socket pullSocket = zmqManager.getSocket(ZmqChannelManager.Channel.CLIENT_REQUEST);
@@ -174,18 +174,15 @@ public class IngestorBroker implements AutoCloseable {
enqueueJob(ext);
}
// Poll sockets (100ms timeout)
poller.poll(100);
if (poller.pollin(0)) {
handleClientRequest(pullSocket);
}
if (poller.pollin(1)) {
handleWorkerMessage(routerSocket);
}
// Check for heartbeat / completion timeouts
checkTimeouts();
} catch (Exception e) {
@@ -235,7 +232,8 @@ public class IngestorBroker implements AutoCloseable {
.setClientId(req.hasClientId() ? req.getClientId() : "")
.build();
enqueueJob(dataRequest);
LOG.info("Received historical request from relay: request_id={}, ticker={}", req.getRequestId(), req.getTicker());
LOG.info("Received historical request from relay: request_id={}, ticker={}",
req.getRequestId(), req.getTicker());
} catch (Exception e) {
LOG.error("Failed to parse SubmitHistoricalRequest from relay", e);
}
@@ -277,23 +275,28 @@ public class IngestorBroker implements AutoCloseable {
}
}
/**
* A WorkerReady message represents ONE slot offer for one exchange and job type.
* Add it directly to freeSlots — no deduplication (multiple slots per ingestor are expected).
*/
private void handleWorkerReady(byte[] identity, String identityKey, byte[] payload) throws Exception {
WorkerReady ready = WorkerReady.parseFrom(payload);
Set<String> exchanges = new HashSet<>(ready.getExchangesList());
SlotType slotType = ready.getJobType();
WorkerInfo worker = knownWorkers.computeIfAbsent(identityKey,
k -> new WorkerInfo(identity, identityKey, exchanges));
worker.exchanges = exchanges; // update in case re-READY with different config
worker.identity = identity;
if (!freeWorkers.contains(worker)) {
freeWorkers.addLast(worker);
for (String exchange : ready.getExchangesList()) {
WorkerSlot slot = new WorkerSlot(identity, identityKey, exchange.toUpperCase(), slotType);
freeSlots.addLast(slot);
LOG.info("Worker slot READY: id={}, exchange={}, type={}, totalFreeSlots={}",
identityKey, exchange, slotType, freeSlots.size());
}
LOG.info("Ingestor READY: id={}, exchanges={}, freeWorkers={}", identityKey, exchanges, freeWorkers.size());
dispatchPending();
}
/**
* Historical job completed. Remove from activeJobs.
* The ingestor will send a new typed WorkerReady to re-offer the slot.
*/
private void handleWorkComplete(String identityKey, byte[] payload) throws Exception {
WorkComplete complete = WorkComplete.parseFrom(payload);
String jobId = complete.getJobId();
@@ -304,13 +307,7 @@ public class IngestorBroker implements AutoCloseable {
} else {
LOG.info("Job COMPLETE: jobId={}, ticker={}, success={}", jobId, job.ticker, complete.getSuccess());
}
// Worker is free again
WorkerInfo worker = knownWorkers.get(identityKey);
if (worker != null) {
freeWorkers.addLast(worker);
dispatchPending();
}
// Slot re-registration is driven by the ingestor via a new WorkerReady.
}
private void handleWorkHeartbeat(String identityKey, byte[] payload) throws Exception {
@@ -325,6 +322,10 @@ public class IngestorBroker implements AutoCloseable {
}
}
/**
* Ingestor rejected the job. Re-queue it with a new ID.
* The ingestor will send a new typed WorkerReady when it's ready again.
*/
private void handleWorkReject(String identityKey, byte[] payload) throws Exception {
WorkReject reject = WorkReject.parseFrom(payload);
String jobId = reject.getJobId();
@@ -332,31 +333,23 @@ public class IngestorBroker implements AutoCloseable {
ActiveJob job = activeJobs.remove(jobId);
if (job != null) {
// Re-queue with fresh job_id so a different ingestor may pick it up
DataRequest requeued = job.request.toBuilder()
.setJobId(UUID.randomUUID().toString())
.build();
pendingJobs.add(requeued);
}
// Worker is still free (it rejected, not crashed)
WorkerInfo worker = knownWorkers.get(identityKey);
if (worker != null) {
freeWorkers.addLast(worker);
dispatchPending();
}
// Slot re-registration is driven by the ingestor via a new WorkerReady.
}
// ── Dispatch ─────────────────────────────────────────────────────────────
// ── Dispatch ─────────────────────────────────────────────────────────────
private void enqueueJob(DataRequest request) {
// Check if we can immediately dispatch
WorkerInfo worker = findFreeWorker(exchangeOf(request.getTicker()));
if (worker != null) {
dispatch(worker, request);
WorkerSlot slot = findFreeSlot(exchangeOf(request.getTicker()), request.getType());
if (slot != null) {
dispatch(slot, request);
} else {
pendingJobs.add(request);
LOG.debug("No free worker for {}, queued (pendingJobs={})", request.getTicker(), pendingJobs.size());
LOG.debug("No free slot for {}, queued (pendingJobs={})", request.getTicker(), pendingJobs.size());
}
}
@@ -364,9 +357,9 @@ public class IngestorBroker implements AutoCloseable {
Queue<DataRequest> remaining = new ArrayDeque<>();
DataRequest job;
while ((job = pendingJobs.poll()) != null) {
WorkerInfo worker = findFreeWorker(exchangeOf(job.getTicker()));
if (worker != null) {
dispatch(worker, job);
WorkerSlot slot = findFreeSlot(exchangeOf(job.getTicker()), job.getType());
if (slot != null) {
dispatch(slot, job);
} else {
remaining.add(job);
}
@@ -374,28 +367,30 @@ public class IngestorBroker implements AutoCloseable {
pendingJobs.addAll(remaining);
}
private void dispatch(WorkerInfo worker, DataRequest request) {
freeWorkers.remove(worker);
private void dispatch(WorkerSlot slot, DataRequest request) {
try {
byte[] protoBytes = request.toByteArray();
boolean sent = zmqManager.sendToWorker(worker.identity, PROTOCOL_VERSION, MSG_TYPE_WORK_ASSIGN, protoBytes);
boolean sent = zmqManager.sendToWorker(slot.identity, PROTOCOL_VERSION, MSG_TYPE_WORK_ASSIGN, protoBytes);
if (!sent) {
LOG.error("Failed to dispatch job to worker={}, re-queuing", worker.identityKey);
freeWorkers.addLast(worker);
// ROUTER_MANDATORY: identity is disconnected — purge all stale slots for this
// worker and re-queue the job so dispatchPending() can try a live slot.
int purged = purgeWorkerSlots(slot.identityKey);
LOG.warn("Worker {} unreachable, purged {} stale free slots, re-queuing job={}",
slot.identityKey, purged, request.getJobId());
pendingJobs.add(request);
return;
}
ActiveJob active = new ActiveJob(worker.identity, worker.identityKey,
ActiveJob active = new ActiveJob(slot.identity, slot.identityKey,
request, request.getTicker(), request.getType());
activeJobs.put(request.getJobId(), active);
LOG.info("Dispatched job: jobId={}, ticker={}, type={}, worker={}",
request.getJobId(), request.getTicker(), request.getType(), worker.identityKey);
LOG.info("Dispatched job: jobId={}, ticker={}, type={}, worker={}, slotType={}",
request.getJobId(), request.getTicker(), request.getType(),
slot.identityKey, slot.slotType);
} catch (Exception e) {
LOG.error("Error dispatching job", e);
freeWorkers.addLast(worker);
freeSlots.addLast(slot);
}
}
@@ -408,7 +403,7 @@ public class IngestorBroker implements AutoCloseable {
}
}
// ── Timeout checking ─────────────────────────────────────────────────────
// ── Timeout checking ─────────────────────────────────────────────────────
private void checkTimeouts() {
long now = System.currentTimeMillis();
@@ -426,10 +421,9 @@ public class IngestorBroker implements AutoCloseable {
for (String jobId : timedOut) {
ActiveJob job = activeJobs.remove(jobId);
if (job == null) continue;
LOG.warn("Job timed out (no heartbeat/completion): jobId={}, ticker={}, type={}, worker={}",
LOG.warn("Job timed out: jobId={}, ticker={}, type={}, worker={}",
jobId, job.ticker, job.type, job.workerIdentityKey);
// Re-queue with a new job_id
DataRequest requeued = job.request.toBuilder()
.setJobId(UUID.randomUUID().toString())
.build();
@@ -438,7 +432,7 @@ public class IngestorBroker implements AutoCloseable {
}
}
// ── Helpers ──────────────────────────────────────────────────────────────
// ── Helpers ──────────────────────────────────────────────────────────────
/** Extract exchange name from ticker, e.g. "BTC/USDT.BINANCE" → "BINANCE" */
private static String exchangeOf(String ticker) {
@@ -446,12 +440,32 @@ public class IngestorBroker implements AutoCloseable {
return dot >= 0 ? ticker.substring(dot + 1).toUpperCase() : "";
}
/** Find and remove a free worker that supports the given exchange. */
private WorkerInfo findFreeWorker(String exchange) {
for (WorkerInfo w : freeWorkers) {
if (exchange.isEmpty() || w.exchanges.contains(exchange)) {
freeWorkers.remove(w);
return w;
/**
* Remove all free slots offered by a given worker identity.
* Called when a dispatch to that identity fails (ROUTER_MANDATORY unreachable).
* Returns the number of slots removed.
*/
private int purgeWorkerSlots(String identityKey) {
int before = freeSlots.size();
freeSlots.removeIf(slot -> slot.identityKey.equals(identityKey));
return before - freeSlots.size();
}
/**
* Find and remove a free slot that supports the given exchange and request type.
* A slot with SlotType.ANY matches any request type.
*/
private WorkerSlot findFreeSlot(String exchange, DataRequest.RequestType requestType) {
for (WorkerSlot slot : freeSlots) {
boolean exchangeMatch = exchange.isEmpty() || slot.exchange.equals(exchange);
boolean typeMatch = slot.slotType == SlotType.ANY
|| (slot.slotType == SlotType.HISTORICAL
&& requestType == DataRequest.RequestType.HISTORICAL_OHLC)
|| (slot.slotType == SlotType.REALTIME
&& requestType == DataRequest.RequestType.REALTIME_TICKS);
if (exchangeMatch && typeMatch) {
freeSlots.remove(slot);
return slot;
}
}
return null;
@@ -468,17 +482,20 @@ public class IngestorBroker implements AutoCloseable {
stop();
}
// ── Inner types ──────────────────────────────────────────────────────────
// ── Inner types ──────────────────────────────────────────────────────────
private static class WorkerInfo {
byte[] identity;
/** One available work slot offered by an ingestor via WorkerReady. */
private static class WorkerSlot {
final byte[] identity;
final String identityKey;
Set<String> exchanges;
final String exchange;
final SlotType slotType;
WorkerInfo(byte[] identity, String identityKey, Set<String> exchanges) {
WorkerSlot(byte[] identity, String identityKey, String exchange, SlotType slotType) {
this.identity = identity;
this.identityKey = identityKey;
this.exchanges = exchanges;
this.exchange = exchange;
this.slotType = slotType;
}
}

View File

@@ -87,6 +87,11 @@ public class ZmqChannelManager implements Closeable {
socket.setLinger(1000);
socket.setSndHWM(10000);
socket.setRcvHWM(10000);
if (socketType == SocketType.ROUTER) {
// Return false (EHOSTUNREACH) instead of silently dropping messages to
// unknown/disconnected peer identities. Enables immediate stale-slot detection.
socket.setRouterMandatory(true);
}
socket.bind(endpoint);
sockets.put(channel.name(), socket);
LOG.info("Bound {} to {}", description, endpoint);

View File

@@ -595,28 +595,28 @@ export class WebSocketHandler {
case 'get_bars': {
if (!ohlcService) {
socket.send(JSON.stringify({
type: 'error',
type: 'get_bars_response',
request_id: requestId,
error_message: 'OHLC service not available'
error: 'OHLC service not available',
}));
break;
}
const history = await ohlcService.fetchOHLC(
payload.symbol,
payload.period_seconds,
payload.from_time,
payload.to_time,
payload.countback
);
logger.info({ requestId, barCount: history.bars?.length ?? 0, noData: history.noData, socketState: socket.readyState }, 'Sending get_bars_response');
socket.send(
jsonStringifySafe({
type: 'get_bars_response',
request_id: requestId,
history,
})
);
logger.info({ requestId }, 'get_bars_response sent');
try {
const history = await ohlcService.fetchOHLC(
payload.symbol,
payload.period_seconds,
payload.from_time,
payload.to_time,
payload.countback
);
logger.info({ requestId, barCount: history.bars?.length ?? 0, noData: history.noData, socketState: socket.readyState }, 'Sending get_bars_response');
socket.send(jsonStringifySafe({ type: 'get_bars_response', request_id: requestId, history }));
logger.info({ requestId }, 'get_bars_response sent');
} catch (err: any) {
const errorMessage = err?.message ?? String(err);
logger.error({ requestId, ticker: payload.symbol, errorMessage }, 'get_bars failed');
socket.send(JSON.stringify({ type: 'get_bars_response', request_id: requestId, error: errorMessage }));
}
break;
}

View File

@@ -1,6 +1,6 @@
import { Pool } from 'pg';
import type { UserLicense } from '../types/user.js';
import { UserLicenseSchema } from '../types/user.js';
import type { UserLicense, License, LicenseTier } from '../types/user.js';
import { UserLicenseSchema, LICENSE_TIER_TEMPLATES } from '../types/user.js';
import type { AuthService } from '../auth/auth-service.js';
export class UserService {
@@ -114,6 +114,54 @@ export class UserService {
return await this.authService.verifyToken(token);
}
/**
* Re-apply the current canonical template for every user's declared licenseType.
* Updates only the DB — does not touch deployments, so running pods are unaffected
* until their next natural restart.
*/
async migrateAllLicenses(): Promise<{ updated: number }> {
const client = await this.pool.connect();
try {
const rows = await client.query(
`SELECT user_id, license->>'licenseType' AS tier FROM user_licenses`
);
let updated = 0;
for (const row of rows.rows) {
const tier = row.tier as LicenseTier;
if (!LICENSE_TIER_TEMPLATES[tier]) continue;
await client.query(
`UPDATE user_licenses SET license = $1::jsonb, updated_at = NOW() WHERE user_id = $2`,
[JSON.stringify(LICENSE_TIER_TEMPLATES[tier]), row.user_id]
);
updated++;
}
return { updated };
} finally {
client.release();
}
}
/**
 * Set a user's license to a canonical tier template.
 * Overwrites the existing license with the current template for that tier.
 *
 * @param userId - user whose license row is inserted or replaced
 * @param tier - canonical tier whose template becomes the stored license
 * @returns the template that was written
 */
async setUserLicenseTier(userId: string, tier: LicenseTier): Promise<License> {
  const template = LICENSE_TIER_TEMPLATES[tier];
  const client = await this.pool.connect();
  try {
    // Upsert: a brand-new row starts with mcp_server_url = 'pending'; an
    // existing row keeps its URL and only the license payload is replaced.
    await client.query(
      `INSERT INTO user_licenses (user_id, license, mcp_server_url, updated_at)
       VALUES ($1, $2::jsonb, 'pending', NOW())
       ON CONFLICT (user_id) DO UPDATE
       SET license = EXCLUDED.license, updated_at = NOW()`,
      [userId, JSON.stringify(template)]
    );
  } finally {
    client.release();
  }
  return template;
}
/**
* Close database pool
*/

View File

@@ -16,6 +16,7 @@ import type { ResearchSubagent } from './subagents/research/index.js';
import type { IndicatorSubagent } from './subagents/indicator/index.js';
import type { WebExploreSubagent } from './subagents/web-explore/index.js';
import type { StrategySubagent } from './subagents/strategy/index.js';
import { BaseSubagent } from './subagents/base-subagent.js';
import type { DynamicStructuredTool } from '@langchain/core/tools';
import { getToolRegistry } from '../tools/tool-registry.js';
import type { MCPToolInfo } from '../tools/mcp/mcp-tool-wrapper.js';
@@ -237,12 +238,22 @@ export class AgentHarness {
try {
const { createResearchSubagent } = await import('./subagents/research/index.js');
// Create a model for the research subagent
// Path resolution: use the compiled output path
const researchSubagentPath = join(__dirname, 'subagents', 'research');
this.config.logger.debug({ researchSubagentPath }, 'Using research subagent path');
// Load the subagent config to get maxTokens — research scripts require more tokens
// than the provider default (4096) because python_write arguments include full code bodies
const researchSubagentConfig = await BaseSubagent.loadConfig(researchSubagentPath);
// Create a model for the research subagent — always use the complex model
// since research tasks involve data analysis, charting, and code generation
const { model } = await this.modelRouter.route(
'research analysis', // dummy query
'analyze and backtest research data', // triggers complex routing
this.config.license,
RoutingStrategy.COMPLEXITY,
this.config.userId
this.config.userId,
researchSubagentConfig.maxTokens // honour the subagent's maxTokens (e.g. 8192)
);
// Get tools for research subagent from registry
@@ -274,10 +285,6 @@ export class AgentHarness {
}));
}
// Path resolution: use the compiled output path
const researchSubagentPath = join(__dirname, 'subagents', 'research');
this.config.logger.debug({ researchSubagentPath }, 'Using research subagent path');
this.researchSubagent = await createResearchSubagent(
model,
this.config.logger,
@@ -535,10 +542,12 @@ export class AgentHarness {
const stream = await model.stream(messagesCopy, { signal });
for await (const chunk of stream) {
if (typeof chunk.content === 'string' && chunk.content.length > 0) {
this.config.logger.trace({ content: chunk.content }, 'raw chunk');
yield { type: 'chunk', content: chunk.content };
} else if (Array.isArray(chunk.content)) {
for (const block of chunk.content) {
if (block.type === 'text' && block.text) {
this.config.logger.trace({ content: block.text }, 'raw chunk');
yield { type: 'chunk', content: block.text };
}
}

View File

@@ -18,8 +18,11 @@ Dexorder trading platform provides OHLC data at a 1-minute resolution and suppor
Dexorder does not support:
* tick-by-tick trading or high-frequency strategies.
* long-running computations like parameter optimizations or training machine learning models.
* long-running computations like parameter optimizations or training machine learning models during live execution.
* portfolio optimization or trading strategies that require a large number of symbols.
* LLM calls inside strategy scripts — strategies must be deterministic and lightweight for backtesting to be reliable and repeatable. LLMs are slow, expensive, and introduce temperature-based non-determinism that breaks backtesting. (Walk-forward LLM integration via timer/data triggers is planned but not yet available.)
* TradFi data (equities, forex, bonds, options, etc.) — only crypto pricing data is available.
* Alternative data sources such as news feeds, Twitter/social sentiment, on-chain data, or economic calendars — these are not yet available.
Dexorder does support:
* backtesting strategies against historical data.
@@ -33,6 +36,27 @@ If the user asks for a capability not provided by Dexorder, decline and explain
# Important Instructions
## Switching Chart Symbol or Timeframe
**IMPORTANT: When the user asks to switch, change, or update the chart symbol or timeframe, you MUST call `workspace_patch` directly. Do NOT use web_explore, do NOT delegate to the indicator tool.**
Call `workspace_patch` with `store_name = "chartState"` and the appropriate JSON patch:
To switch symbol only:
```json
[{ "op": "replace", "path": "/symbol", "value": "SOL/USDT.BINANCE" }]
```
To switch symbol and period (period is seconds: 60=1m, 300=5m, 900=15m, 3600=1h, 86400=1D):
```json
[
{ "op": "replace", "path": "/symbol", "value": "SOL/USDT.BINANCE" },
{ "op": "replace", "path": "/period", "value": 900 }
]
```
You already know this format — do not search for it. After patching, confirm the change to the user.
## Investment Advice
**NEVER** recommend any specific ticker, trade, or position. You may suggest mechanical adjustments or improvements to strategies, but you must **NEVER** offer an opinion on a specific trade or position. You are **NOT** a registered investment advisor.

View File

@@ -1 +1 @@
This is your first chat with a new user. Welcome them to Dexorder and describe who are you and what can you do.
This is your first chat with a new user. Welcome them to Dexorder, and describe who you are and what you can do.

View File

@@ -83,6 +83,15 @@ self.config.initial_capital # starting capital in quote currency
| `sell_vol` | float | Sell-side volume (taker sells) |
| `open_interest` | float | Open interest (futures only; NaN for spot) |
### Available data — crypto only
Strategies have access **only** to crypto OHLC feeds with volume, buy/sell volume split, and open interest. The following are **not available** and must never be referenced in a strategy:
- **TradFi data** — equities, forex, bonds, futures spreads, options, macro indicators, interest rates, etc.
- **Alternative data** — news feeds, social sentiment (Twitter/Reddit), on-chain metrics, economic calendars, earnings, etc.
If a user requests a strategy that depends on unavailable data, explain the limitation and offer a crypto-native alternative (e.g. use order-flow imbalance instead of news sentiment).
---
## Section B — Strategy Metadata
@@ -355,3 +364,16 @@ deactivate_strategy(strategy_name) # Stop and get final PnL
- 4h bars: 100k bars ≈ 45 years → cap at 5 years (≈ 10,950 bars)
7. **Never `import` from `dexorder` inside `evaluate()`** — the strategy file is exec'd in a sandbox with PandasStrategy and pandas_ta pre-loaded. Standard library and pandas/numpy/pandas_ta are available.
8. **No LLM calls inside strategies** — strategies must be fully deterministic. LLM invocations are prohibited because:
- They are slow and expensive, making backtesting impractical.
- Any temperature > 0 produces non-repeatable outputs, breaking backtest reproducibility.
- The correct model is: the LLM *writes* the strategy; the strategy runs without LLM involvement.
- Walk-forward LLM integration (via timer or data triggers) is a planned feature but is **not yet implemented**. Do not attempt to approximate it now.
9. **`evaluate()` must be fast, lightweight, and deterministic** — it is called on every bar during backtesting across potentially hundreds of thousands of bars. Specifically:
- **No heavy computation at runtime**: model inference, large matrix operations, file I/O, network calls, or database queries are forbidden inside `evaluate()`.
- **ML is allowed with restrictions**: a model may be trained offline (e.g. in `__init__` using warm-up data), but inference in `evaluate()` must be fast (microseconds, not milliseconds). If training is compute-intensive, note this clearly in the strategy description.
- **No randomness**: do not use `random`, `np.random`, or any non-seeded stochastic operation. All outputs given the same data must be identical across runs.
10. **Data scope** — strategies may only use data available in the `dfs` feeds. Do not attempt to fetch external data, call APIs, read files, or access anything outside the provided DataFrames. Crypto OHLCV + buy/sell volume + open interest is what is available; nothing else.

View File

@@ -306,6 +306,25 @@ export class KubernetesClient {
}
}
/**
 * Delete only the Deployment, preserving PVC (user data) and Service (stable DNS).
 * Used when applying a license tier change — next ensureContainerRunning recreates
 * the deployment with updated resource limits.
 *
 * Idempotent: a 404 from the API server is treated as success.
 */
async deleteDeploymentOnly(userId: string): Promise<void> {
  const deploymentName = KubernetesClient.getDeploymentName(userId);
  try {
    await this.appsApi.deleteNamespacedDeployment({
      name: deploymentName,
      namespace: this.config.namespace
    });
    this.config.logger.info({ deploymentName }, 'Deleted deployment (tier change)');
  } catch (error: any) {
    // The k8s client surfaces the status code in different places depending
    // on version/error shape — check all three.
    const notFound =
      error.code === 404 ||
      error.response?.statusCode === 404 ||
      error.statusCode === 404;
    // Already gone is the desired end state; anything else is a real failure.
    if (!notFound) throw error;
  }
}
/**
* Delete deployment and associated resources
* (Used for cleanup/testing - normally handled by lifecycle sidecar)

View File

@@ -1,9 +1,11 @@
import type { FastifyBaseLogger } from 'fastify';
import { KubernetesClient, type DeploymentSpec } from './client.js';
import type { License } from '../types/user.js';
import type { License, LicenseTier } from '../types/user.js';
import type { UserService } from '../db/user-service.js';
export interface ContainerManagerConfig {
k8sClient: KubernetesClient;
userService: UserService;
sandboxImage: string;
sidecarImage: string;
storageClass: string;
@@ -139,6 +141,17 @@ export class ContainerManager {
return { exists: true, ready, mcpEndpoint };
}
/**
 * Apply a canonical license tier to a user: updates DB and deletes the deployment
 * so it is recreated with the new resource limits on next connect.
 *
 * @param userId - user to retier
 * @param tier - canonical tier to apply
 * @returns the license template now stored for the user
 */
async applyLicenseTier(userId: string, tier: LicenseTier): Promise<License> {
  // Persist first: if the deployment delete fails, the DB already holds the
  // new tier and a retry only needs to remove the deployment.
  const updatedLicense = await this.config.userService.setUserLicenseTier(userId, tier);
  await this.config.k8sClient.deleteDeploymentOnly(userId);
  this.config.logger.info({ userId, tier }, 'License tier applied; deployment will recreate on next connect');
  return updatedLicense;
}
/**
* Delete container (for cleanup/testing)
*/

View File

@@ -42,7 +42,8 @@ export class ModelRouter {
message: string,
license: License,
strategy: RoutingStrategy = RoutingStrategy.USER_PREFERENCE,
userId?: string
userId?: string,
maxTokens?: number
): Promise<{ model: BaseChatModel; middleware: ModelMiddleware }> {
let modelConfig: ModelConfig;
@@ -67,12 +68,17 @@ export class ModelRouter {
modelConfig = this.defaultModel;
}
if (maxTokens !== undefined) {
modelConfig = { ...modelConfig, maxTokens };
}
this.logger.info(
{
userId,
strategy,
provider: modelConfig.provider,
model: modelConfig.model,
maxTokens: modelConfig.maxTokens,
},
'Routing to model'
);

View File

@@ -22,6 +22,7 @@ import { AgentHarness, type HarnessSessionConfig } from './harness/agent-harness
import { OHLCService } from './services/ohlc-service.js';
import { SymbolIndexService } from './services/symbol-index-service.js';
import { SymbolRoutes } from './routes/symbol-routes.js';
import { AdminRoutes } from './routes/admin-routes.js';
// Catch unhandled promise rejections for better debugging
process.on('unhandledRejection', (reason: any, promise) => {
@@ -309,6 +310,7 @@ const k8sClient = new KubernetesClient({
const containerManager = new ContainerManager({
k8sClient,
userService,
sandboxImage: config.kubernetes.sandboxImage,
sidecarImage: config.kubernetes.sidecarImage,
storageClass: config.kubernetes.storageClass,
@@ -439,6 +441,9 @@ const getSymbolService = () => symbolIndexService;
const symbolRoutes = new SymbolRoutes({ getSymbolIndexService: getSymbolService });
symbolRoutes.register(app);
// Register admin routes
new AdminRoutes(containerManager, userService).register(app);
app.log.debug('All routes registered');
// Health check
@@ -715,7 +720,6 @@ try {
icebergClient,
logger: app.log,
});
await indexService.initialize();
// Assign to module-level variable so onMetadataUpdate callback can use it
symbolIndexService = indexService;
@@ -723,7 +727,17 @@ try {
// Update websocket handler's config so it can use the service
(websocketHandler as any).config.symbolIndexService = indexService;
app.log.info({ stats: symbolIndexService.getStats() }, 'Symbol index service initialized');
// Retry until we get at least some symbol metadata
while (true) {
await indexService.initialize();
const stats = indexService.getStats();
if (stats.symbolCount > 0) {
app.log.info({ stats }, 'Symbol index service initialized');
break;
}
app.log.warn('Symbol index has no metadata yet, retrying in 5 seconds...');
await new Promise(resolve => setTimeout(resolve, 5000));
}
} catch (error) {
app.log.warn({ error }, 'Failed to initialize symbol index service - symbol search will not be available');
}

View File

@@ -0,0 +1,35 @@
import type { FastifyInstance } from 'fastify';
import type { ContainerManager } from '../k8s/container-manager.js';
import type { UserService } from '../db/user-service.js';
import type { LicenseTier } from '../types/user.js';
const VALID_TIERS: LicenseTier[] = ['free', 'pro', 'enterprise'];

/**
 * Administrative HTTP routes for license management.
 *
 * SECURITY NOTE(review): these endpoints mutate user licenses and delete user
 * deployments. When the ADMIN_API_TOKEN env var is set they require a matching
 * `Authorization: Bearer <token>` header; when it is unset they remain open
 * (backward compatible), which is only acceptable if the routes are shielded
 * at the network layer — confirm before exposing publicly.
 */
export class AdminRoutes {
  private containerManager: ContainerManager;
  private userService: UserService;

  constructor(containerManager: ContainerManager, userService: UserService) {
    this.containerManager = containerManager;
    this.userService = userService;
  }

  /** Register the admin endpoints on the given Fastify instance. */
  register(app: FastifyInstance): void {
    // Opt-in bearer-token guard: only enforced when ADMIN_API_TOKEN is set,
    // so existing deployments without the env var behave exactly as before.
    const adminToken = process.env.ADMIN_API_TOKEN;
    const guard = async (req: any, reply: any) => {
      if (!adminToken) return;
      const auth = req.headers?.authorization ?? '';
      if (auth !== `Bearer ${adminToken}`) {
        reply.code(401).send({ error: 'Unauthorized' });
      }
    };

    app.post<{ Params: { userId: string }; Body: { tier: string } }>(
      '/admin/users/:userId/set-tier',
      { preHandler: guard },
      async (req, reply) => {
        const { userId } = req.params;
        // Guard against a missing/malformed body — without this, destructuring
        // undefined would throw and surface as an opaque 500.
        const tier = req.body?.tier;
        if (!VALID_TIERS.includes(tier as LicenseTier)) {
          return reply.code(400).send({ error: `Invalid tier. Must be one of: ${VALID_TIERS.join(', ')}` });
        }
        const license = await this.containerManager.applyLicenseTier(userId, tier as LicenseTier);
        return { userId, tier, license };
      }
    );

    app.post('/admin/migrate-licenses', { preHandler: guard }, async () => {
      return await this.userService.migrateAllLicenses();
    });
  }
}

View File

@@ -167,11 +167,7 @@ export class OHLCService {
period_seconds,
}, 'Failed to fetch historical data');
// Return empty result on error
return {
bars: [],
noData: true,
};
throw error;
}
}

View File

@@ -0,0 +1,87 @@
/**
 * Direct DeepInfra streaming test — bypasses LangChain entirely.
 * Logs each delta.content with JSON.stringify so spaces are unambiguous.
 *
 * Usage:
 *   DEEPINFRA_API_KEY=$(op read "op://Private/DeepInfra/credential") npx tsx src/test-deepinfra-chunks.ts
 */
export {};

const DEEP_INFRA_URL = 'https://api.deepinfra.com/v1/openai/chat/completions';
const MODEL = 'zai-org/GLM-5';

const apiKey = process.env.DEEPINFRA_API_KEY;
if (!apiKey) {
  console.error('DEEPINFRA_API_KEY is not set');
  process.exit(1);
}

const res = await fetch(DEEP_INFRA_URL, {
  method: 'POST',
  headers: {
    Authorization: `Bearer ${apiKey}`,
    'Content-Type': 'application/json',
  },
  body: JSON.stringify({
    model: MODEL,
    stream: true,
    messages: [
      { role: 'user', content: 'Write two sentences about ETH price analysis.' },
    ],
  }),
});

if (!res.ok || !res.body) {
  console.error(`HTTP ${res.status}: ${await res.text()}`);
  process.exit(1);
}

const reader = res.body.getReader();
const decoder = new TextDecoder();
let chunkIndex = 0;
let assembled = '';
// BUG FIX: SSE events can be split across read() boundaries. Previously each
// read was split on '\n' independently, so a JSON payload straddling a
// boundary was parsed in two halves and silently dropped by the catch below.
// Buffer the trailing partial line between reads instead.
let pending = '';

console.log(`Testing model: ${MODEL}`);
console.log('--- chunks ---');

streamLoop:
while (true) {
  const { value, done } = await reader.read();
  if (done) break;
  pending += decoder.decode(value, { stream: true });
  const lines = pending.split('\n');
  // Last element is '' when the buffer ends on '\n', otherwise a partial
  // line — either way, carry it to the next read.
  pending = lines.pop() ?? '';
  for (const line of lines) {
    const trimmed = line.trim();
    if (!trimmed.startsWith('data:')) continue;
    const data = trimmed.slice(5).trimStart();
    // BUG FIX: a plain `break` only exited the inner for-loop; the labeled
    // break stops reading the stream once the server signals completion.
    if (data === '[DONE]') break streamLoop;
    let parsed: unknown;
    try {
      parsed = JSON.parse(data);
    } catch {
      continue;
    }
    const choice = (parsed as { choices?: Array<{ delta?: Record<string, unknown> }> })
      ?.choices?.[0];
    const delta = choice?.delta;
    const content = delta?.content as string | undefined;
    if (content !== undefined) {
      const endsSpace = content.endsWith(' ');
      const startsSpace = content.startsWith(' ');
      // Log full delta so we can see all available fields (logprobs, token_ids, etc.)
      console.log(
        `chunk[${chunkIndex++}]: ${JSON.stringify(content)} ` +
          `(len=${content.length}, startsSpace=${startsSpace}, endsSpace=${endsSpace}) ` +
          `delta=${JSON.stringify(delta)}`,
      );
      assembled += content;
    }
  }
}

console.log('--- assembled ---');
console.log(assembled);

View File

@@ -42,7 +42,8 @@ Use this tool for:
- Recommending indicators for a given strategy or analysis goal
ALWAYS use this tool for any request about the chart's indicators.
NEVER modify the indicators workspace store directly.`,
NEVER modify the indicators workspace store directly.
NEVER use this tool to switch the chart symbol or timeframe — that is done via workspace_patch on chartState.`,
schema: z.object({
instruction: z.string().describe(
'The indicator task to perform. Be specific about which indicators, parameters, ' +

View File

@@ -30,13 +30,18 @@ export function createWebExploreAgentTool(config: WebExploreAgentToolConfig): Dy
const tool = new DynamicStructuredTool({
name: 'web_explore',
description: `Search the web or academic databases and return a summarized answer.
description: `Search the EXTERNAL web or academic databases and return a summarized answer.
Use this tool when the user asks about:
Use this tool ONLY for external, public information:
- Current events, news, or real-time information
- Documentation, tutorials, or how-to guides
- External documentation, tutorials, or how-to guides for third-party libraries/tools
- Academic papers, research findings, or scientific topics
- Any topic that benefits from external sources
- Any topic requiring external sources
NEVER use this tool for:
- Questions about the Dexorder platform itself (workspace tools, chartState, indicators, strategies)
- Internal API usage (workspace_patch, workspace_read, etc.) — consult the system prompt instead
- Anything that can be answered from the context already available
The subagent will search the web (or arXiv for academic queries), fetch relevant content, and return a markdown summary with cited sources.`,
schema: z.object({

View File

@@ -76,7 +76,7 @@ export const LICENSE_TIER_TEMPLATES: Record<LicenseTier, License> = {
maxTokensPerMessage: 4096, rateLimitPerMinute: 10,
},
k8sResources: {
memoryRequest: '256Mi', memoryLimit: '512Mi',
memoryRequest: '256Mi', memoryLimit: '8Gi',
cpuRequest: '100m', cpuLimit: '500m',
storage: '1Gi', tmpSizeLimit: '128Mi',
enableIdleShutdown: true, idleTimeoutMinutes: 15,
@@ -93,7 +93,7 @@ export const LICENSE_TIER_TEMPLATES: Record<LicenseTier, License> = {
maxTokensPerMessage: 8192, rateLimitPerMinute: 60,
},
k8sResources: {
memoryRequest: '512Mi', memoryLimit: '2Gi',
memoryRequest: '512Mi', memoryLimit: '8Gi',
cpuRequest: '250m', cpuLimit: '2000m',
storage: '10Gi', tmpSizeLimit: '256Mi',
enableIdleShutdown: false, idleTimeoutMinutes: 0,
@@ -110,7 +110,7 @@ export const LICENSE_TIER_TEMPLATES: Record<LicenseTier, License> = {
maxTokensPerMessage: 32768, rateLimitPerMinute: 300,
},
k8sResources: {
memoryRequest: '1Gi', memoryLimit: '4Gi',
memoryRequest: '1Gi', memoryLimit: '8Gi',
cpuRequest: '500m', cpuLimit: '4000m',
storage: '50Gi', tmpSizeLimit: '512Mi',
enableIdleShutdown: false, idleTimeoutMinutes: 0,

View File

@@ -79,12 +79,12 @@ export interface StoreConfig {
export const DEFAULT_STORES: StoreConfig[] = [
{
name: 'chartState',
persistent: false,
persistent: true,
initialState: () => ({
symbol: 'BTC/USDT.BINANCE',
start_time: null,
end_time: null,
period: '15',
period: 900,
selected_shapes: [],
}),
},

View File

@@ -1,6 +1,37 @@
// CCXT data fetcher for historical OHLC and realtime ticks
import ccxt from 'ccxt';
/**
* Thrown when an exchange returns a 429 rate-limit response.
* retryAfterMs is derived from the exchange's Retry-After header when available.
*/
export class ExchangeRateLimitError extends Error {
constructor(exchange, retryAfterMs, originalMessage) {
super(`Rate limit on ${exchange}: retry after ${retryAfterMs}ms (${originalMessage})`);
this.name = 'ExchangeRateLimitError';
this.exchange = exchange.toUpperCase();
this.retryAfterMs = retryAfterMs;
}
}
/**
* Extract retry-after duration in milliseconds from a CCXT RateLimitExceeded error.
* Priority: Retry-After header → error message numeric → 30s fallback.
*/
function extractRetryAfterMs(exchange, error) {
const header = exchange.last_response_headers?.['retry-after'];
if (header) {
const secs = parseFloat(header);
if (!isNaN(secs)) return Math.ceil(secs * 1000);
}
// Some exchanges embed the delay in the message (e.g. "retry after 5000 ms")
const msMatch = error.message?.match(/(\d+)\s*ms/i);
if (msMatch) return parseInt(msMatch[1], 10);
const secMatch = error.message?.match(/(\d+(?:\.\d+)?)\s*s(?:ec|econds?)?/i);
if (secMatch) return Math.ceil(parseFloat(secMatch[1]) * 1000);
return 30_000;
}
export class CCXTFetcher {
constructor(config, logger, metadataGenerator = null) {
this.config = config;
@@ -135,9 +166,12 @@ export class CCXTFetcher {
break;
} catch (error) {
lastError = error;
const isRetryable = error.constructor?.name === 'NetworkError' ||
const isRateLimit = error.constructor?.name === 'RateLimitExceeded';
const isRetryable = !isRateLimit && (
error.constructor?.name === 'NetworkError' ||
error.constructor?.name === 'RequestTimeout' ||
error.constructor?.name === 'ExchangeNotAvailable';
error.constructor?.name === 'ExchangeNotAvailable'
);
this.logger.warn(
{
errorType: error.constructor?.name,
@@ -146,15 +180,21 @@ export class CCXTFetcher {
ticker,
since,
attempt,
retryable: isRetryable
retryable: isRetryable,
rateLimit: isRateLimit
},
'OHLC fetch attempt failed'
);
if (!isRetryable || attempt === FETCH_RETRIES) break;
if (isRateLimit || !isRetryable || attempt === FETCH_RETRIES) break;
await exchange.sleep(FETCH_RETRY_DELAY_MS * attempt);
}
}
if (lastError) {
if (lastError.constructor?.name === 'RateLimitExceeded') {
const retryAfterMs = extractRetryAfterMs(exchange, lastError);
this.logger.warn({ ticker, retryAfterMs }, 'OHLC fetch rate-limited by exchange');
throw new ExchangeRateLimitError(exchangeName, retryAfterMs, lastError.message);
}
this.logger.error(
{
errorType: lastError.constructor?.name,
@@ -278,6 +318,11 @@ export class CCXTFetcher {
// Convert to our Tick format
return trades.map(trade => this.convertToTick(trade, ticker, metadata));
} catch (error) {
if (error.constructor?.name === 'RateLimitExceeded') {
const retryAfterMs = extractRetryAfterMs(exchange, error);
this.logger.warn({ ticker, retryAfterMs }, 'Trades fetch rate-limited by exchange');
throw new ExchangeRateLimitError(exchangeName, retryAfterMs, error.message);
}
this.logger.error(
{ error: error.message, ticker },
'Error fetching trades'

View File

@@ -6,9 +6,10 @@ import { parse as parseYaml } from 'yaml';
import pino from 'pino';
import { ZmqClient } from './zmq-client.js';
import { KafkaProducer } from './kafka-producer.js';
import { CCXTFetcher } from './ccxt-fetcher.js';
import { CCXTFetcher, ExchangeRateLimitError } from './ccxt-fetcher.js';
import { RealtimePoller } from './realtime-poller.js';
import { SymbolMetadataGenerator } from './symbol-metadata-generator.js';
import { SlotType } from './proto/messages.js';
// Logger setup
const logger = pino({
@@ -64,10 +65,162 @@ function loadConfig() {
supported_exchanges: config.supported_exchanges || ['binance', 'coinbase', 'kraken'],
symbol_metadata_interval_ms: config.symbol_metadata_interval_ms || 6 * 60 * 60 * 1000,
// Per-exchange slot capacity
exchange_capacity: config.exchange_capacity || {
BINANCE: { historical_slots: 3, realtime_slots: 5 },
KRAKEN: { historical_slots: 2, realtime_slots: 3 },
COINBASE: { historical_slots: 2, realtime_slots: 4 }
},
...secrets
};
}
/**
 * Manages work slots per exchange per job type.
 *
 * Each slot corresponds to one WorkerReady message sent to Flink. Flink consumes
 * a slot when it dispatches a job. The slot is re-offered (via another WorkerReady)
 * once the job completes, subject to any rate-limit backoff dictated by the exchange.
 */
class SlotPool {
  /**
   * @param {Object} exchangeCapacity - exchange name → { historical_slots, realtime_slots }
   * @param {Object} zmqClient - transport used to send WorkerReady offers to Flink
   * @param {Object} logger - pino-style logger
   */
  constructor(exchangeCapacity, zmqClient, logger) {
    this.zmqClient = zmqClient;
    this.logger = logger;
    // Key: 'EXCHANGE|TYPE' (e.g. 'BINANCE|HISTORICAL')
    // Value: { max, active: Set<jobId>, backoffUntil: ms timestamp }
    this.slots = new Map();
    for (const [exchange, cap] of Object.entries(exchangeCapacity)) {
      const ex = exchange.toUpperCase();
      this.slots.set(`${ex}|HISTORICAL`, {
        max: cap.historical_slots ?? 2,
        active: new Set(),
        backoffUntil: 0
      });
      this.slots.set(`${ex}|REALTIME`, {
        max: cap.realtime_slots ?? 3,
        active: new Set(),
        backoffUntil: 0
      });
    }
    // jobId → { exchange, type } for release tracking
    this.jobMap = new Map();
  }
  /**
   * Register the onConnected callback so slot offers are sent on every
   * TCP (re)connect rather than once at startup. Handles both the initial
   * connection race (Flink ROUTER not yet ready) and Flink restarts.
   */
  init() {
    this.zmqClient.onConnected = () => this._offerAllFreeSlots();
    this.logger.info(
      { slots: [...this.slots.entries()].map(([k, v]) => `${k}:${v.max}`) },
      'Slot pool initialized — will offer slots on connect'
    );
  }
  /**
   * Re-offer all currently-free slots. Called on every TCP (re)connect.
   * Sends (max - active) WorkerReady messages per exchange+type key.
   *
   * NOTE(review): backoffUntil is not consulted here — presumably a reconnect
   * means Flink lost its offer state and needs fresh offers regardless; confirm
   * that re-offering during a rate-limit backoff window is acceptable.
   */
  async _offerAllFreeSlots() {
    const summary = [];
    for (const [key, slot] of this.slots) {
      const [exchange, type] = key.split('|');
      const freeCount = slot.max - slot.active.size;
      for (let i = 0; i < freeCount; i++) {
        await this.zmqClient.sendTypedReady(exchange, SlotType[type]);
      }
      summary.push(`${key}:${freeCount}/${slot.max}`);
    }
    this.logger.info({ offered: summary }, 'Re-offered all free slots on connect');
  }
  /**
   * Record a slot as occupied by jobId.
   * @param {string} jobId
   * @param {string} exchange - e.g. 'BINANCE'
   * @param {string} type - 'HISTORICAL' | 'REALTIME'
   * @returns {boolean} true if a slot was consumed; false when at capacity or
   *   when no slot config exists for this exchange+type (caller rejects the job)
   */
  consumeSlot(jobId, exchange, type) {
    const key = `${exchange.toUpperCase()}|${type}`;
    const slot = this.slots.get(key);
    if (slot) {
      if (slot.active.size >= slot.max) {
        this.logger.warn({ jobId, key, active: slot.active.size, max: slot.max }, 'Slot capacity exceeded — rejecting job');
        return false;
      }
      slot.active.add(jobId);
      this.jobMap.set(jobId, { exchange: exchange.toUpperCase(), type });
      this.logger.debug({ jobId, key, active: slot.active.size, max: slot.max }, 'Slot consumed');
      return true;
    }
    this.logger.warn({ jobId, key }, 'No slot config for this exchange+type');
    return false;
  }
  /**
   * Release the slot occupied by jobId and re-offer it to Flink (after any backoff).
   * Safe to call with an unknown jobId (logs a warning and returns).
   */
  async releaseSlot(jobId) {
    const info = this.jobMap.get(jobId);
    if (!info) {
      this.logger.warn({ jobId }, 'releaseSlot called for unknown jobId');
      return;
    }
    this.jobMap.delete(jobId);
    const key = `${info.exchange}|${info.type}`;
    const slot = this.slots.get(key);
    if (slot) {
      slot.active.delete(jobId);
      await this._offerSlot(info.exchange, info.type, slot);
    }
  }
  /**
   * Record a rate limit from the exchange. Delays slot re-offer by retryAfterMs.
   * Math.max ensures a shorter new limit never shrinks an existing backoff window.
   * @param {string} exchange
   * @param {string} type - 'HISTORICAL' | 'REALTIME'
   * @param {number} retryAfterMs
   */
  reportRateLimit(exchange, type, retryAfterMs) {
    const key = `${exchange.toUpperCase()}|${type}`;
    const slot = this.slots.get(key);
    if (slot) {
      slot.backoffUntil = Math.max(slot.backoffUntil, Date.now() + retryAfterMs);
      this.logger.warn({ exchange, type, retryAfterMs }, 'Rate limit backoff set for slot');
    }
  }
  /**
   * Offer one freed slot to Flink, deferring with setTimeout while this
   * exchange+type is still inside its rate-limit backoff window.
   */
  async _offerSlot(exchange, type, slot) {
    const now = Date.now();
    if (now < slot.backoffUntil) {
      const delay = slot.backoffUntil - now;
      this.logger.info({ exchange, type, delayMs: delay }, 'Slot in backoff — scheduling re-offer');
      // Re-enters this method when the timer fires, so a backoff window that
      // was extended in the meantime is re-checked rather than ignored.
      setTimeout(() => this._offerSlot(exchange, type, slot), delay);
      return;
    }
    try {
      await this.zmqClient.sendTypedReady(exchange, SlotType[type]);
      this.logger.debug({ exchange, type }, 'Slot re-offered to Flink');
    } catch (err) {
      this.logger.error({ exchange, type, error: err.message }, 'Failed to re-offer slot');
    }
  }
  // NOTE(review): pending _offerSlot timers are not cancelled here — presumably
  // process exit makes that moot; confirm if SlotPool is ever recycled in-process.
  shutdown() {}
}
/** Extract exchange name from ticker string, e.g. "BTC/USDT.BINANCE" → "BINANCE" */
function exchangeOf(ticker) {
  const dotIndex = ticker?.lastIndexOf('.');
  // No ticker, or no '.' separator → exchange cannot be determined.
  if (dotIndex === undefined || dotIndex < 0) return 'UNKNOWN';
  return ticker.slice(dotIndex + 1).toUpperCase();
}
class IngestorWorker {
constructor(config, logger) {
this.config = config;
@@ -92,7 +245,22 @@ class IngestorWorker {
logger.child({ component: 'poller' })
);
// jobId → active realtime subscription (for stop handling)
this.pool = new SlotPool(
config.exchange_capacity,
this.zmqClient,
logger.child({ component: 'pool' })
);
// When realtime poller terminates a subscription due to repeated errors, release its slot.
this.realtimePoller.onJobComplete = (jobId, error) => {
if (error instanceof ExchangeRateLimitError) {
this.pool.reportRateLimit(error.exchange, 'REALTIME', error.retryAfterMs);
}
this.pool.releaseSlot(jobId).catch(err =>
this.logger.error({ jobId, error: err.message }, 'Failed to release slot after realtime error'));
};
// jobId set for active realtime subscriptions
this.activeRealtime = new Set();
this.isShutdown = false;
@@ -108,7 +276,10 @@ class IngestorWorker {
this.zmqClient.onWorkAssign = req => this.handleWorkAssign(req);
this.zmqClient.onWorkStop = jobId => this.handleWorkStop(jobId);
await this.zmqClient.connect(); // also sends WorkerReady
// Register slot offer callback before connecting so we don't miss the event
this.pool.init();
await this.zmqClient.connect();
// Generate symbol metadata on startup
this.logger.info('Generating initial symbol metadata');
@@ -139,18 +310,26 @@ class IngestorWorker {
*/
handleWorkAssign(request) {
const { jobId, requestId, type, ticker } = request;
const exchange = exchangeOf(ticker);
this.logger.info({ jobId, requestId, type, ticker }, 'Received WorkAssign');
this.logger.info({ jobId, requestId, type, ticker, exchange }, 'Received WorkAssign');
// HISTORICAL_OHLC = 0 (proto3 default, may appear as undefined or 'HISTORICAL_OHLC')
const isHistorical = !type || type === 'HISTORICAL_OHLC' || type === 0;
const isRealtime = type === 'REALTIME_TICKS' || type === 1;
if (isHistorical) {
if (!this.pool.consumeSlot(jobId, exchange, 'HISTORICAL')) {
this.zmqClient.sendReject(jobId, 'Slot capacity exceeded').catch(() => {});
return;
}
this.handleHistoricalRequest(request).catch(err => {
this.logger.error({ jobId, requestId, error: err.message }, 'Unexpected error in historical handler');
});
} else if (isRealtime) {
if (!this.pool.consumeSlot(jobId, exchange, 'REALTIME')) {
this.zmqClient.sendReject(jobId, 'Slot capacity exceeded').catch(() => {});
return;
}
this.handleRealtimeRequest(request);
} else {
this.logger.warn({ jobId, type }, 'Unknown request type — rejecting');
@@ -165,7 +344,9 @@ class IngestorWorker {
this.logger.info({ jobId }, 'Received WorkStop — cancelling realtime subscription');
this.realtimePoller.cancelSubscription(jobId);
this.activeRealtime.delete(jobId);
// No WorkComplete needed — Flink sent the stop, it already knows
this.pool.releaseSlot(jobId).catch(err =>
this.logger.warn({ jobId, error: err.message }, 'Failed to release slot after WorkStop'));
// No WorkComplete needed — Flink sent the stop, it already knows.
}
/**
@@ -174,10 +355,14 @@ class IngestorWorker {
*/
async handleHistoricalRequest(request) {
const { jobId, requestId, ticker, historical, clientId: client_id } = request;
const exchange = exchangeOf(ticker);
const { startTime: start_time, endTime: end_time, periodSeconds: period_seconds, limit } = historical || {};
this.logger.info({ jobId, requestId, ticker, period_seconds }, 'Processing historical OHLC request');
// Immediately ack to reset Flink's dispatch-time timeout clock.
await this.zmqClient.sendHeartbeat(jobId);
try {
const candles = await this.ccxtFetcher.fetchHistoricalOHLC(
ticker, start_time, end_time, period_seconds, limit
@@ -193,7 +378,10 @@ class IngestorWorker {
const isLastPage = (i + PAGE_SIZE) >= candles.length;
await this.kafkaProducer.writeOHLCs(this.config.kafka_ohlc_topic, page, metadata, isLastPage);
}
this.logger.info({ jobId, requestId, ticker, count: candles.length, pages: Math.ceil(candles.length / PAGE_SIZE) }, 'Wrote all pages to Kafka');
this.logger.info(
{ jobId, requestId, ticker, count: candles.length, pages: Math.ceil(candles.length / PAGE_SIZE) },
'Wrote all pages to Kafka'
);
} else {
await this.kafkaProducer.writeMarker(this.config.kafka_ohlc_topic, {
request_id: requestId, client_id, ticker, period_seconds, start_time, end_time,
@@ -207,6 +395,10 @@ class IngestorWorker {
} catch (error) {
this.logger.error({ jobId, requestId, ticker, error: error.message }, 'Historical request failed');
if (error instanceof ExchangeRateLimitError) {
this.pool.reportRateLimit(exchange, 'HISTORICAL', error.retryAfterMs);
}
try {
await this.kafkaProducer.writeMarker(this.config.kafka_ohlc_topic, {
request_id: requestId, client_id, ticker, period_seconds, start_time, end_time,
@@ -218,11 +410,14 @@ class IngestorWorker {
await this.zmqClient.sendComplete(jobId, false, error.message);
}
// Release slot regardless of success or failure
this.pool.releaseSlot(jobId).catch(err =>
this.logger.error({ jobId, error: err.message }, 'Failed to release historical slot'));
}
/**
* Start realtime tick polling for a job dispatched by Flink.
* Ticks flow: exchange → Kafka market-tick → Flink → OHLC bars → clients.
*/
handleRealtimeRequest(request) {
const { jobId, requestId, ticker } = request;
@@ -247,6 +442,7 @@ class IngestorWorker {
if (this.metadataInterval) clearInterval(this.metadataInterval);
this.pool.shutdown();
this.realtimePoller.shutdown();
await this.ccxtFetcher.close();
await this.metadataGenerator.close();

View File

@@ -18,6 +18,10 @@ export class RealtimePoller {
this.pollingLoop = null;
this.heartbeatLoop = null;
// Called with (jobId, error) when a subscription terminates abnormally.
// Set by IngestorWorker to release the slot in SlotPool.
this.onJobComplete = null;
}
/**
@@ -147,6 +151,7 @@ export class RealtimePoller {
} catch (zmqErr) {
this.logger.error({ jobId, error: zmqErr.message }, 'Failed to send WorkComplete after error');
}
if (this.onJobComplete) this.onJobComplete(jobId, error);
}
}
}

View File

@@ -28,63 +28,61 @@ export class ZmqClient {
this.dealerSocket = null;
this.isShutdown = false;
this.activeJobId = null;
this._idleHeartbeatInterval = null;
this.supportedExchanges = (config.supported_exchanges || ['BINANCE', 'COINBASE'])
.map(e => e.toUpperCase());
// Callbacks set by IngestorWorker
this.onWorkAssign = null; // (DataRequest) => void
this.onWorkStop = null; // (jobId) => void
// Callbacks set by IngestorWorker / SlotPool
this.onWorkAssign = null; // (DataRequest) => void
this.onWorkStop = null; // (jobId) => void
this.onConnected = null; // async () => void — fires on initial connect AND reconnect
}
/**
* Connect DEALER socket to Flink IngestorBroker (ROUTER).
* Sends WorkerReady immediately so Flink knows this worker is available.
* Fires onConnected on every TCP (re)connect so SlotPool can re-offer slots.
*/
async connect() {
const { flink_hostname, ingestor_broker_port = 5567 } = this.config;
this.dealerSocket = new zmq.Dealer();
const endpoint = `tcp://${flink_hostname}:${ingestor_broker_port}`;
await this.dealerSocket.connect(endpoint);
this.logger.info(`Connected DEALER to Flink IngestorBroker at ${endpoint}`);
// Register as available
await this.sendReady();
// Periodically re-send WorkerReady when idle, to recover from missed initial registration
this._idleHeartbeatInterval = setInterval(() => {
if (this.activeJobId === null && !this.isShutdown) {
this.sendReady().catch(err =>
this.logger.warn({ error: err.message }, 'Failed to re-send WorkerReady'));
// Subscribe to connection events BEFORE calling connect() so we catch the
// initial establishment. The 'connect' event fires on initial TCP handshake
// and again after every ZMQ reconnect (e.g. Flink restart).
this.dealerSocket.events.on('connect', ({ address }) => {
this.logger.info({ address }, 'DEALER connected to broker');
if (this.onConnected) {
this.onConnected().catch(err =>
this.logger.error({ error: err.message }, 'onConnected callback failed'));
}
}, 30_000);
});
const endpoint = `tcp://${flink_hostname}:${ingestor_broker_port}`;
this.dealerSocket.connect(endpoint);
this.logger.info(`Connecting DEALER to Flink IngestorBroker at ${endpoint}`);
// Start receiving work in background
this._receiveLoop();
}
/**
* Send WorkerReady — called on connect and after each COMPLETE.
* Send one typed WorkerReady slot offer.
* @param {string} exchange - Exchange name (e.g. 'BINANCE')
* @param {number} slotType - SlotType enum value (0=ANY, 1=HISTORICAL, 2=REALTIME)
*/
async sendReady() {
async sendTypedReady(exchange, slotType) {
const frames = encodeBrokerMessage(
MessageTypeId.WORKER_READY,
{ exchanges: this.supportedExchanges },
{ exchanges: [exchange], jobType: slotType },
WorkerReady
);
await this.dealerSocket.send(frames);
this.logger.info({ exchanges: this.supportedExchanges }, 'Sent WorkerReady');
this.logger.debug({ exchange, slotType }, 'Sent WorkerReady slot offer');
}
/**
* Send WorkComplete after a historical job finishes.
* Automatically sends WorkerReady so Flink returns us to the free pool.
* Slot re-registration is handled by SlotPool after this call.
*/
async sendComplete(jobId, success, errorMessage) {
this.activeJobId = null;
const frames = encodeBrokerMessage(
MessageTypeId.WORK_COMPLETE,
{
@@ -96,9 +94,6 @@ export class ZmqClient {
);
await this.dealerSocket.send(frames);
this.logger.info({ jobId, success }, 'Sent WorkComplete');
// Return to free pool
await this.sendReady();
}
/**
@@ -153,12 +148,10 @@ export class ZmqClient {
const payload = frames[2].slice(1);
if (typeId === MessageTypeId.WORK_ASSIGN) {
// DataRequest protobuf
const request = DataRequest.decode(payload);
const req = DataRequest.toObject(request, {
longs: String, enums: String, bytes: Buffer
});
this.activeJobId = req.jobId;
this.logger.info(
{ jobId: req.jobId, requestId: req.requestId, type: req.type, ticker: req.ticker },
'Received WorkAssign from broker'
@@ -192,10 +185,6 @@ export class ZmqClient {
async shutdown() {
this.isShutdown = true;
if (this._idleHeartbeatInterval) {
clearInterval(this._idleHeartbeatInterval);
this._idleHeartbeatInterval = null;
}
this.logger.info('Shutting down ZMQ DEALER connection');
if (this.dealerSocket) {
this.dealerSocket.close();

View File

@@ -333,12 +333,27 @@ message FieldValue {
// ─── Ingestor Broker Protocol (Flink ROUTER ↔ Ingestor DEALER, port 5567) ───
// Message type IDs 0x200x25
//
// Capacity model: each WorkerReady is ONE slot offer for a specific exchange
// and job type. The ingestor sends N WorkerReady messages at startup (one per
// available slot) and re-sends one after each job completes, subject to any
// rate-limit backoff.
// Ingestor → Flink: register as available (type 0x20)
// Sent on DEALER connect and after every COMPLETE.
// Job type for a slot offer or assignment.
enum SlotType {
ANY = 0; // accepts any job type
HISTORICAL = 1; // historical OHLC fetch slot
REALTIME = 2; // realtime tick subscription slot
}
// Ingestor → Flink: offer one work slot (type 0x20)
// Sent once per available slot at startup and after each job completes.
// One WorkerReady = one slot for one exchange and one job type.
message WorkerReady {
// Exchanges this ingestor supports (e.g. ["BINANCE", "COINBASE"])
// Exchange this slot handles (single entry, e.g. ["BINANCE"])
repeated string exchanges = 1;
// Job type this slot accepts
SlotType job_type = 2;
}
// Ingestor → Flink: historical job finished (type 0x21)

View File

@@ -510,3 +510,44 @@ def sync_packages(data_dir: Path, environment_yml: Optional[Path] = None) -> dic
log.info(f"Conda package sync complete: {len(result['removed'])} packages removed")
return result
# =============================================================================
# Async wrappers — non-blocking equivalents for use from asyncio contexts
# =============================================================================
import asyncio as _asyncio


async def get_installed_packages_async() -> Set[str]:
    """Async facade for get_installed_packages(); runs it in a worker thread."""
    installed = await _asyncio.to_thread(get_installed_packages)
    return installed


async def install_packages_async(
    packages: list[str],
    data_dir: Optional[Path] = None,
) -> dict:
    """Async facade for install_packages(); runs it in a worker thread."""
    outcome = await _asyncio.to_thread(install_packages, packages, data_dir)
    return outcome


async def remove_packages_async(packages: list[str]) -> dict:
    """Async facade for remove_packages(); runs it in a worker thread."""
    outcome = await _asyncio.to_thread(remove_packages, packages)
    return outcome


async def cleanup_extra_packages_async(
    data_dir: Path,
    environment_yml: Optional[Path] = None,
) -> dict:
    """Async facade for cleanup_extra_packages(); runs it in a worker thread."""
    outcome = await _asyncio.to_thread(cleanup_extra_packages, data_dir, environment_yml)
    return outcome


async def sync_packages_async(
    data_dir: Path,
    environment_yml: Optional[Path] = None,
) -> dict:
    """Async facade for sync_packages(); runs it in a worker thread."""
    outcome = await _asyncio.to_thread(sync_packages, data_dir, environment_yml)
    return outcome

View File

@@ -0,0 +1,54 @@
"""
Thread-safe asyncio.run() for the sandbox.
Installs a global replacement for asyncio.run() that, when called from a
non-async thread while uvicorn's event loop is running, dispatches the
coroutine to that loop via run_coroutine_threadsafe(). The calling thread
blocks on future.result() — releasing the GIL — so uvicorn's loop runs
freely (health checks, MCP requests, etc.).
Usage:
from dexorder.event_loop import install_thread_safe_asyncio_run
install_thread_safe_asyncio_run(asyncio.get_running_loop()) # call once at startup
"""
import asyncio
import logging
log = logging.getLogger(__name__)
_main_loop: asyncio.AbstractEventLoop | None = None
_original_asyncio_run = asyncio.run
def install_thread_safe_asyncio_run(loop: asyncio.AbstractEventLoop) -> None:
"""
Patch asyncio.run globally to cooperate with uvicorn's event loop.
Call once from the lifespan startup (main thread, loop already running).
"""
global _main_loop
_main_loop = loop
def _thread_safe_run(coro, *, debug=None):
# Detect if we're in a thread (no running loop in this thread)
try:
asyncio.get_running_loop()
# We're already inside an async context — asyncio.run() is not
# valid here regardless; let it raise the normal error.
raise RuntimeError(
"asyncio.run() cannot be called when another event loop is running "
"in the same thread."
)
except RuntimeError as exc:
if "cannot be called" in str(exc):
raise
# No running loop in this thread — safe to dispatch to main loop.
if _main_loop is not None and _main_loop.is_running():
log.debug("asyncio.run() from thread → run_coroutine_threadsafe")
return asyncio.run_coroutine_threadsafe(coro, _main_loop).result()
# Fallback: main loop not available (e.g., called before startup or in tests)
return _original_asyncio_run(coro, debug=debug)
asyncio.run = _thread_safe_run
log.info("Installed thread-safe asyncio.run()")

View File

@@ -5,6 +5,8 @@ Tickers use Nautilus format: "BTC/USDT.BINANCE"
All timestamps are nanoseconds since epoch.
"""
import tracemalloc
from pathlib import Path
from typing import Optional, List, Tuple
import pandas as pd
import logging
@@ -19,6 +21,19 @@ from pyiceberg.expressions import (
log = logging.getLogger(__name__)
def _rss_mb() -> str:
"""Return current VmRSS and VmPeak from /proc/self/status as a short string."""
try:
info = {}
for line in Path("/proc/self/status").read_text().splitlines():
for key in ("VmRSS", "VmPeak", "VmSize"):
if line.startswith(f"{key}:"):
info[key] = int(line.split()[1]) // 1024 # kB → MB
return f"RSS={info.get('VmRSS','?')}MB peak={info.get('VmPeak','?')}MB virt={info.get('VmSize','?')}MB"
except Exception:
return "?"
class IcebergClient:
"""
Client for querying OHLC data from Iceberg warehouse (Iceberg 1.10.1).
@@ -114,8 +129,21 @@ class IcebergClient:
if fetch_columns is not None:
scan = scan.select(*fetch_columns)
if not tracemalloc.is_tracing():
tracemalloc.start()
tm_before = tracemalloc.take_snapshot()
log.info("MEM before scan.to_pandas(): %s", _rss_mb())
df = scan.to_pandas()
log.info("MEM after scan.to_pandas(): %s | rows=%d cols=%s mem=%dMB",
_rss_mb(), len(df), list(df.columns),
df.memory_usage(deep=True).sum() // (1024 * 1024))
tm_after = tracemalloc.take_snapshot()
top = tm_after.compare_to(tm_before, "lineno")
for stat in top[:5]:
log.info("TRACEMALLOC: %s", stat)
if not df.empty:
# Deduplicate: keep the most-recently-ingested row per timestamp.
if "ingested_at" in df.columns:
@@ -123,6 +151,7 @@ class IcebergClient:
df.sort_values("ingested_at", ascending=False)
.drop_duplicates(subset=["timestamp"])
)
log.info("MEM after dedup: %s | rows=%d", _rss_mb(), len(df))
# Drop ingested_at if the caller did not ask for it
if columns is not None and "ingested_at" not in columns and "ingested_at" in df.columns:
df = df.drop(columns=["ingested_at"])

View File

@@ -0,0 +1,85 @@
"""
Memory guard for sandbox containers.
Sets a soft RLIMIT_AS limit derived from the cgroup memory limit at a
configurable fraction, so Python raises MemoryError before the kernel's
OOM killer fires. The MCP session survives; only the tool call fails.
"""
import gc
import logging
import resource
from pathlib import Path
log = logging.getLogger(__name__)
def _read_cgroup_limit_bytes() -> int | None:
"""Read container memory.max from cgroup v2. Returns bytes or None."""
try:
val = Path("/sys/fs/cgroup/memory.max").read_text().strip()
if val == "max":
return None
return int(val)
except Exception:
return None
def setup_memory_limit(fraction: float) -> None:
    """
    Set RLIMIT_AS soft limit to baseline VmSize + allowed growth.

    RLIMIT_AS caps total virtual address space, which includes shared libraries
    and memory-mapped files that don't consume physical RAM. The baseline VmSize
    at startup can be 3+ GB even when RSS is only ~200 MB. Setting the limit to
    a flat cgroup fraction would crash immediately.

    Instead: limit = current VmSize + (cgroup_limit * fraction)

    This allows `fraction` worth of new allocations (numpy arrays, pandas
    dataframes, etc.) above the startup baseline before raising MemoryError.

    Best-effort: any failure to read or apply the limit is logged and the
    sandbox continues unlimited rather than crashing at startup.

    Args:
        fraction: Proportion of cgroup memory.max to allow as new growth, e.g. 0.85.
    """
    # A non-positive fraction would produce a zero/negative allowance and
    # guarantee an immediate MemoryError — refuse it instead.
    if fraction <= 0:
        log.warning("Invalid memory fraction %.3f; RLIMIT_AS not set", fraction)
        return

    cgroup_bytes = _read_cgroup_limit_bytes()

    # Read baseline VmSize (total virtual address space at startup)
    vmsize_bytes: int | None = None
    try:
        for line in Path("/proc/self/status").read_text().splitlines():
            if line.startswith("VmSize:"):
                vmsize_bytes = int(line.split()[1]) * 1024  # kB → bytes
                log.info("Memory baseline: %s", line.strip())
            elif line.startswith("VmRSS:"):
                log.info("Memory baseline: %s", line.strip())
    except Exception:
        pass

    if cgroup_bytes is None:
        log.warning("cgroup memory.max is unlimited; RLIMIT_AS not set")
        return

    allowed_growth_bytes = int(cgroup_bytes * fraction)
    baseline = vmsize_bytes or 0
    limit_bytes = baseline + allowed_growth_bytes

    _, hard = resource.getrlimit(resource.RLIMIT_AS)
    # The soft limit may never exceed a finite hard limit — setrlimit would
    # raise ValueError and take down startup. Clamp instead.
    if hard != resource.RLIM_INFINITY:
        limit_bytes = min(limit_bytes, hard)
    try:
        resource.setrlimit(resource.RLIMIT_AS, (limit_bytes, hard))
    except (ValueError, OSError) as exc:
        log.warning("Failed to set RLIMIT_AS (%s); continuing without limit", exc)
        return
    log.info(
        "RLIMIT_AS soft limit set to %d MB (baseline %d MB + allowed growth %d MB, %.0f%% of cgroup %d MB)",
        limit_bytes // (1024 * 1024),
        baseline // (1024 * 1024),
        allowed_growth_bytes // (1024 * 1024),
        fraction * 100,
        cgroup_bytes // (1024 * 1024),
    )
def cleanup_memory() -> None:
    """
    Recovery hook invoked after a MemoryError is caught in a tool execution thread.

    Forces a garbage-collection pass to reclaim objects held by the failed
    script. Future recovery strategies (cache eviction, etc.) plug in here.
    """
    log.warning("MemoryError in tool thread — running gc.collect()")
    gc.collect()

View File

@@ -0,0 +1,186 @@
#!/usr/bin/env python3
"""
backtest_harness — runs a strategy backtest as a subprocess.
Reads a JSON config from stdin:
{
"strategy_name": str,
"feeds": [{"symbol": str, "period_seconds": int}, ...],
"from_time": ...,
"to_time": ...,
"initial_capital": float,
"paper": bool
}
Outputs JSON to stdout on success:
{
"strategy_name": str,
"feeds": [...],
"initial_capital": float,
"paper": bool,
"total_candles": int,
... (metrics from run_backtest)
}
On error:
{"error": str}
"""
import asyncio
import json
import os
import sys
import traceback
from pathlib import Path
# Ensure dexorder package is importable when run as a subprocess
sys.path.insert(0, str(Path(__file__).resolve().parent.parent.parent))
_OHLC_EXTRA_COLUMNS = [
"volume", "buy_vol", "sell_vol",
"open_time", "high_time", "low_time", "close_time",
"open_interest",
]
async def _run(cfg: dict) -> dict:
    """
    Execute one backtest described by `cfg` and return a JSON-serializable dict.

    cfg keys read here: strategy_name (required), feeds (required),
    from_time, to_time, initial_capital (default 10_000.0), paper (default True).

    Returns the metrics dict from run_backtest merged with an echo of the
    inputs, or {"error": str} — every failure path is converted to an error
    payload so the parent process can always parse stdout as JSON.
    """
    strategy_name = cfg["strategy_name"]
    feeds = cfg["feeds"]
    from_time = cfg.get("from_time")
    to_time = cfg.get("to_time")
    initial_capital = float(cfg.get("initial_capital", 10_000.0))
    paper = bool(cfg.get("paper", True))

    # -------------------------------------------------------------------------
    # Initialize API
    # -------------------------------------------------------------------------
    try:
        import yaml

        config_path = os.environ.get("CONFIG_PATH", "/app/config/config.yaml")
        secrets_path = os.environ.get("SECRETS_PATH", "/app/config/secrets.yaml")
        config_data = {}
        secrets_data = {}
        # Both files are optional; missing files leave the defaults below in effect.
        if Path(config_path).exists():
            with open(config_path) as f:
                config_data = yaml.safe_load(f) or {}
        if Path(secrets_path).exists():
            with open(secrets_path) as f:
                secrets_data = yaml.safe_load(f) or {}
        data_cfg = config_data.get("data", {})
        iceberg_cfg = data_cfg.get("iceberg", {})
        relay_cfg = data_cfg.get("relay", {})

        from dexorder.api import set_api, API
        from dexorder.impl.charting_api_impl import ChartingAPIImpl
        from dexorder.impl.data_api_impl import DataAPIImpl

        # S3 credentials may come from either file; config values win over secrets.
        data_api = DataAPIImpl(
            iceberg_catalog_uri=iceberg_cfg.get("catalog_uri", "http://iceberg-catalog:8181"),
            relay_endpoint=relay_cfg.get("endpoint", "tcp://relay:5559"),
            notification_endpoint=relay_cfg.get("notification_endpoint", "tcp://relay:5558"),
            namespace=iceberg_cfg.get("namespace", "trading"),
            s3_endpoint=iceberg_cfg.get("s3_endpoint") or secrets_data.get("s3_endpoint"),
            s3_access_key=iceberg_cfg.get("s3_access_key") or secrets_data.get("s3_access_key"),
            s3_secret_key=iceberg_cfg.get("s3_secret_key") or secrets_data.get("s3_secret_key"),
            s3_region=iceberg_cfg.get("s3_region") or secrets_data.get("s3_region"),
            request_timeout=240.0,  # NOTE(review): presumably sized for large historical fetches — confirm
        )
        set_api(API(charting=ChartingAPIImpl(), data=data_api))
    except Exception as e:
        return {"error": f"API initialization failed: {e}"}

    # -------------------------------------------------------------------------
    # Locate strategy
    # -------------------------------------------------------------------------
    data_dir = Path(os.environ.get("DATA_DIR", "/app/data"))
    try:
        from dexorder.tools.python_tools import get_category_manager, sanitize_name
        category_manager = get_category_manager(data_dir)
        safe_name = sanitize_name(strategy_name)
        impl_path = category_manager.src_dir / "strategy" / safe_name / "implementation.py"
        if not impl_path.exists():
            return {"error": f"Strategy '{strategy_name}' not found (looked at {impl_path})"}
    except Exception as exc:
        return {"error": f"Failed to locate strategy: {exc}"}

    # -------------------------------------------------------------------------
    # Register custom indicators and load strategy class
    # -------------------------------------------------------------------------
    try:
        from dexorder.nautilus.backtest_runner import _setup_custom_indicators
        _setup_custom_indicators(category_manager.src_dir)
    except Exception as exc:
        # Non-fatal: warn on stderr and continue — the strategy may not use
        # custom indicators at all.
        sys.stderr.write(f"WARNING: custom indicator setup failed: {exc}\n")
    try:
        from dexorder.nautilus.backtest_runner import _load_strategy_class
        strategy_class = _load_strategy_class(impl_path)
    except Exception:
        return {"error": f"Strategy load failed:\n{traceback.format_exc()}"}

    # -------------------------------------------------------------------------
    # Fetch OHLC data
    # -------------------------------------------------------------------------
    from dexorder.api import get_api
    from dexorder.nautilus.pandas_strategy import make_feed_key
    api = get_api()
    parsed_feeds = [(f["symbol"], int(f["period_seconds"])) for f in feeds]
    ohlc_dfs = {}
    total_candles = 0
    for ticker, period_seconds in parsed_feeds:
        feed_key = make_feed_key(ticker, period_seconds)
        try:
            df = await api.data.historical_ohlc(
                ticker=ticker,
                period_seconds=period_seconds,
                start_time=from_time,
                end_time=to_time,
                extra_columns=_OHLC_EXTRA_COLUMNS,
            )
        except Exception as exc:
            return {"error": f"OHLC fetch failed for {feed_key}: {exc}"}
        # An empty frame for ANY feed aborts the whole backtest.
        if df.empty:
            return {"error": f"No OHLC data for {feed_key} in the requested range"}
        ohlc_dfs[feed_key] = df
        total_candles += len(df)

    # -------------------------------------------------------------------------
    # Run backtest (synchronous)
    # -------------------------------------------------------------------------
    try:
        from dexorder.nautilus.backtest_runner import run_backtest
        metrics = run_backtest(
            strategy_class=strategy_class,
            feeds=parsed_feeds,
            ohlc_dfs=ohlc_dfs,
            initial_capital=initial_capital,
            paper=paper,
        )
    except Exception:
        return {"error": f"Backtest failed:\n{traceback.format_exc()}"}

    # Echo inputs back alongside the metrics so the parent can report context.
    return {
        "strategy_name": strategy_name,
        "feeds": [{"symbol": t, "period_seconds": p} for t, p in parsed_feeds],
        "initial_capital": initial_capital,
        "paper": paper,
        "total_candles": total_candles,
        **metrics,
    }
def main():
    """
    Entry point: read JSON config from stdin, run the backtest, print JSON to stdout.

    Always emits exactly one JSON object on stdout — the parent process parses
    stdout, so unexpected crashes are reported as {"error": ...} rather than a
    bare traceback and a non-JSON stdout.
    """
    try:
        cfg = json.loads(sys.stdin.read())
    except Exception as exc:
        print(json.dumps({"error": f"Invalid config JSON on stdin: {exc}"}))
        return
    try:
        result = asyncio.run(_run(cfg))
    except Exception:
        # _run() converts expected failures itself; this catches the unexpected.
        result = {"error": f"Backtest harness crashed:\n{traceback.format_exc()}"}
    print(json.dumps(result))


if __name__ == "__main__":
    main()

View File

@@ -1,25 +1,21 @@
"""
backtest_strategy — run a PandasStrategy against historical OHLC data.
Called directly from the MCP server's async handle_tool_call.
Returns a JSON payload with backtest metrics and equity curve, following the
same pattern as evaluate_indicator.py.
Spawns backtest_harness.py as a subprocess so user strategy code is isolated
from the MCP server process. The harness handles API init, data fetch, and
the synchronous BacktestEngine internally.
"""
import asyncio
import json
import logging
import sys
from pathlib import Path
from typing import Any
log = logging.getLogger(__name__)
# All OHLC+ columns to request from the DataAPI
_OHLC_EXTRA_COLUMNS = [
"volume", "buy_vol", "sell_vol",
"open_time", "high_time", "low_time", "close_time",
"open_interest",
]
_BACKTEST_HARNESS = Path(__file__).parent / "backtest_harness.py"
async def backtest_strategy(
@@ -42,23 +38,8 @@ async def backtest_strategy(
paper: Always True for historical backtest (flag reserved for forward testing)
Returns:
list[TextContent] with JSON payload:
{
"strategy_name": str,
"feeds": [...],
"initial_capital": float,
"paper": bool,
"total_candles": int,
"total_return": float, # fractional (0.15 = +15%)
"sharpe_ratio": float,
"max_drawdown": float, # fractional (0.10 = 10% drawdown)
"win_rate": float,
"trade_count": int,
"equity_curve": [{"timestamp": int, "equity": float}, ...]
}
On error:
{"error": str}
list[TextContent] with JSON payload containing backtest metrics.
On error: [TextContent] with {"error": str}
"""
from mcp.types import TextContent
@@ -66,102 +47,52 @@ async def backtest_strategy(
log.error("backtest_strategy '%s': %s", strategy_name, msg)
return [TextContent(type="text", text=json.dumps({"error": msg}))]
# --- 1. Validate feeds input ---
if not feeds:
return _err("feeds list is empty — provide at least one {symbol, period_seconds} entry")
parsed_feeds: list[tuple[str, int]] = []
for f in feeds:
sym = f.get("symbol", "")
ps = f.get("period_seconds", 3600)
if not sym:
if not f.get("symbol"):
return _err(f"Feed entry missing 'symbol': {f}")
parsed_feeds.append((sym, int(ps)))
# --- 2. Resolve strategy implementation file ---
try:
from dexorder.tools.python_tools import get_category_manager, sanitize_name
category_manager = get_category_manager()
safe_name = sanitize_name(strategy_name)
impl_path = category_manager.src_dir / "strategy" / safe_name / "implementation.py"
if not impl_path.exists():
return _err(f"Strategy '{strategy_name}' not found (looked at {impl_path})")
except Exception as exc:
return _err(f"Failed to locate strategy: {exc}")
# --- 3. Register custom indicators with pandas-ta ---
try:
from dexorder.nautilus.backtest_runner import _setup_custom_indicators
_setup_custom_indicators(category_manager.src_dir)
except Exception as exc:
log.warning("backtest_strategy: custom indicator setup failed: %s", exc)
# --- 4. Load strategy class ---
try:
from dexorder.nautilus.backtest_runner import _load_strategy_class
strategy_class = _load_strategy_class(impl_path)
except Exception as exc:
log.exception("backtest_strategy: strategy load failed")
return _err(f"Strategy load failed: {exc}")
# --- 5. Fetch OHLC+ data for each feed ---
try:
from dexorder.api import get_api
api = get_api()
except Exception as exc:
return _err(f"API not available: {exc}")
ohlc_dfs: dict[str, Any] = {}
total_candles = 0
for ticker, period_seconds in parsed_feeds:
from dexorder.nautilus.pandas_strategy import make_feed_key
feed_key = make_feed_key(ticker, period_seconds)
try:
df = await api.data.historical_ohlc(
ticker=ticker,
period_seconds=period_seconds,
start_time=from_time,
end_time=to_time,
extra_columns=_OHLC_EXTRA_COLUMNS,
)
except Exception as exc:
log.exception("backtest_strategy: OHLC fetch failed for %s", feed_key)
return _err(f"OHLC fetch failed for {feed_key}: {exc}")
if df.empty:
return _err(f"No OHLC data for {feed_key} in the requested range")
ohlc_dfs[feed_key] = df
total_candles += len(df)
# --- 6. Run backtest in thread executor (BacktestEngine is synchronous) ---
try:
import asyncio
from dexorder.nautilus.backtest_runner import run_backtest
loop = asyncio.get_event_loop()
metrics = await loop.run_in_executor(
None,
lambda: run_backtest(
strategy_class=strategy_class,
feeds=parsed_feeds,
ohlc_dfs=ohlc_dfs,
initial_capital=initial_capital,
paper=paper,
),
)
except Exception as exc:
log.exception("backtest_strategy: backtest run failed")
return _err(f"Backtest failed: {exc}")
# --- 7. Return results ---
payload = {
"strategy_name": strategy_name,
"feeds": [{"symbol": t, "period_seconds": p} for t, p in parsed_feeds],
cfg = {
"strategy_name": strategy_name,
"feeds": feeds,
"from_time": from_time,
"to_time": to_time,
"initial_capital": initial_capital,
"paper": paper,
"total_candles": total_candles,
**metrics, # keys: summary, statistics, trades, equity_curve
"paper": paper,
}
try:
proc = await asyncio.create_subprocess_exec(
sys.executable, str(_BACKTEST_HARNESS),
stdin=asyncio.subprocess.PIPE,
stdout=asyncio.subprocess.PIPE,
stderr=asyncio.subprocess.PIPE,
)
stdout, stderr = await asyncio.wait_for(
proc.communicate(json.dumps(cfg).encode()),
timeout=600,
)
except asyncio.TimeoutError:
return _err("Backtest timed out (10 minutes)")
except Exception as exc:
return _err(f"Failed to launch backtest harness: {exc}")
if proc.returncode != 0:
err_text = stderr.decode(errors="replace")
log.error("backtest_strategy '%s': harness exited %d: %s", strategy_name, proc.returncode, err_text[:500])
return _err(f"Backtest harness failed:\n{err_text}")
if stderr:
log.warning("backtest_strategy '%s' stderr: %s", strategy_name, stderr.decode(errors="replace")[:500])
try:
payload = json.loads(stdout.decode())
except json.JSONDecodeError:
return _err(f"Harness produced invalid JSON: {stdout.decode(errors='replace')[:200]}")
if "error" in payload:
return _err(payload["error"])
return [TextContent(type="text", text=json.dumps(payload))]

View File

@@ -18,51 +18,32 @@ After write/edit operations, a category-specific test harness runs to validate
the code and capture errors/output for agent feedback.
"""
import concurrent.futures
import json
import logging
import re
import subprocess
import sys
import traceback
from dataclasses import dataclass, asdict
from enum import Enum
from pathlib import Path
from typing import Any, Optional
from dexorder.tools.subprocess_runner import run_subprocess_argv, run_in_thread
log = logging.getLogger(__name__)
def _run_inprocess(fn, *args, timeout: int) -> dict:
"""
Run fn(*args) in a one-shot thread and return its result dict.
Uses a thread so the calling coroutine is not blocked and the calling
process does not fork a new Python interpreter. All already-loaded
libraries (numpy, pandas, matplotlib, etc.) are shared with the thread.
On timeout returns a dict with _timeout=True. On unexpected exception
returns a dict with error=True and the traceback in stderr.
"""
with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
future = executor.submit(fn, *args)
try:
return future.result(timeout=timeout)
except concurrent.futures.TimeoutError:
return {"_timeout": True, "error": True,
"stdout": "", "stderr": "", "images": []}
except Exception:
return {"error": True, "stdout": "",
"stderr": traceback.format_exc(), "images": []}
# Paths to harness scripts run as subprocesses
_RESEARCH_HARNESS = Path(__file__).parent / "research_harness.py"
_STRATEGY_HARNESS = Path(__file__).parent / "strategy_harness.py"
# Import conda manager for package installation and tracking
try:
from dexorder.conda_manager import install_packages, cleanup_extra_packages
from dexorder.conda_manager import install_packages_async, cleanup_extra_packages_async
except ImportError:
log.warning("conda_manager not available - package installation disabled")
install_packages = None
cleanup_extra_packages = None
install_packages_async = None
cleanup_extra_packages_async = None
# =============================================================================
@@ -355,6 +336,39 @@ class GitManager:
except Exception:
pass
# ------------------------------------------------------------------
# Async variants — delegates to sync methods via asyncio.to_thread
# so the event loop stays responsive during git operations.
# ------------------------------------------------------------------
async def commit_async(self, message: str) -> Optional[str]:
    """Run commit() in a worker thread so the event loop stays responsive."""
    import asyncio
    commit_hash = await asyncio.to_thread(self.commit, message)
    return commit_hash
async def log_async(self, path: Optional[Path] = None, n: int = 20) -> list[dict]:
    """Run log() in a worker thread so the event loop stays responsive."""
    import asyncio
    entries = await asyncio.to_thread(self.log, path, n)
    return entries
async def restore_async(self, revision: str, path: Optional[Path] = None) -> Optional[str]:
    """Run restore() in a worker thread so the event loop stays responsive."""
    import asyncio
    outcome = await asyncio.to_thread(self.restore, revision, path)
    return outcome
async def head_short_hash_async(self) -> str:
    """Run head_short_hash() in a worker thread so the event loop stays responsive."""
    import asyncio
    short_hash = await asyncio.to_thread(self.head_short_hash)
    return short_hash
async def create_worktree_async(self, worktree_path: Path, revision: str = "HEAD") -> str:
    """Run create_worktree() in a worker thread so the event loop stays responsive."""
    import asyncio
    created = await asyncio.to_thread(self.create_worktree, worktree_path, revision)
    return created
async def remove_worktree_async(self, worktree_path: Path) -> None:
    """Run remove_worktree() in a worker thread so the event loop stays responsive."""
    import asyncio
    await asyncio.to_thread(self.remove_worktree, worktree_path)
async def prune_worktrees_async(self) -> None:
    """Run prune_worktrees() in a worker thread so the event loop stays responsive."""
    import asyncio
    await asyncio.to_thread(self.prune_worktrees)
# =============================================================================
# Custom Indicator Setup
@@ -484,7 +498,7 @@ class CategoryFileManager:
"""Root of the versioned category code (git repo root)."""
return self.data_dir / "src"
def write(
async def write(
self,
category: str,
name: str,
@@ -547,7 +561,7 @@ class CategoryFileManager:
return {"success": False, "error": f"Failed to write metadata: {e}"}
# Run validation harness
validation = self._validate(cat, item_dir)
validation = await self._validate(cat, item_dir)
result = {
"success": validation["success"],
@@ -559,19 +573,19 @@ class CategoryFileManager:
if validation["success"]:
if cat == Category.RESEARCH:
log.info(f"Auto-executing research script: {name}")
result["execution"] = self.execute_research(name)
result["execution"] = await self.execute_research(name)
elif cat == Category.INDICATOR:
log.info(f"Auto-executing indicator test: {name}")
result["execution"] = self._execute_indicator(item_dir)
result["execution"] = await self._execute_indicator(item_dir)
# Commit to git
commit_hash = self.git.commit(f"create({category}): {name}")
commit_hash = await self.git.commit_async(f"create({category}): {name}")
if commit_hash:
result["revision"] = commit_hash
return result
def edit(
async def edit(
self,
category: str,
name: str,
@@ -671,7 +685,7 @@ class CategoryFileManager:
# Run validation harness if code was updated
validation = None
if code is not None:
validation = self._validate(cat, item_dir)
validation = await self._validate(cat, item_dir)
result = {
"success": True,
@@ -685,15 +699,15 @@ class CategoryFileManager:
if code is not None and result["success"]:
if cat == Category.RESEARCH:
log.info(f"Auto-executing research script after edit: {name}")
result["execution"] = self.execute_research(name)
result["execution"] = await self.execute_research(name)
elif cat == Category.INDICATOR:
log.info(f"Auto-executing indicator test after edit: {name}")
result["execution"] = self._execute_indicator(item_dir)
result["execution"] = await self._execute_indicator(item_dir)
# Commit to git if code changed
if code is not None and result["success"]:
action = "patch" if patches is not None else "edit"
commit_hash = self.git.commit(f"{action}({category}): {name}")
commit_hash = await self.git.commit_async(f"{action}({category}): {name}")
if commit_hash:
result["revision"] = commit_hash
@@ -776,7 +790,7 @@ class CategoryFileManager:
return {"items": items}
def _validate(self, category: Category, item_dir: Path) -> dict[str, Any]:
async def _validate(self, category: Category, item_dir: Path) -> dict[str, Any]:
"""
Run category-specific validation harness.
@@ -793,13 +807,13 @@ class CategoryFileManager:
# Install required packages before validation
packages_installed = []
if install_packages and meta_path.exists():
if install_packages_async and meta_path.exists():
try:
metadata = json.loads(meta_path.read_text())
conda_packages = metadata.get("conda_packages", [])
if conda_packages:
log.info(f"Installing packages for validation: {conda_packages}")
install_result = install_packages(conda_packages, data_dir=self.data_dir)
install_result = await install_packages_async(conda_packages, data_dir=self.data_dir)
if install_result.get("success"):
packages_installed = install_result.get("installed", [])
if packages_installed:
@@ -811,11 +825,11 @@ class CategoryFileManager:
# Run validation
if category == Category.STRATEGY:
result = self._validate_strategy(impl_path)
result = await self._validate_strategy(impl_path)
elif category == Category.INDICATOR:
result = self._validate_indicator(impl_path)
result = await self._validate_indicator(impl_path)
elif category == Category.RESEARCH:
result = self._validate_research(impl_path, item_dir)
result = await self._validate_research(impl_path, item_dir)
else:
result = {"success": False, "error": f"No validator for category {category}"}
@@ -825,19 +839,18 @@ class CategoryFileManager:
return result
def _validate_strategy(self, impl_path: Path) -> dict[str, Any]:
async def _validate_strategy(self, impl_path: Path) -> dict[str, Any]:
"""
Validate a strategy by running it against synthetic OHLC data.
Runs strategy_harness.py in-process via a thread. Catches import errors,
Runs strategy_harness.py as a subprocess. Catches import errors,
runtime errors in evaluate(), and wrong class hierarchy — not just syntax.
"""
meta_path = impl_path.parent / "metadata.json"
return self._execute_strategy(impl_path.parent, timeout=45)
return await self._execute_strategy(impl_path.parent, timeout=45)
def _execute_strategy(self, item_dir: Path, timeout: int = 45) -> dict[str, Any]:
async def _execute_strategy(self, item_dir: Path, timeout: int = 45) -> dict[str, Any]:
"""
Run a strategy against synthetic OHLC data in-process via a thread.
Run a strategy against synthetic OHLC data via strategy_harness.py subprocess.
Returns:
dict with success, output (human-readable summary), trade_count, error
@@ -850,24 +863,26 @@ class CategoryFileManager:
if not meta_path.exists():
return {"success": False, "error": "metadata.json not found"}
from dexorder.tools.strategy_harness import run as _strategy_run
result = _run_inprocess(_strategy_run, impl_path, meta_path, timeout=timeout)
if result.get("_timeout"):
data = await run_subprocess_argv(
sys.executable, str(_STRATEGY_HARNESS), str(impl_path), str(meta_path),
timeout=timeout,
)
if data.get("_timeout"):
return {"success": False, "error": f"Strategy test timed out after {timeout}s"}
return result
if data.get("error") and not data.get("success"):
return {"success": False, "error": data.get("stderr") or "Harness failed"}
return data
def _validate_indicator(self, impl_path: Path) -> dict[str, Any]:
async def _validate_indicator(self, impl_path: Path) -> dict[str, Any]:
"""
Validate an indicator by running it against synthetic OHLC data.
Runs indicator_harness.py in-process via a thread. Catches import errors,
runtime errors, and wrong return types — not just syntax.
Runs indicator_harness.py in-process via a thread (main proc). Catches
import errors, runtime errors, and wrong return types — not just syntax.
"""
meta_path = impl_path.parent / "metadata.json"
return self._execute_indicator(impl_path.parent, timeout=30)
return await self._execute_indicator(impl_path.parent, timeout=30)
def _execute_indicator(self, item_dir: Path, timeout: int = 30) -> dict[str, Any]:
async def _execute_indicator(self, item_dir: Path, timeout: int = 30) -> dict[str, Any]:
"""
Run an indicator against synthetic OHLC data in-process via a thread.
@@ -883,29 +898,32 @@ class CategoryFileManager:
return {"success": False, "error": "metadata.json not found"}
from dexorder.tools.indicator_harness import run as _indicator_run
result = _run_inprocess(_indicator_run, impl_path, meta_path, timeout=timeout)
result = await run_in_thread(_indicator_run, impl_path, meta_path, timeout=timeout)
if result.get("_timeout"):
return {"success": False, "error": f"Indicator test timed out after {timeout}s"}
return result
def _run_research_harness(self, impl_path: Path, item_dir: Path, timeout: int = 300) -> dict[str, Any]:
async def _run_research_harness(self, impl_path: Path, item_dir: Path, timeout: int = 300) -> dict[str, Any]:
"""
Run a research script in-process via a thread and return captured results.
Run a research script via research_harness.py subprocess and return captured results.
Returns:
dict with stdout, stderr, images, error fields — or an error dict.
dict with stdout, stderr, images, error fields.
"""
from dexorder.tools.research_harness import run as _research_run
return _run_inprocess(_research_run, impl_path, item_dir, timeout=timeout)
return await run_subprocess_argv(
sys.executable, str(_RESEARCH_HARNESS), str(impl_path),
timeout=timeout,
cwd=item_dir,
)
def _validate_research(self, impl_path: Path, item_dir: Path) -> dict[str, Any]:
async def _validate_research(self, impl_path: Path, item_dir: Path) -> dict[str, Any]:
"""
Validate a research script.
Runs the script via the harness and captures output + pyplot images.
"""
data = self._run_research_harness(impl_path, item_dir, timeout=300)
data = await self._run_research_harness(impl_path, item_dir, timeout=300)
if data.get("_timeout"):
return {"success": False, "error": "Research script timeout"}
@@ -923,7 +941,7 @@ class CategoryFileManager:
"images": data["images"],
}
def execute_research(self, name: str) -> dict[str, Any]:
async def execute_research(self, name: str) -> dict[str, Any]:
"""
Execute a research script and return structured content with images.
@@ -944,7 +962,7 @@ class CategoryFileManager:
if not impl_path.exists():
return {"error": f"Implementation file not found for '{name}'"}
data = self._run_research_harness(impl_path, item_dir, timeout=300)
data = await self._run_research_harness(impl_path, item_dir, timeout=300)
if data.get("_timeout"):
log.error(f"execute_research '{name}': timeout")
@@ -995,7 +1013,7 @@ class CategoryFileManager:
return {"content": content}
def delete(self, category: str, name: str) -> dict[str, Any]:
async def delete(self, category: str, name: str) -> dict[str, Any]:
"""
Delete a category script directory and commit the removal to git.
@@ -1031,13 +1049,13 @@ class CategoryFileManager:
except Exception as e:
return {"success": False, "error": f"Failed to delete: {e}"}
commit_hash = self.git.commit(f"delete({category}): {name}")
commit_hash = await self.git.commit_async(f"delete({category}): {name}")
result: dict[str, Any] = {"success": True, "category": category, "name": name}
if commit_hash:
result["revision"] = commit_hash
return result
def git_log(
async def git_log(
self,
category: Optional[str] = None,
name: Optional[str] = None,
@@ -1061,10 +1079,10 @@ class CategoryFileManager:
path = get_category_path(self.src_dir, cat, name)
else:
path = self.src_dir / cat.value
entries = self.git.log(path=path, n=limit)
entries = await self.git.log_async(path=path, n=limit)
return {"success": True, "commits": entries}
def git_revert(self, revision: str, category: str, name: str) -> dict[str, Any]:
async def git_revert(self, revision: str, category: str, name: str) -> dict[str, Any]:
"""
Restore a category item to a previous git revision (creates a new commit).
@@ -1085,11 +1103,11 @@ class CategoryFileManager:
return {"success": False, "error": f"Item '{name}' not found in '{category}'"}
try:
commit_hash = self.git.restore(revision, path=item_dir)
commit_hash = await self.git.restore_async(revision, path=item_dir)
except RuntimeError as e:
return {"success": False, "error": str(e)}
validation = self._validate(cat, item_dir)
validation = await self._validate(cat, item_dir)
return {
"success": validation["success"],
"revision": commit_hash,

View File

@@ -119,11 +119,39 @@ def run(impl_path: Path, item_dir: Path) -> dict:
stdout_buf = io.StringIO()
stderr_buf = io.StringIO()
# Eagerly capture figures when user scripts call plt.close() so images are
# not lost even if the script closes figures immediately after savefig().
captured_images: list[dict] = []
def _capture_fig(fig) -> None:
buf = io.BytesIO()
fig.savefig(buf, format='png', dpi=100, bbox_inches='tight')
buf.seek(0)
captured_images.append({"format": "png", "data": base64.b64encode(buf.read()).decode('utf-8')})
buf.close()
_orig_plt_close = plt.close
def _patched_close(fig=None):
if fig is None:
for fn in plt.get_fignums():
_capture_fig(plt.figure(fn))
elif fig == 'all':
for fn in plt.get_fignums():
_capture_fig(plt.figure(fn))
else:
try:
_capture_fig(fig if hasattr(fig, 'savefig') else plt.figure(fig))
except Exception:
pass
_orig_plt_close(fig)
error_occurred = False
old_stdout, old_stderr = sys.stdout, sys.stderr
old_cwd = os.getcwd()
sys.stdout = stdout_buf
sys.stderr = stderr_buf
plt.close = _patched_close
try:
os.chdir(impl_path.parent)
@@ -136,22 +164,26 @@ def run(impl_path: Path, item_dir: Path) -> dict:
sys.stdout = old_stdout
sys.stderr = old_stderr
os.chdir(old_cwd)
plt.close = _orig_plt_close
stdout_output = stdout_buf.getvalue()
stderr_output = stderr_buf.getvalue()
# ---------------------------------------------------------------------------
# Capture matplotlib figures
# Capture any figures still open after script completion
# ---------------------------------------------------------------------------
images = []
images = captured_images
if not error_occurred:
already_seen = {img["data"] for img in images}
for fig_num in plt.get_fignums():
fig = plt.figure(fig_num)
buf = io.BytesIO()
fig.savefig(buf, format='png', dpi=100, bbox_inches='tight')
buf.seek(0)
images.append({"format": "png", "data": base64.b64encode(buf.read()).decode('utf-8')})
data = base64.b64encode(buf.read()).decode('utf-8')
buf.close()
if data not in already_seen:
images.append({"format": "png", "data": data})
plt.close('all')
return {

View File

@@ -0,0 +1,182 @@
"""
subprocess_runner — non-blocking subprocess primitives for the MCP sandbox.
All three entrypoints return the same dict shape as the legacy _run_inprocess():
{
"error": bool,
"stdout": str,
"stderr": str,
"images": list, # always [] for non-research invocations
"_timeout": bool # present and True only on timeout
}
Callers can therefore pattern-match on {"_timeout", "error", "stdout", "stderr"}
uniformly regardless of whether the work ran in a subprocess or a thread.
"""
import asyncio
import json
import traceback
from pathlib import Path
from typing import Any, Callable
# ---------------------------------------------------------------------------
# Internal helpers
# ---------------------------------------------------------------------------
def _normalise(data: dict, stderr_fallback: str = "") -> dict:
"""Ensure the standard shape keys are present in a harness result dict."""
data.setdefault("error", False)
data.setdefault("stdout", "")
data.setdefault("stderr", stderr_fallback)
data.setdefault("images", [])
return data
def _err_dict(stderr: str = "", stdout: str = "") -> dict:
return {"error": True, "stdout": stdout, "stderr": stderr, "images": []}
def _timeout_dict() -> dict:
return {"_timeout": True, "error": True, "stdout": "", "stderr": "", "images": []}
# ---------------------------------------------------------------------------
# Primitive 1: run_subprocess_argv
#
# Non-blocking equivalent of:
# subprocess.run([sys.executable, harness, arg1, arg2, ...],
# capture_output=True, text=True, timeout=N, cwd=cwd)
#
# Used by: _execute_strategy, _run_research_harness
# ---------------------------------------------------------------------------
async def run_subprocess_argv(
    *cmd: str,
    timeout: int,
    cwd: Path | None = None,
) -> dict:
    """
    Spawn cmd as a subprocess, await completion, and return a normalised result dict.

    stdout is expected to contain a JSON object written by the harness. It is
    decoded and normalised to the standard shape. On JSON decode failure the
    raw stdout text is preserved in "stdout" and error is set to True.

    On timeout the child process is killed and reaped before _timeout_dict()
    is returned, so a runaway harness cannot linger as an orphan past the
    deadline (the original left it running).
    """
    try:
        proc = await asyncio.create_subprocess_exec(
            *cmd,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
            cwd=str(cwd) if cwd else None,
        )
    except Exception as exc:
        return _err_dict(stderr=f"Harness launch failed: {exc}")

    try:
        stdout_bytes, stderr_bytes = await asyncio.wait_for(
            proc.communicate(), timeout=timeout
        )
    except asyncio.TimeoutError:
        # Kill and reap the child so it does not keep running past the deadline.
        try:
            proc.kill()
        except ProcessLookupError:
            pass  # already exited between the timeout and the kill
        try:
            await proc.communicate()  # reap; avoids a zombie process
        except Exception:
            pass
        return _timeout_dict()
    except Exception as exc:
        return _err_dict(stderr=f"Harness launch failed: {exc}")

    stdout_text = stdout_bytes.decode(errors="replace")
    stderr_text = stderr_bytes.decode(errors="replace")

    # A non-zero exit means the harness itself crashed; surface both streams.
    if proc.returncode != 0:
        return _err_dict(
            stderr=f"Harness exited {proc.returncode}:\n{stderr_text}",
            stdout=stdout_text,
        )

    try:
        data = json.loads(stdout_text)
    except json.JSONDecodeError:
        # Harness printed something that is not JSON — preserve it verbatim.
        return {"error": True, "stdout": stdout_text, "stderr": stderr_text, "images": []}
    return _normalise(data, stderr_fallback=stderr_text)
# ---------------------------------------------------------------------------
# Primitive 2: run_subprocess_stdin
#
# Non-blocking equivalent of the backtest pattern — JSON config fed via stdin.
# ---------------------------------------------------------------------------
async def run_subprocess_stdin(
    *cmd: str,
    stdin_data: bytes,
    timeout: int,
) -> dict:
    """
    Spawn cmd, write stdin_data to its stdin, await completion.

    Returns the same normalised dict shape as run_subprocess_argv.

    On timeout the child process is killed and reaped before _timeout_dict()
    is returned, so a runaway harness cannot linger as an orphan past the
    deadline (the original left it running).
    """
    try:
        proc = await asyncio.create_subprocess_exec(
            *cmd,
            stdin=asyncio.subprocess.PIPE,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
    except Exception as exc:
        return _err_dict(stderr=f"Harness launch failed: {exc}")

    try:
        stdout_bytes, stderr_bytes = await asyncio.wait_for(
            proc.communicate(stdin_data), timeout=timeout
        )
    except asyncio.TimeoutError:
        # Kill and reap the child so it does not keep running past the deadline.
        try:
            proc.kill()
        except ProcessLookupError:
            pass  # already exited between the timeout and the kill
        try:
            await proc.communicate()  # reap; avoids a zombie process
        except Exception:
            pass
        return _timeout_dict()
    except Exception as exc:
        return _err_dict(stderr=f"Harness launch failed: {exc}")

    stdout_text = stdout_bytes.decode(errors="replace")
    stderr_text = stderr_bytes.decode(errors="replace")

    # A non-zero exit means the harness itself crashed; surface both streams.
    if proc.returncode != 0:
        return _err_dict(
            stderr=f"Harness exited {proc.returncode}:\n{stderr_text}",
            stdout=stdout_text,
        )

    try:
        data = json.loads(stdout_text)
    except json.JSONDecodeError:
        # Harness printed something that is not JSON — preserve it verbatim.
        return {"error": True, "stdout": stdout_text, "stderr": stderr_text, "images": []}
    return _normalise(data, stderr_fallback=stderr_text)
# ---------------------------------------------------------------------------
# Primitive 3: run_in_thread
#
# Async wrapper around asyncio.to_thread so the event loop stays responsive
# while CPU-bound or blocking-IO callables run in a worker thread.
#
# Used by: _execute_indicator (in-process indicator harness)
# ---------------------------------------------------------------------------
async def run_in_thread(
    fn: Callable,
    *args: Any,
    timeout: int,
) -> dict:
    """
    Run fn(*args) in a thread pool worker and yield to the event loop while waiting.

    On timeout the await is abandoned and _timeout_dict() is returned; note the
    worker thread itself keeps running to completion in the pool, since
    asyncio.to_thread work cannot be cancelled (the original docstring's claim
    that the thread is a daemon was inaccurate). On MemoryError the memory
    guard's cleanup is invoked and a standard error dict is returned; any other
    exception is captured as a formatted traceback. The returned dict is
    normalised to the standard shape.
    """
    try:
        result = await asyncio.wait_for(
            asyncio.to_thread(fn, *args),
            timeout=timeout,
        )
    except asyncio.TimeoutError:
        return _timeout_dict()
    except MemoryError:
        # Imported lazily: only needed on the OOM path, so successful calls
        # never pay for (or depend on) the memory_guard import.
        from dexorder.memory_guard import cleanup_memory
        cleanup_memory()
        return _err_dict(
            stderr="Script exceeded memory limit. Try reducing the data range or batch size."
        )
    except Exception:
        return _err_dict(stderr=traceback.format_exc())
    return _normalise(result)

View File

@@ -33,7 +33,7 @@ from starlette.routing import Route, Mount
from dexorder import EventPublisher, start_lifecycle_manager, get_lifecycle_manager
from dexorder.api import set_api, API
from dexorder.conda_manager import sync_packages, install_packages, cleanup_extra_packages
from dexorder.conda_manager import sync_packages_async, install_packages_async, cleanup_extra_packages_async
from dexorder.events import EventType, UserEvent, DeliverySpec
from dexorder.impl.charting_api_impl import ChartingAPIImpl
from dexorder.impl.data_api_impl import DataAPIImpl
@@ -893,7 +893,7 @@ def create_mcp_server(config: Config, event_publisher: EventPublisher) -> Server
arguments.get("patch", [])
)
elif name == "python_write":
result = category_manager.write(
result = await category_manager.write(
category=arguments.get("category", ""),
name=arguments.get("name", ""),
description=arguments.get("description", ""),
@@ -920,10 +920,10 @@ def create_mcp_server(config: Config, event_publisher: EventPublisher) -> Server
logging.info(f"python_write '{arguments.get('name')}': no execution result (category={arguments.get('category')})")
if result.get("success"):
_upsert_type(workspace_store, category_manager, arguments.get("category", ""), arguments.get("name", ""))
cleanup_extra_packages(get_data_dir(), _get_env_yml())
await cleanup_extra_packages_async(get_data_dir(), _get_env_yml())
return content
elif name == "python_edit":
result = category_manager.edit(
result = await category_manager.edit(
category=arguments.get("category", ""),
name=arguments.get("name", ""),
code=arguments.get("code"),
@@ -951,7 +951,7 @@ def create_mcp_server(config: Config, event_publisher: EventPublisher) -> Server
logging.info(f"python_edit '{arguments.get('name')}': no execution result")
if result.get("success"):
_upsert_type(workspace_store, category_manager, arguments.get("category", ""), arguments.get("name", ""))
cleanup_extra_packages(get_data_dir(), _get_env_yml())
await cleanup_extra_packages_async(get_data_dir(), _get_env_yml())
return content
elif name == "python_read":
return category_manager.read(
@@ -963,7 +963,7 @@ def create_mcp_server(config: Config, event_publisher: EventPublisher) -> Server
category=arguments.get("category", "")
)
elif name == "python_log":
result = category_manager.git_log(
result = await category_manager.git_log(
category=arguments.get("category"),
name=arguments.get("name"),
limit=int(arguments.get("limit", 20))
@@ -973,7 +973,7 @@ def create_mcp_server(config: Config, event_publisher: EventPublisher) -> Server
lines.append(f"{c['short_hash']} {c['date'][:10]} {c['message']}")
return [TextContent(type="text", text="\n".join(lines))]
elif name == "python_revert":
result = category_manager.git_revert(
result = await category_manager.git_revert(
revision=arguments.get("revision", ""),
category=arguments.get("category", ""),
name=arguments.get("name", "")
@@ -989,13 +989,13 @@ def create_mcp_server(config: Config, event_publisher: EventPublisher) -> Server
_upsert_type(workspace_store, category_manager, arguments.get("category", ""), arguments.get("name", ""))
return [TextContent(type="text", text="\n".join(meta_parts))]
elif name == "python_delete":
result = category_manager.delete(
result = await category_manager.delete(
category=arguments.get("category", ""),
name=arguments.get("name", "")
)
if result.get("success"):
_remove_type(workspace_store, arguments.get("category", ""), arguments.get("name", ""))
cleanup_result = cleanup_extra_packages(get_data_dir(), _get_env_yml())
cleanup_result = await cleanup_extra_packages_async(get_data_dir(), _get_env_yml())
if cleanup_result.get("removed"):
result["packages_removed"] = cleanup_result["removed"]
parts = [f"success: {result['success']}"]
@@ -1004,14 +1004,14 @@ def create_mcp_server(config: Config, event_publisher: EventPublisher) -> Server
parts.append(f"{k}: {result[k]}")
return [TextContent(type="text", text="\n".join(parts))]
elif name == "conda_sync":
return sync_packages(
return await sync_packages_async(
data_dir=get_data_dir(),
environment_yml=_get_env_yml()
)
elif name == "conda_install":
return install_packages(arguments.get("packages", []))
return await install_packages_async(arguments.get("packages", []))
elif name == "execute_research":
result = category_manager.execute_research(name=arguments.get("name", ""))
result = await category_manager.execute_research(name=arguments.get("name", ""))
if "error" in result:
logging.error(f"execute_research '{arguments.get('name')}': {result['error']}")
return [TextContent(type="text", text=f"Error: {result['error']}")]
@@ -1113,6 +1113,8 @@ def create_streamable_http_app(mcp_server: Server) -> Starlette:
@contextlib.asynccontextmanager
async def lifespan(app: Starlette):
from dexorder.event_loop import install_thread_safe_asyncio_run
install_thread_safe_asyncio_run(asyncio.get_running_loop())
async with session_manager.run():
yield
@@ -1156,6 +1158,14 @@ class UserContainer:
# Load configuration
self.config.load()
# Python-level memory guard (RLIMIT_AS soft limit) — DISABLED.
# We assume nodes have ample memory (8Gi limits) and will revisit a
# proper RSS-based cgroup monitor later. The implementation is in
# dexorder/memory_guard.py if we want to re-enable.
# from dexorder.memory_guard import setup_memory_limit
# mem_cfg = self.config.config_data.get("memory", {})
# setup_memory_limit(fraction=float(mem_cfg.get("limit_fraction", 0.85)))
# Initialize data and charting API
data_cfg = self.config.config_data.get("data", {})
iceberg_cfg = data_cfg.get("iceberg", {})

View File

@@ -9,6 +9,8 @@ import { useShapeStore } from './stores/shapes'
import { useIndicatorStore } from './stores/indicators'
import { useIndicatorTypesStore } from './stores/indicatorTypes'
import { useChannelStore } from './stores/channel'
import { useResearchTypesStore } from './stores/researchTypes'
import { useStrategyTypesStore } from './stores/strategyTypes'
import { useStateSync } from './composables/useStateSync'
import { wsManager } from './composables/useWebSocket'
import { authService } from './composables/useAuth'
@@ -44,9 +46,18 @@ function onHDragMove(e: PointerEvent) {
chartWidth.value = Math.max(CHART_MIN_PX, Math.min(maxWidth, hDragStartWidth + delta))
}
// Clamp chartWidth so chart + chat always fit within the window
// Keep the chart pane narrow enough that the chat pane retains its minimum width.
// No-op when the window itself is too small to honour both minimums.
function clampChartWidth() {
  const available = window.innerWidth - CHAT_MIN_PX - 4
  if (available < CHART_MIN_PX) return
  chartWidth.value = Math.min(available, Math.max(CHART_MIN_PX, chartWidth.value))
}
// Check screen width for mobile layout
const checkMobile = () => {
isMobile.value = window.innerWidth < 768
if (!isMobile.value) clampChartWidth()
}
const chartStore = useChartStore()
@@ -108,11 +119,15 @@ const initializeApp = async () => {
const indicatorStore = useIndicatorStore()
const indicatorTypesStore = useIndicatorTypesStore()
const channelStore = useChannelStore()
const researchTypesStore = useResearchTypesStore()
const strategyTypesStore = useStrategyTypesStore()
const stateSync = useStateSync({
chartState: chartStore,
shapes: shapeStore,
indicators: indicatorStore,
indicator_types: indicatorTypesStore,
research_types: researchTypesStore,
strategy_types: strategyTypesStore,
channelState: channelStore
})
stateSyncCleanup = stateSync.cleanup
@@ -195,7 +210,7 @@ onBeforeUnmount(() => {
.chat-panel {
flex: 1;
min-width: 0;
min-width: 240px;
height: 100%;
overflow: hidden;
display: flex;

View File

@@ -7,6 +7,7 @@ import TabPanels from 'primevue/tabpanels'
import TabPanel from 'primevue/tabpanel'
import OrdersTab from './tabs/OrdersTab.vue'
import PlaceholderTab from './tabs/PlaceholderTab.vue'
import ResearchTab from './tabs/ResearchTab.vue'
interface TempTab {
id: string
@@ -81,9 +82,10 @@ defineExpose({
<template>
<div class="bottom-tray" :style="trayStyle">
<div v-if="isExpanded" class="tray-resize-handle" @pointerdown="startResize" @pointermove="onResizeMove" />
<Tabs :value="activeTab" class="tray-tabs">
<Tabs :value="isExpanded ? activeTab : null" class="tray-tabs">
<TabList class="tray-tab-list">
<Tab value="orders" @click="onTabClick('orders')">Orders</Tab>
<Tab value="research" @click="onTabClick('research')">Research</Tab>
<Tab value="strategies" @click="onTabClick('strategies')">Strategies</Tab>
<Tab value="positions" @click="onTabClick('positions')">Positions</Tab>
<Tab
@@ -102,9 +104,10 @@ defineExpose({
</button>
</TabList>
<TabPanels v-if="isExpanded" class="tray-panels">
<TabPanel value="positions" class="tray-panel"><PlaceholderTab label="Positions" /></TabPanel>
<TabPanel value="orders" class="tray-panel"><OrdersTab /></TabPanel>
<TabPanel value="strategies" class="tray-panel"><PlaceholderTab label="Strategies" /></TabPanel>
<TabPanel value="positions" class="tray-panel"><PlaceholderTab label="Positions" /></TabPanel>
<TabPanel value="research" class="tray-panel"><ResearchTab /></TabPanel>
<TabPanel
v-for="tab in tempTabs"
:key="tab.id"

View File

@@ -1,5 +1,5 @@
<script setup lang="ts">
import { ref, onMounted, onBeforeUnmount, watch } from 'vue'
import { ref, onMounted, onBeforeUnmount, watch, type WatchStopHandle } from 'vue'
import Card from 'primevue/card'
import { createTradingViewDatafeed } from '../composables/useTradingViewDatafeed'
import { useTradingViewShapes } from '../composables/useTradingViewShapes'
@@ -11,10 +11,11 @@ import type { IChartingLibraryWidget } from '../types/tradingview'
import { intervalToSeconds } from '../utils'
import { wsManager } from '../composables/useWebSocket'
// Convert seconds to TradingView interval string
// Convert seconds to TradingView interval string.
// TradingView uses plain minute numbers ("60", "240") for intraday,
// and "1D", "2D" etc for daily. Never use "H" suffix — it's not in supported_resolutions.
function secondsToInterval(seconds: number): string {
  // Whole days map to "<n>D"; everything else is expressed in plain minutes,
  // because "H"-suffixed resolutions are not in supported_resolutions —
  // 3600s must map to "60", never "1H" (see comment above).
  if (seconds % 86400 === 0) return `${seconds / 86400}D`
  return `${seconds / 60}` // plain number = minutes
}
@@ -27,9 +28,23 @@ let shapeCleanup: (() => void) | null = null // Cleanup function for shape sync
let indicatorCleanup: (() => void) | null = null // Cleanup function for indicator sync
let customIndicatorCleanup: (() => void) | null = null // Cleanup for custom TV studies
let chartInitialized = false // Guard against double-init on reconnect
let symbolWatcher: WatchStopHandle | null = null
// Initialise the chart at most once, and only when both the container element
// and a symbol are available. Safe to call repeatedly: chartInitialized guards
// against double-init (e.g. on websocket reconnect).
const maybeInitChart = () => {
  if (chartInitialized || !chartContainer.value) return
  if (!chartStore.symbol) {
    // Defer until backend provides a symbol
    // Register a one-shot watcher (only once) that re-enters maybeInitChart
    // as soon as the store publishes a non-empty symbol.
    if (!symbolWatcher) {
      symbolWatcher = watch(() => chartStore.symbol, (sym) => {
        if (sym) {
          // Stop and clear the watcher before re-entering so it cannot fire
          // again or leak past unmount.
          symbolWatcher?.()
          symbolWatcher = null
          maybeInitChart()
        }
      })
    }
    return
  }
  // Mark initialised before initChart() so re-entrant calls become no-ops.
  chartInitialized = true
  initChart()
}
@@ -205,6 +220,10 @@ function setupStoreWatchers() {
}
onBeforeUnmount(() => {
if (symbolWatcher) {
symbolWatcher()
symbolWatcher = null
}
// Cleanup shape synchronization
if (shapeCleanup) {
shapeCleanup()

View File

@@ -2,7 +2,6 @@
import { ref, onMounted, onUnmounted, computed, onBeforeUnmount, watch, nextTick } from 'vue'
import { register } from 'vue-advanced-chat'
import Badge from 'primevue/badge'
import Button from 'primevue/button'
import { wsManager } from '../composables/useWebSocket'
import type { WebSocketMessage } from '../composables/useWebSocket'
import { useChannelStore } from '../stores/channel'
@@ -188,12 +187,13 @@ const handleMessage = (data: WebSocketMessage) => {
}
} else if (data.type === 'agent_chunk') {
console.log('[ChatPanel] Processing agent_chunk, content:', data.content, 'done:', data.done)
// Always remove any tool-call bubble when the agent sends text, whether this
// is a new message or a continuation of an existing one (e.g. after a retry).
removeToolCallBubble()
const timestamp = new Date().toTimeString().split(' ')[0].slice(0, 5)
if (!currentStreamingMessageId) {
console.log('[ChatPanel] Starting new streaming message')
// Remove any ephemeral tool-call bubble before starting the real response
removeToolCallBubble()
// Set up streaming state and mark user message as seen
isAgentProcessing.value = true
currentStreamingMessageId = generateMessageId()
@@ -314,6 +314,7 @@ const stopAgent = () => {
// Send message handler
const sendMessage = async (event: any) => {
if (isAgentProcessing.value) { stopAgent(); return }
// Extract data from CustomEvent.detail[0]
const data = event.detail?.[0] || event
@@ -617,7 +618,11 @@ onUnmounted(() => {
<!-- Workspace loading overlay -->
<div v-if="!channelStore.isReady" class="workspace-loading">
<i class="pi pi-spin pi-spinner workspace-loading-spinner" />
<svg class="workspace-loading-spinner" viewBox="0 0 50 50" xmlns="http://www.w3.org/2000/svg">
<circle cx="25" cy="25" r="20" fill="none" stroke="rgba(8,153,129,0.2)" stroke-width="4"/>
<circle cx="25" cy="25" r="20" fill="none" stroke="#089981" stroke-width="4"
stroke-dasharray="80 200" stroke-linecap="round"/>
</svg>
<span class="workspace-loading-message">{{ channelStore.statusMessage || 'Connecting...' }}</span>
</div>
@@ -643,18 +648,18 @@ onUnmounted(() => {
@send-message="sendMessage"
@fetch-messages="fetchMessages"
@open-file="openFile"
/>
<!-- Stop button overlay -->
<div v-if="isAgentProcessing" class="stop-button-container">
<Button
icon="pi pi-stop-circle"
label="Stop"
severity="danger"
@click="stopAgent"
class="stop-button"
/>
</div>
>
<div
v-if="isAgentProcessing"
slot="send-icon"
@click.stop="stopAgent"
style="display:flex;align-items:center;justify-content:center;width:100%;height:100%"
>
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" width="20" height="20">
<rect x="4" y="4" width="16" height="16" rx="2" fill="#f23645"/>
</svg>
</div>
</vue-advanced-chat>
</div>
</template>
@@ -682,8 +687,13 @@ onUnmounted(() => {
}
.workspace-loading-spinner {
font-size: 2rem;
color: #089981;
width: 2rem;
height: 2rem;
animation: workspace-spin 0.8s linear infinite;
}
@keyframes workspace-spin {
to { transform: rotate(360deg); }
}
.workspace-loading-message {
@@ -721,24 +731,4 @@ onUnmounted(() => {
color: var(--p-surface-900);
}
.stop-button-container {
position: absolute;
bottom: 80px;
right: 20px;
z-index: 1000;
}
.stop-button {
box-shadow: 0 4px 12px rgba(0, 0, 0, 0.15);
animation: pulse 2s infinite;
}
@keyframes pulse {
0%, 100% {
opacity: 1;
}
50% {
opacity: 0.8;
}
}
</style>

View File

@@ -0,0 +1,108 @@
<script setup lang="ts">
import { ref, computed } from 'vue'
import { storeToRefs } from 'pinia'
import { useResearchTypesStore } from '../../stores/researchTypes'

// Reactive id→type record owned by the research-types store.
const { types } = storeToRefs(useResearchTypesStore())

// Ids of rows whose description panel is currently open.
const expanded = ref<Set<string>>(new Set())

// Flatten the record into [{ id, ...fields }] for v-for rendering.
const rows = computed(() =>
  Object.entries(types.value).map(([id, entry]) => ({ id, ...entry }))
)

// Toggle a row's detail panel. Set.delete() returns false when the id
// was absent, in which case we open the row by adding it.
function toggle(id: string) {
  const open = expanded.value
  if (!open.delete(id)) {
    open.add(id)
  }
}
</script>
<template>
<!-- Collapsible list of research types; each row expands to show its description. -->
<div class="research-tab">
<!-- Empty state when the store has no entries yet -->
<div v-if="rows.length === 0" class="empty">No research items</div>
<div v-for="row in rows" :key="row.id" class="research-row">
<!-- Header toggles the body; chevron reflects expanded state -->
<button class="row-header" @click="toggle(row.id)">
<i class="pi" :class="expanded.has(row.id) ? 'pi-chevron-down' : 'pi-chevron-right'" />
<span class="row-name">{{ row.display_name }}</span>
<span class="row-id">{{ row.id }}</span>
</button>
<!-- Body is only rendered while the row is expanded -->
<div v-if="expanded.has(row.id)" class="row-body">
<span v-if="row.description">{{ row.description }}</span>
<span v-else class="no-desc">No description</span>
</div>
</div>
</div>
</template>
<style scoped>
/* Scrollable column filling the parent tab area */
.research-tab {
flex: 1;
overflow-y: auto;
display: flex;
flex-direction: column;
}
/* Centered placeholder shown when there are no rows */
.empty {
color: #555;
text-align: center;
padding: 16px;
font-size: 12px;
}
.research-row {
border-bottom: 1px solid #1e1e1e;
}
/* Full-width unstyled button so the whole header line is clickable */
.row-header {
display: flex;
align-items: center;
gap: 6px;
width: 100%;
background: none;
border: none;
padding: 5px 10px;
cursor: pointer;
text-align: left;
color: #dbdbdb;
font-size: 12px;
}
.row-header:hover {
background: #1a1a1a;
}
/* Chevron icon */
.row-header .pi {
color: #666;
font-size: 10px;
flex-shrink: 0;
}
/* Name takes remaining width; id is right-aligned in monospace */
.row-name {
flex: 1;
font-weight: 500;
}
.row-id {
color: #555;
font-size: 11px;
font-family: monospace;
}
/* Expanded description; pre-wrap preserves newlines in the text */
.row-body {
padding: 6px 26px 8px;
font-size: 12px;
color: #aaa;
line-height: 1.5;
background: #0d0d0d;
white-space: pre-wrap;
}
.no-desc {
color: #444;
font-style: italic;
}
</style>

View File

@@ -60,8 +60,6 @@ export function useStateSync(stores: Record<string, Store>) {
currentSeqs[msg.store] = msg.seq;
saveStoredSeqs(currentSeqs);
console.log('[StateSync] Snapshot applied, new seq:', msg.seq);
} else {
console.warn('[StateSync] Store not found:', msg.store);
}
} else if (msg.type === 'patch') {
console.log('[StateSync] Processing patch for store:', msg.store, 'seq:', msg.seq);
@@ -89,8 +87,6 @@ export function useStateSync(stores: Record<string, Store>) {
currentSeqs[msg.store] = msg.seq;
saveStoredSeqs(currentSeqs);
console.log('[StateSync] Patch applied successfully, new seq:', msg.seq);
} else {
console.warn('[StateSync] Store not found:', msg.store);
}
}
};

View File

@@ -263,7 +263,10 @@ export class WebSocketDatafeed implements IBasicDataFeed {
throw err
})
.then((response) => {
if (response.history) {
if (response.error) {
console.error('[TradingView Datafeed] getBars server error:', response.error)
onError(response.error)
} else if (response.history) {
console.log('[TradingView Datafeed] Raw bar sample:', response.history.bars?.[0])
console.log('[TradingView Datafeed] Denominators:', denoms)
@@ -309,7 +312,7 @@ export class WebSocketDatafeed implements IBasicDataFeed {
this.sendRequest<any>({
type: 'subscribe_bars',
symbol: symbolInfo.ticker || symbolInfo.name,
resolution: resolution,
period_seconds: intervalToSeconds(resolution),
subscription_id: listenerGuid
})
.then((response) => {
@@ -328,8 +331,10 @@ export class WebSocketDatafeed implements IBasicDataFeed {
}
unsubscribeBars(listenerGuid: string): void {
const sub = this.subscriptions.get(listenerGuid)
this.sendRequest<any>({
type: 'unsubscribe_bars',
period_seconds: sub ? intervalToSeconds(sub.resolution) : 60,
subscription_id: listenerGuid
})
.then(() => {

View File

@@ -30,8 +30,14 @@ class WebSocketManager {
async connect(token: string): Promise<void> {
this.token = token
// Close existing connection if any
// Close existing connection if any — null out handlers first so the async
// onclose event from the old socket cannot reset sessionStatus after the
// new socket has already reached 'ready'.
if (this.ws) {
this.ws.onopen = null
this.ws.onmessage = null
this.ws.onerror = null
this.ws.onclose = null
this.ws.close()
this.ws = null
}

View File

@@ -0,0 +1,14 @@
import { defineStore } from 'pinia'
import { ref } from 'vue'

/** One research type entry; NOTE(review): presumably delivered by the backend sync layer — confirm. */
export interface ResearchType {
  display_name: string
  description?: string
  created_at: number
  modified_at: number
}

/**
 * Pinia setup store exposing all known research types keyed by id.
 * The store itself holds no logic; `types` starts empty and is
 * written to by external code.
 */
export const useResearchTypesStore = defineStore('research_types', () => {
  // id → ResearchType record
  const types = ref<Record<string, ResearchType>>({})

  return { types }
})

View File

@@ -0,0 +1,14 @@
import { defineStore } from 'pinia'
import { ref } from 'vue'

/** One strategy type entry; NOTE(review): presumably delivered by the backend sync layer — confirm. */
export interface StrategyType {
  display_name: string
  description?: string
  created_at: number
  modified_at: number
}

/**
 * Pinia setup store exposing all known strategy types keyed by id.
 * The store itself holds no logic; `types` starts empty and is
 * written to by external code.
 */
export const useStrategyTypesStore = defineStore('strategy_types', () => {
  // id → StrategyType record
  const types = ref<Record<string, StrategyType>>({})

  return { types }
})