bugfixes; research subproc; higher sandbox limits

This commit is contained in:
2026-04-16 18:11:26 -04:00
parent f80c943dc3
commit 3153e89d4f
54 changed files with 1947 additions and 498 deletions

View File

@@ -1,6 +1,37 @@
// CCXT data fetcher for historical OHLC and realtime ticks
import ccxt from 'ccxt';
/**
* Thrown when an exchange returns a 429 rate-limit response.
* retryAfterMs is derived from the exchange's Retry-After header when available.
*/
/**
 * Error raised when an exchange answers with a 429 rate-limit response.
 * Carries the normalized exchange name (uppercased) and the backoff hint
 * in milliseconds so callers (e.g. SlotPool) can delay re-offering slots.
 */
export class ExchangeRateLimitError extends Error {
  /**
   * @param {string} exchange - exchange name as reported by the fetcher (any case)
   * @param {number} retryAfterMs - suggested wait before retrying, in ms
   * @param {string} originalMessage - underlying CCXT error message
   */
  constructor(exchange, retryAfterMs, originalMessage) {
    // Message interpolates the exchange name exactly as passed in; only the
    // stored .exchange field is uppercased.
    super(`Rate limit on ${exchange}: retry after ${retryAfterMs}ms (${originalMessage})`);
    Object.assign(this, {
      name: 'ExchangeRateLimitError',
      exchange: exchange.toUpperCase(),
      retryAfterMs
    });
  }
}
/**
* Extract retry-after duration in milliseconds from a CCXT RateLimitExceeded error.
* Priority: Retry-After header → error message numeric → 30s fallback.
*/
/**
 * Extract retry-after duration in milliseconds from a CCXT RateLimitExceeded error.
 * Priority: Retry-After header → "N ms" in message → "N s/sec/seconds" in message
 * → 30s fallback.
 *
 * @param {Object} exchange - CCXT exchange instance (last_response_headers is read)
 * @param {Error} error - the RateLimitExceeded error whose message may embed a delay
 * @returns {number} retry delay in milliseconds (never negative; 30_000 fallback)
 */
function extractRetryAfterMs(exchange, error) {
  // Header key case varies by exchange/transport; check both common spellings.
  // Retry-After may also be an HTTP-date (RFC 9110) — parseFloat yields NaN
  // for those and we fall through to message parsing.
  const header = exchange.last_response_headers?.['retry-after']
    ?? exchange.last_response_headers?.['Retry-After'];
  if (header) {
    const secs = Number.parseFloat(header);
    if (!Number.isNaN(secs) && secs >= 0) return Math.ceil(secs * 1000);
  }
  // Some exchanges embed the delay in the message (e.g. "retry after 5000 ms")
  const msMatch = error.message?.match(/(\d+)\s*ms\b/i);
  if (msMatch) return Number.parseInt(msMatch[1], 10);
  // Trailing \b prevents false matches such as "429 status" (digits followed
  // by an unrelated word starting with 's'), which the unguarded pattern
  // would read as "429 seconds".
  const secMatch = error.message?.match(/(\d+(?:\.\d+)?)\s*s(?:ec(?:ond)?s?)?\b/i);
  if (secMatch) return Math.ceil(Number.parseFloat(secMatch[1]) * 1000);
  return 30_000;
}
export class CCXTFetcher {
constructor(config, logger, metadataGenerator = null) {
this.config = config;
@@ -135,9 +166,12 @@ export class CCXTFetcher {
break;
} catch (error) {
lastError = error;
const isRetryable = error.constructor?.name === 'NetworkError' ||
const isRateLimit = error.constructor?.name === 'RateLimitExceeded';
const isRetryable = !isRateLimit && (
error.constructor?.name === 'NetworkError' ||
error.constructor?.name === 'RequestTimeout' ||
error.constructor?.name === 'ExchangeNotAvailable';
error.constructor?.name === 'ExchangeNotAvailable'
);
this.logger.warn(
{
errorType: error.constructor?.name,
@@ -146,15 +180,21 @@ export class CCXTFetcher {
ticker,
since,
attempt,
retryable: isRetryable
retryable: isRetryable,
rateLimit: isRateLimit
},
'OHLC fetch attempt failed'
);
if (!isRetryable || attempt === FETCH_RETRIES) break;
if (isRateLimit || !isRetryable || attempt === FETCH_RETRIES) break;
await exchange.sleep(FETCH_RETRY_DELAY_MS * attempt);
}
}
if (lastError) {
if (lastError.constructor?.name === 'RateLimitExceeded') {
const retryAfterMs = extractRetryAfterMs(exchange, lastError);
this.logger.warn({ ticker, retryAfterMs }, 'OHLC fetch rate-limited by exchange');
throw new ExchangeRateLimitError(exchangeName, retryAfterMs, lastError.message);
}
this.logger.error(
{
errorType: lastError.constructor?.name,
@@ -278,6 +318,11 @@ export class CCXTFetcher {
// Convert to our Tick format
return trades.map(trade => this.convertToTick(trade, ticker, metadata));
} catch (error) {
if (error.constructor?.name === 'RateLimitExceeded') {
const retryAfterMs = extractRetryAfterMs(exchange, error);
this.logger.warn({ ticker, retryAfterMs }, 'Trades fetch rate-limited by exchange');
throw new ExchangeRateLimitError(exchangeName, retryAfterMs, error.message);
}
this.logger.error(
{ error: error.message, ticker },
'Error fetching trades'

View File

@@ -6,9 +6,10 @@ import { parse as parseYaml } from 'yaml';
import pino from 'pino';
import { ZmqClient } from './zmq-client.js';
import { KafkaProducer } from './kafka-producer.js';
import { CCXTFetcher } from './ccxt-fetcher.js';
import { CCXTFetcher, ExchangeRateLimitError } from './ccxt-fetcher.js';
import { RealtimePoller } from './realtime-poller.js';
import { SymbolMetadataGenerator } from './symbol-metadata-generator.js';
import { SlotType } from './proto/messages.js';
// Logger setup
const logger = pino({
@@ -64,10 +65,162 @@ function loadConfig() {
supported_exchanges: config.supported_exchanges || ['binance', 'coinbase', 'kraken'],
symbol_metadata_interval_ms: config.symbol_metadata_interval_ms || 6 * 60 * 60 * 1000,
// Per-exchange slot capacity
exchange_capacity: config.exchange_capacity || {
BINANCE: { historical_slots: 3, realtime_slots: 5 },
KRAKEN: { historical_slots: 2, realtime_slots: 3 },
COINBASE: { historical_slots: 2, realtime_slots: 4 }
},
...secrets
};
}
/**
* Manages work slots per exchange per job type.
*
* Each slot corresponds to one WorkerReady message sent to Flink. Flink consumes
* a slot when it dispatches a job. The slot is re-offered (via another WorkerReady)
* once the job completes, subject to any rate-limit backoff dictated by the exchange.
*/
class SlotPool {
  /**
   * @param {Object} exchangeCapacity - map of exchange name → { historical_slots, realtime_slots }
   * @param {Object} zmqClient - ZmqClient; sendTypedReady(exchange, slotType) offers one slot
   * @param {Object} logger - pino child logger
   */
  constructor(exchangeCapacity, zmqClient, logger) {
    this.zmqClient = zmqClient;
    this.logger = logger;
    // Key: 'EXCHANGE|TYPE' (e.g. 'BINANCE|HISTORICAL')
    // Value: { max, active: Set<jobId>, backoffUntil: ms timestamp }
    this.slots = new Map();
    for (const [exchange, cap] of Object.entries(exchangeCapacity)) {
      const ex = exchange.toUpperCase();
      this.slots.set(`${ex}|HISTORICAL`, {
        max: cap.historical_slots ?? 2,
        active: new Set(),
        backoffUntil: 0
      });
      this.slots.set(`${ex}|REALTIME`, {
        max: cap.realtime_slots ?? 3,
        active: new Set(),
        backoffUntil: 0
      });
    }
    // jobId → { exchange, type } for release tracking
    this.jobMap = new Map();
    // Timers for deferred re-offers scheduled during rate-limit backoff.
    // Tracked so shutdown() can cancel them — otherwise a delayed offer could
    // fire against a closed ZMQ socket and keep the process alive.
    this.pendingTimers = new Set();
    this.isShutdown = false;
  }
  /**
   * Register the onConnected callback so slot offers are sent on every
   * TCP (re)connect rather than once at startup. Handles both the initial
   * connection race (Flink ROUTER not yet ready) and Flink restarts.
   */
  init() {
    this.zmqClient.onConnected = () => this._offerAllFreeSlots();
    this.logger.info(
      { slots: [...this.slots.entries()].map(([k, v]) => `${k}:${v.max}`) },
      'Slot pool initialized — will offer slots on connect'
    );
  }
  /**
   * Re-offer all currently-free slots. Called on every TCP (re)connect.
   * Sends (max - active) WorkerReady messages per exchange+type key.
   */
  async _offerAllFreeSlots() {
    const summary = [];
    for (const [key, slot] of this.slots) {
      const [exchange, type] = key.split('|');
      const freeCount = slot.max - slot.active.size;
      for (let i = 0; i < freeCount; i++) {
        await this.zmqClient.sendTypedReady(exchange, SlotType[type]);
      }
      summary.push(`${key}:${freeCount}/${slot.max}`);
    }
    this.logger.info({ offered: summary }, 'Re-offered all free slots on connect');
  }
  /**
   * Record a slot as occupied by jobId.
   * @param {string} jobId
   * @param {string} exchange - e.g. 'BINANCE' (any case)
   * @param {string} type - 'HISTORICAL' | 'REALTIME'
   * @returns {boolean} true if a slot was available and consumed
   */
  consumeSlot(jobId, exchange, type) {
    const key = `${exchange.toUpperCase()}|${type}`;
    const slot = this.slots.get(key);
    if (slot) {
      if (slot.active.size >= slot.max) {
        this.logger.warn({ jobId, key, active: slot.active.size, max: slot.max }, 'Slot capacity exceeded — rejecting job');
        return false;
      }
      slot.active.add(jobId);
      this.jobMap.set(jobId, { exchange: exchange.toUpperCase(), type });
      this.logger.debug({ jobId, key, active: slot.active.size, max: slot.max }, 'Slot consumed');
      return true;
    }
    this.logger.warn({ jobId, key }, 'No slot config for this exchange+type');
    return false;
  }
  /**
   * Release the slot occupied by jobId and re-offer it to Flink (after any backoff).
   * Unknown jobIds are logged and ignored (idempotent double-release).
   */
  async releaseSlot(jobId) {
    const info = this.jobMap.get(jobId);
    if (!info) {
      this.logger.warn({ jobId }, 'releaseSlot called for unknown jobId');
      return;
    }
    this.jobMap.delete(jobId);
    const key = `${info.exchange}|${info.type}`;
    const slot = this.slots.get(key);
    if (slot) {
      slot.active.delete(jobId);
      await this._offerSlot(info.exchange, info.type, slot);
    }
  }
  /**
   * Record a rate limit from the exchange. Delays slot re-offer by retryAfterMs.
   * Math.max keeps the furthest-out deadline when multiple jobs report limits.
   * @param {string} exchange
   * @param {string} type - 'HISTORICAL' | 'REALTIME'
   * @param {number} retryAfterMs
   */
  reportRateLimit(exchange, type, retryAfterMs) {
    const key = `${exchange.toUpperCase()}|${type}`;
    const slot = this.slots.get(key);
    if (slot) {
      slot.backoffUntil = Math.max(slot.backoffUntil, Date.now() + retryAfterMs);
      this.logger.warn({ exchange, type, retryAfterMs }, 'Rate limit backoff set for slot');
    }
  }
  /**
   * Offer one slot to Flink, deferring via setTimeout while the slot's
   * backoff window is still open. No-op after shutdown.
   */
  async _offerSlot(exchange, type, slot) {
    if (this.isShutdown) return;
    const now = Date.now();
    if (now < slot.backoffUntil) {
      const delay = slot.backoffUntil - now;
      this.logger.info({ exchange, type, delayMs: delay }, 'Slot in backoff — scheduling re-offer');
      const timer = setTimeout(() => {
        this.pendingTimers.delete(timer);
        // Fire-and-forget: _offerSlot handles its own send errors.
        void this._offerSlot(exchange, type, slot);
      }, delay);
      this.pendingTimers.add(timer);
      return;
    }
    try {
      await this.zmqClient.sendTypedReady(exchange, SlotType[type]);
      this.logger.debug({ exchange, type }, 'Slot re-offered to Flink');
    } catch (err) {
      this.logger.error({ exchange, type, error: err.message }, 'Failed to re-offer slot');
    }
  }
  /**
   * Cancel all pending backoff re-offer timers and stop future offers.
   * Without this, a deferred _offerSlot could fire after the ZMQ socket
   * is closed and keep the event loop alive for the backoff duration.
   */
  shutdown() {
    this.isShutdown = true;
    for (const timer of this.pendingTimers) clearTimeout(timer);
    this.pendingTimers.clear();
  }
}
/** Extract exchange name from ticker string, e.g. "BTC/USDT.BINANCE" → "BINANCE" */
function exchangeOf(ticker) {
const lastDot = ticker?.lastIndexOf('.');
return (lastDot >= 0) ? ticker.slice(lastDot + 1).toUpperCase() : 'UNKNOWN';
}
class IngestorWorker {
constructor(config, logger) {
this.config = config;
@@ -92,7 +245,22 @@ class IngestorWorker {
logger.child({ component: 'poller' })
);
// jobId → active realtime subscription (for stop handling)
this.pool = new SlotPool(
config.exchange_capacity,
this.zmqClient,
logger.child({ component: 'pool' })
);
// When realtime poller terminates a subscription due to repeated errors, release its slot.
this.realtimePoller.onJobComplete = (jobId, error) => {
if (error instanceof ExchangeRateLimitError) {
this.pool.reportRateLimit(error.exchange, 'REALTIME', error.retryAfterMs);
}
this.pool.releaseSlot(jobId).catch(err =>
this.logger.error({ jobId, error: err.message }, 'Failed to release slot after realtime error'));
};
// jobId set for active realtime subscriptions
this.activeRealtime = new Set();
this.isShutdown = false;
@@ -108,7 +276,10 @@ class IngestorWorker {
this.zmqClient.onWorkAssign = req => this.handleWorkAssign(req);
this.zmqClient.onWorkStop = jobId => this.handleWorkStop(jobId);
await this.zmqClient.connect(); // also sends WorkerReady
// Register slot offer callback before connecting so we don't miss the event
this.pool.init();
await this.zmqClient.connect();
// Generate symbol metadata on startup
this.logger.info('Generating initial symbol metadata');
@@ -139,18 +310,26 @@ class IngestorWorker {
*/
handleWorkAssign(request) {
const { jobId, requestId, type, ticker } = request;
const exchange = exchangeOf(ticker);
this.logger.info({ jobId, requestId, type, ticker }, 'Received WorkAssign');
this.logger.info({ jobId, requestId, type, ticker, exchange }, 'Received WorkAssign');
// HISTORICAL_OHLC = 0 (proto3 default, may appear as undefined or 'HISTORICAL_OHLC')
const isHistorical = !type || type === 'HISTORICAL_OHLC' || type === 0;
const isRealtime = type === 'REALTIME_TICKS' || type === 1;
if (isHistorical) {
if (!this.pool.consumeSlot(jobId, exchange, 'HISTORICAL')) {
this.zmqClient.sendReject(jobId, 'Slot capacity exceeded').catch(() => {});
return;
}
this.handleHistoricalRequest(request).catch(err => {
this.logger.error({ jobId, requestId, error: err.message }, 'Unexpected error in historical handler');
});
} else if (isRealtime) {
if (!this.pool.consumeSlot(jobId, exchange, 'REALTIME')) {
this.zmqClient.sendReject(jobId, 'Slot capacity exceeded').catch(() => {});
return;
}
this.handleRealtimeRequest(request);
} else {
this.logger.warn({ jobId, type }, 'Unknown request type — rejecting');
@@ -165,7 +344,9 @@ class IngestorWorker {
this.logger.info({ jobId }, 'Received WorkStop — cancelling realtime subscription');
this.realtimePoller.cancelSubscription(jobId);
this.activeRealtime.delete(jobId);
// No WorkComplete needed — Flink sent the stop, it already knows
this.pool.releaseSlot(jobId).catch(err =>
this.logger.warn({ jobId, error: err.message }, 'Failed to release slot after WorkStop'));
// No WorkComplete needed — Flink sent the stop, it already knows.
}
/**
@@ -174,10 +355,14 @@ class IngestorWorker {
*/
async handleHistoricalRequest(request) {
const { jobId, requestId, ticker, historical, clientId: client_id } = request;
const exchange = exchangeOf(ticker);
const { startTime: start_time, endTime: end_time, periodSeconds: period_seconds, limit } = historical || {};
this.logger.info({ jobId, requestId, ticker, period_seconds }, 'Processing historical OHLC request');
// Immediately ack to reset Flink's dispatch-time timeout clock.
await this.zmqClient.sendHeartbeat(jobId);
try {
const candles = await this.ccxtFetcher.fetchHistoricalOHLC(
ticker, start_time, end_time, period_seconds, limit
@@ -193,7 +378,10 @@ class IngestorWorker {
const isLastPage = (i + PAGE_SIZE) >= candles.length;
await this.kafkaProducer.writeOHLCs(this.config.kafka_ohlc_topic, page, metadata, isLastPage);
}
this.logger.info({ jobId, requestId, ticker, count: candles.length, pages: Math.ceil(candles.length / PAGE_SIZE) }, 'Wrote all pages to Kafka');
this.logger.info(
{ jobId, requestId, ticker, count: candles.length, pages: Math.ceil(candles.length / PAGE_SIZE) },
'Wrote all pages to Kafka'
);
} else {
await this.kafkaProducer.writeMarker(this.config.kafka_ohlc_topic, {
request_id: requestId, client_id, ticker, period_seconds, start_time, end_time,
@@ -207,6 +395,10 @@ class IngestorWorker {
} catch (error) {
this.logger.error({ jobId, requestId, ticker, error: error.message }, 'Historical request failed');
if (error instanceof ExchangeRateLimitError) {
this.pool.reportRateLimit(exchange, 'HISTORICAL', error.retryAfterMs);
}
try {
await this.kafkaProducer.writeMarker(this.config.kafka_ohlc_topic, {
request_id: requestId, client_id, ticker, period_seconds, start_time, end_time,
@@ -218,11 +410,14 @@ class IngestorWorker {
await this.zmqClient.sendComplete(jobId, false, error.message);
}
// Release slot regardless of success or failure
this.pool.releaseSlot(jobId).catch(err =>
this.logger.error({ jobId, error: err.message }, 'Failed to release historical slot'));
}
/**
* Start realtime tick polling for a job dispatched by Flink.
* Ticks flow: exchange → Kafka market-tick → Flink → OHLC bars → clients.
*/
handleRealtimeRequest(request) {
const { jobId, requestId, ticker } = request;
@@ -247,6 +442,7 @@ class IngestorWorker {
if (this.metadataInterval) clearInterval(this.metadataInterval);
this.pool.shutdown();
this.realtimePoller.shutdown();
await this.ccxtFetcher.close();
await this.metadataGenerator.close();

View File

@@ -18,6 +18,10 @@ export class RealtimePoller {
this.pollingLoop = null;
this.heartbeatLoop = null;
// Called with (jobId, error) when a subscription terminates abnormally.
// Set by IngestorWorker to release the slot in SlotPool.
this.onJobComplete = null;
}
/**
@@ -147,6 +151,7 @@ export class RealtimePoller {
} catch (zmqErr) {
this.logger.error({ jobId, error: zmqErr.message }, 'Failed to send WorkComplete after error');
}
if (this.onJobComplete) this.onJobComplete(jobId, error);
}
}
}

View File

@@ -28,63 +28,61 @@ export class ZmqClient {
this.dealerSocket = null;
this.isShutdown = false;
this.activeJobId = null;
this._idleHeartbeatInterval = null;
this.supportedExchanges = (config.supported_exchanges || ['BINANCE', 'COINBASE'])
.map(e => e.toUpperCase());
// Callbacks set by IngestorWorker
this.onWorkAssign = null; // (DataRequest) => void
this.onWorkStop = null; // (jobId) => void
// Callbacks set by IngestorWorker / SlotPool
this.onWorkAssign = null; // (DataRequest) => void
this.onWorkStop = null; // (jobId) => void
this.onConnected = null; // async () => void — fires on initial connect AND reconnect
}
/**
* Connect DEALER socket to Flink IngestorBroker (ROUTER).
* Sends WorkerReady immediately so Flink knows this worker is available.
* Fires onConnected on every TCP (re)connect so SlotPool can re-offer slots.
*/
async connect() {
const { flink_hostname, ingestor_broker_port = 5567 } = this.config;
this.dealerSocket = new zmq.Dealer();
const endpoint = `tcp://${flink_hostname}:${ingestor_broker_port}`;
await this.dealerSocket.connect(endpoint);
this.logger.info(`Connected DEALER to Flink IngestorBroker at ${endpoint}`);
// Register as available
await this.sendReady();
// Periodically re-send WorkerReady when idle, to recover from missed initial registration
this._idleHeartbeatInterval = setInterval(() => {
if (this.activeJobId === null && !this.isShutdown) {
this.sendReady().catch(err =>
this.logger.warn({ error: err.message }, 'Failed to re-send WorkerReady'));
// Subscribe to connection events BEFORE calling connect() so we catch the
// initial establishment. The 'connect' event fires on initial TCP handshake
// and again after every ZMQ reconnect (e.g. Flink restart).
this.dealerSocket.events.on('connect', ({ address }) => {
this.logger.info({ address }, 'DEALER connected to broker');
if (this.onConnected) {
this.onConnected().catch(err =>
this.logger.error({ error: err.message }, 'onConnected callback failed'));
}
}, 30_000);
});
const endpoint = `tcp://${flink_hostname}:${ingestor_broker_port}`;
this.dealerSocket.connect(endpoint);
this.logger.info(`Connecting DEALER to Flink IngestorBroker at ${endpoint}`);
// Start receiving work in background
this._receiveLoop();
}
/**
* Send WorkerReady — called on connect and after each COMPLETE.
* Send one typed WorkerReady slot offer.
* @param {string} exchange - Exchange name (e.g. 'BINANCE')
* @param {number} slotType - SlotType enum value (0=ANY, 1=HISTORICAL, 2=REALTIME)
*/
async sendReady() {
async sendTypedReady(exchange, slotType) {
const frames = encodeBrokerMessage(
MessageTypeId.WORKER_READY,
{ exchanges: this.supportedExchanges },
{ exchanges: [exchange], jobType: slotType },
WorkerReady
);
await this.dealerSocket.send(frames);
this.logger.info({ exchanges: this.supportedExchanges }, 'Sent WorkerReady');
this.logger.debug({ exchange, slotType }, 'Sent WorkerReady slot offer');
}
/**
* Send WorkComplete after a historical job finishes.
* Automatically sends WorkerReady so Flink returns us to the free pool.
* Slot re-registration is handled by SlotPool after this call.
*/
async sendComplete(jobId, success, errorMessage) {
this.activeJobId = null;
const frames = encodeBrokerMessage(
MessageTypeId.WORK_COMPLETE,
{
@@ -96,9 +94,6 @@ export class ZmqClient {
);
await this.dealerSocket.send(frames);
this.logger.info({ jobId, success }, 'Sent WorkComplete');
// Return to free pool
await this.sendReady();
}
/**
@@ -153,12 +148,10 @@ export class ZmqClient {
const payload = frames[2].slice(1);
if (typeId === MessageTypeId.WORK_ASSIGN) {
// DataRequest protobuf
const request = DataRequest.decode(payload);
const req = DataRequest.toObject(request, {
longs: String, enums: String, bytes: Buffer
});
this.activeJobId = req.jobId;
this.logger.info(
{ jobId: req.jobId, requestId: req.requestId, type: req.type, ticker: req.ticker },
'Received WorkAssign from broker'
@@ -192,10 +185,6 @@ export class ZmqClient {
async shutdown() {
this.isShutdown = true;
if (this._idleHeartbeatInterval) {
clearInterval(this._idleHeartbeatInterval);
this._idleHeartbeatInterval = null;
}
this.logger.info('Shutting down ZMQ DEALER connection');
if (this.dealerSocket) {
this.dealerSocket.close();