bugfixes; research subproc; higher sandbox limits
This commit is contained in:
@@ -6,9 +6,10 @@ import { parse as parseYaml } from 'yaml';
|
||||
import pino from 'pino';
|
||||
import { ZmqClient } from './zmq-client.js';
|
||||
import { KafkaProducer } from './kafka-producer.js';
|
||||
import { CCXTFetcher } from './ccxt-fetcher.js';
|
||||
import { CCXTFetcher, ExchangeRateLimitError } from './ccxt-fetcher.js';
|
||||
import { RealtimePoller } from './realtime-poller.js';
|
||||
import { SymbolMetadataGenerator } from './symbol-metadata-generator.js';
|
||||
import { SlotType } from './proto/messages.js';
|
||||
|
||||
// Logger setup
|
||||
const logger = pino({
|
||||
@@ -64,10 +65,162 @@ function loadConfig() {
|
||||
supported_exchanges: config.supported_exchanges || ['binance', 'coinbase', 'kraken'],
|
||||
symbol_metadata_interval_ms: config.symbol_metadata_interval_ms || 6 * 60 * 60 * 1000,
|
||||
|
||||
// Per-exchange slot capacity
|
||||
exchange_capacity: config.exchange_capacity || {
|
||||
BINANCE: { historical_slots: 3, realtime_slots: 5 },
|
||||
KRAKEN: { historical_slots: 2, realtime_slots: 3 },
|
||||
COINBASE: { historical_slots: 2, realtime_slots: 4 }
|
||||
},
|
||||
|
||||
...secrets
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Manages work slots per exchange per job type.
|
||||
*
|
||||
* Each slot corresponds to one WorkerReady message sent to Flink. Flink consumes
|
||||
* a slot when it dispatches a job. The slot is re-offered (via another WorkerReady)
|
||||
* once the job completes, subject to any rate-limit backoff dictated by the exchange.
|
||||
*/
|
||||
/**
 * Manages work slots per exchange per job type.
 *
 * Each slot corresponds to one WorkerReady message sent to Flink. Flink consumes
 * a slot when it dispatches a job. The slot is re-offered (via another WorkerReady)
 * once the job completes, subject to any rate-limit backoff dictated by the exchange.
 */
class SlotPool {
  /**
   * @param {Object} exchangeCapacity - map of exchange name → { historical_slots, realtime_slots }
   * @param {Object} zmqClient - client exposing sendTypedReady(exchange, slotType) and onConnected
   * @param {Object} logger - pino-style logger (info/debug/warn/error)
   */
  constructor(exchangeCapacity, zmqClient, logger) {
    this.zmqClient = zmqClient;
    this.logger = logger;

    // Key: 'EXCHANGE|TYPE' (e.g. 'BINANCE|HISTORICAL')
    // Value: { max, active: Set<jobId>, backoffUntil: ms timestamp }
    this.slots = new Map();

    for (const [exchange, cap] of Object.entries(exchangeCapacity)) {
      const ex = exchange.toUpperCase();
      this.slots.set(`${ex}|HISTORICAL`, {
        max: cap.historical_slots ?? 2,
        active: new Set(),
        backoffUntil: 0
      });
      this.slots.set(`${ex}|REALTIME`, {
        max: cap.realtime_slots ?? 3,
        active: new Set(),
        backoffUntil: 0
      });
    }

    // jobId → { exchange, type } for release tracking
    this.jobMap = new Map();

    // Deferred re-offer timers created while a slot is in backoff, tracked so
    // shutdown() can cancel them — otherwise a timer could fire after shutdown
    // and send a WorkerReady from a worker that no longer exists.
    this.pendingTimers = new Set();
    this.isShutdown = false;
  }

  /**
   * Register the onConnected callback so slot offers are sent on every
   * TCP (re)connect rather than once at startup. Handles both the initial
   * connection race (Flink ROUTER not yet ready) and Flink restarts.
   */
  init() {
    this.zmqClient.onConnected = () => this._offerAllFreeSlots();
    this.logger.info(
      { slots: [...this.slots.entries()].map(([k, v]) => `${k}:${v.max}`) },
      'Slot pool initialized — will offer slots on connect'
    );
  }

  /**
   * Re-offer all currently-free slots. Called on every TCP (re)connect.
   * Sends (max - active) WorkerReady messages per exchange+type key.
   *
   * Routes each offer through _offerSlot so that active rate-limit backoff is
   * honored (fix: previously reconnect offers bypassed backoffUntil entirely,
   * hammering an exchange that had just rate-limited us).
   */
  async _offerAllFreeSlots() {
    const summary = [];
    for (const [key, slot] of this.slots) {
      const [exchange, type] = key.split('|');
      const freeCount = slot.max - slot.active.size;
      for (let i = 0; i < freeCount; i++) {
        await this._offerSlot(exchange, type, slot);
      }
      summary.push(`${key}:${freeCount}/${slot.max}`);
    }
    this.logger.info({ offered: summary }, 'Re-offered all free slots on connect');
  }

  /**
   * Record a slot as occupied by jobId.
   * @param {string} jobId
   * @param {string} exchange - e.g. 'BINANCE'
   * @param {string} type - 'HISTORICAL' | 'REALTIME'
   * @returns {boolean} true if a slot was available and consumed
   */
  consumeSlot(jobId, exchange, type) {
    const key = `${exchange.toUpperCase()}|${type}`;
    const slot = this.slots.get(key);
    if (slot) {
      if (slot.active.size >= slot.max) {
        this.logger.warn({ jobId, key, active: slot.active.size, max: slot.max }, 'Slot capacity exceeded — rejecting job');
        return false;
      }
      slot.active.add(jobId);
      this.jobMap.set(jobId, { exchange: exchange.toUpperCase(), type });
      this.logger.debug({ jobId, key, active: slot.active.size, max: slot.max }, 'Slot consumed');
      return true;
    }
    this.logger.warn({ jobId, key }, 'No slot config for this exchange+type');
    return false;
  }

  /**
   * Release the slot occupied by jobId and re-offer it to Flink (after any backoff).
   * Safe to call with an unknown jobId (logs a warning and returns).
   */
  async releaseSlot(jobId) {
    const info = this.jobMap.get(jobId);
    if (!info) {
      this.logger.warn({ jobId }, 'releaseSlot called for unknown jobId');
      return;
    }
    this.jobMap.delete(jobId);
    const key = `${info.exchange}|${info.type}`;
    const slot = this.slots.get(key);
    if (slot) {
      slot.active.delete(jobId);
      await this._offerSlot(info.exchange, info.type, slot);
    }
  }

  /**
   * Record a rate limit from the exchange. Delays slot re-offer by retryAfterMs.
   * Uses Math.max so a shorter retry never truncates an existing longer backoff.
   * @param {string} exchange
   * @param {string} type - 'HISTORICAL' | 'REALTIME'
   * @param {number} retryAfterMs
   */
  reportRateLimit(exchange, type, retryAfterMs) {
    const key = `${exchange.toUpperCase()}|${type}`;
    const slot = this.slots.get(key);
    if (slot) {
      slot.backoffUntil = Math.max(slot.backoffUntil, Date.now() + retryAfterMs);
      this.logger.warn({ exchange, type, retryAfterMs }, 'Rate limit backoff set for slot');
    }
  }

  /**
   * Offer one slot to Flink. If the slot is in rate-limit backoff, schedule a
   * deferred retry instead; the timer is tracked for cancellation on shutdown.
   */
  async _offerSlot(exchange, type, slot) {
    if (this.isShutdown) return;
    const now = Date.now();
    if (now < slot.backoffUntil) {
      const delay = slot.backoffUntil - now;
      this.logger.info({ exchange, type, delayMs: delay }, 'Slot in backoff — scheduling re-offer');
      const timer = setTimeout(() => {
        this.pendingTimers.delete(timer);
        this._offerSlot(exchange, type, slot);
      }, delay);
      this.pendingTimers.add(timer);
      return;
    }
    try {
      await this.zmqClient.sendTypedReady(exchange, SlotType[type]);
      this.logger.debug({ exchange, type }, 'Slot re-offered to Flink');
    } catch (err) {
      this.logger.error({ exchange, type, error: err.message }, 'Failed to re-offer slot');
    }
  }

  /**
   * Cancel all pending deferred re-offer timers and stop offering slots.
   * (fix: previously a no-op, so backoff timers could fire — and send
   * WorkerReady — after the worker had shut down.)
   */
  shutdown() {
    this.isShutdown = true;
    for (const timer of this.pendingTimers) clearTimeout(timer);
    this.pendingTimers.clear();
  }
}
|
||||
|
||||
/**
 * Extract the exchange name from a ticker string, e.g. "BTC/USDT.BINANCE" → "BINANCE".
 * Returns 'UNKNOWN' when the ticker is null/undefined, has no '.' separator, or the
 * suffix after the last '.' is empty (fix: "BTC/USDT." previously returned '').
 * @param {string|null|undefined} ticker
 * @returns {string} upper-cased exchange name, or 'UNKNOWN'
 */
function exchangeOf(ticker) {
  const lastDot = ticker?.lastIndexOf('.') ?? -1;
  if (lastDot < 0) return 'UNKNOWN';
  const suffix = ticker.slice(lastDot + 1).toUpperCase();
  return suffix !== '' ? suffix : 'UNKNOWN';
}
|
||||
|
||||
class IngestorWorker {
|
||||
constructor(config, logger) {
|
||||
this.config = config;
|
||||
@@ -92,7 +245,22 @@ class IngestorWorker {
|
||||
logger.child({ component: 'poller' })
|
||||
);
|
||||
|
||||
// jobId → active realtime subscription (for stop handling)
|
||||
this.pool = new SlotPool(
|
||||
config.exchange_capacity,
|
||||
this.zmqClient,
|
||||
logger.child({ component: 'pool' })
|
||||
);
|
||||
|
||||
// When realtime poller terminates a subscription due to repeated errors, release its slot.
|
||||
this.realtimePoller.onJobComplete = (jobId, error) => {
|
||||
if (error instanceof ExchangeRateLimitError) {
|
||||
this.pool.reportRateLimit(error.exchange, 'REALTIME', error.retryAfterMs);
|
||||
}
|
||||
this.pool.releaseSlot(jobId).catch(err =>
|
||||
this.logger.error({ jobId, error: err.message }, 'Failed to release slot after realtime error'));
|
||||
};
|
||||
|
||||
// jobId set for active realtime subscriptions
|
||||
this.activeRealtime = new Set();
|
||||
|
||||
this.isShutdown = false;
|
||||
@@ -108,7 +276,10 @@ class IngestorWorker {
|
||||
this.zmqClient.onWorkAssign = req => this.handleWorkAssign(req);
|
||||
this.zmqClient.onWorkStop = jobId => this.handleWorkStop(jobId);
|
||||
|
||||
await this.zmqClient.connect(); // also sends WorkerReady
|
||||
// Register slot offer callback before connecting so we don't miss the event
|
||||
this.pool.init();
|
||||
|
||||
await this.zmqClient.connect();
|
||||
|
||||
// Generate symbol metadata on startup
|
||||
this.logger.info('Generating initial symbol metadata');
|
||||
@@ -139,18 +310,26 @@ class IngestorWorker {
|
||||
*/
|
||||
handleWorkAssign(request) {
|
||||
const { jobId, requestId, type, ticker } = request;
|
||||
const exchange = exchangeOf(ticker);
|
||||
|
||||
this.logger.info({ jobId, requestId, type, ticker }, 'Received WorkAssign');
|
||||
this.logger.info({ jobId, requestId, type, ticker, exchange }, 'Received WorkAssign');
|
||||
|
||||
// HISTORICAL_OHLC = 0 (proto3 default, may appear as undefined or 'HISTORICAL_OHLC')
|
||||
const isHistorical = !type || type === 'HISTORICAL_OHLC' || type === 0;
|
||||
const isRealtime = type === 'REALTIME_TICKS' || type === 1;
|
||||
|
||||
if (isHistorical) {
|
||||
if (!this.pool.consumeSlot(jobId, exchange, 'HISTORICAL')) {
|
||||
this.zmqClient.sendReject(jobId, 'Slot capacity exceeded').catch(() => {});
|
||||
return;
|
||||
}
|
||||
this.handleHistoricalRequest(request).catch(err => {
|
||||
this.logger.error({ jobId, requestId, error: err.message }, 'Unexpected error in historical handler');
|
||||
});
|
||||
} else if (isRealtime) {
|
||||
if (!this.pool.consumeSlot(jobId, exchange, 'REALTIME')) {
|
||||
this.zmqClient.sendReject(jobId, 'Slot capacity exceeded').catch(() => {});
|
||||
return;
|
||||
}
|
||||
this.handleRealtimeRequest(request);
|
||||
} else {
|
||||
this.logger.warn({ jobId, type }, 'Unknown request type — rejecting');
|
||||
@@ -165,7 +344,9 @@ class IngestorWorker {
|
||||
this.logger.info({ jobId }, 'Received WorkStop — cancelling realtime subscription');
|
||||
this.realtimePoller.cancelSubscription(jobId);
|
||||
this.activeRealtime.delete(jobId);
|
||||
// No WorkComplete needed — Flink sent the stop, it already knows
|
||||
this.pool.releaseSlot(jobId).catch(err =>
|
||||
this.logger.warn({ jobId, error: err.message }, 'Failed to release slot after WorkStop'));
|
||||
// No WorkComplete needed — Flink sent the stop, it already knows.
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -174,10 +355,14 @@ class IngestorWorker {
|
||||
*/
|
||||
async handleHistoricalRequest(request) {
|
||||
const { jobId, requestId, ticker, historical, clientId: client_id } = request;
|
||||
const exchange = exchangeOf(ticker);
|
||||
const { startTime: start_time, endTime: end_time, periodSeconds: period_seconds, limit } = historical || {};
|
||||
|
||||
this.logger.info({ jobId, requestId, ticker, period_seconds }, 'Processing historical OHLC request');
|
||||
|
||||
// Immediately ack to reset Flink's dispatch-time timeout clock.
|
||||
await this.zmqClient.sendHeartbeat(jobId);
|
||||
|
||||
try {
|
||||
const candles = await this.ccxtFetcher.fetchHistoricalOHLC(
|
||||
ticker, start_time, end_time, period_seconds, limit
|
||||
@@ -193,7 +378,10 @@ class IngestorWorker {
|
||||
const isLastPage = (i + PAGE_SIZE) >= candles.length;
|
||||
await this.kafkaProducer.writeOHLCs(this.config.kafka_ohlc_topic, page, metadata, isLastPage);
|
||||
}
|
||||
this.logger.info({ jobId, requestId, ticker, count: candles.length, pages: Math.ceil(candles.length / PAGE_SIZE) }, 'Wrote all pages to Kafka');
|
||||
this.logger.info(
|
||||
{ jobId, requestId, ticker, count: candles.length, pages: Math.ceil(candles.length / PAGE_SIZE) },
|
||||
'Wrote all pages to Kafka'
|
||||
);
|
||||
} else {
|
||||
await this.kafkaProducer.writeMarker(this.config.kafka_ohlc_topic, {
|
||||
request_id: requestId, client_id, ticker, period_seconds, start_time, end_time,
|
||||
@@ -207,6 +395,10 @@ class IngestorWorker {
|
||||
} catch (error) {
|
||||
this.logger.error({ jobId, requestId, ticker, error: error.message }, 'Historical request failed');
|
||||
|
||||
if (error instanceof ExchangeRateLimitError) {
|
||||
this.pool.reportRateLimit(exchange, 'HISTORICAL', error.retryAfterMs);
|
||||
}
|
||||
|
||||
try {
|
||||
await this.kafkaProducer.writeMarker(this.config.kafka_ohlc_topic, {
|
||||
request_id: requestId, client_id, ticker, period_seconds, start_time, end_time,
|
||||
@@ -218,11 +410,14 @@ class IngestorWorker {
|
||||
|
||||
await this.zmqClient.sendComplete(jobId, false, error.message);
|
||||
}
|
||||
|
||||
// Release slot regardless of success or failure
|
||||
this.pool.releaseSlot(jobId).catch(err =>
|
||||
this.logger.error({ jobId, error: err.message }, 'Failed to release historical slot'));
|
||||
}
|
||||
|
||||
/**
|
||||
* Start realtime tick polling for a job dispatched by Flink.
|
||||
* Ticks flow: exchange → Kafka market-tick → Flink → OHLC bars → clients.
|
||||
*/
|
||||
handleRealtimeRequest(request) {
|
||||
const { jobId, requestId, ticker } = request;
|
||||
@@ -247,6 +442,7 @@ class IngestorWorker {
|
||||
|
||||
if (this.metadataInterval) clearInterval(this.metadataInterval);
|
||||
|
||||
this.pool.shutdown();
|
||||
this.realtimePoller.shutdown();
|
||||
await this.ccxtFetcher.close();
|
||||
await this.metadataGenerator.close();
|
||||
|
||||
Reference in New Issue
Block a user