Data pipeline refactor and fix

This commit is contained in:
2026-04-13 18:30:04 -04:00
parent 6418729b16
commit 326bf80846
96 changed files with 7107 additions and 1763 deletions

View File

@@ -47,24 +47,22 @@ function loadConfig() {
logger.warn({ error: error.message }, 'Could not load secrets');
}
// Merge config and secrets
return {
// Flink ZMQ endpoints
flink_hostname: config.flink_hostname || 'localhost',
ingestor_work_port: config.ingestor_work_port || 5555,
ingestor_control_port: config.ingestor_control_port || 5556,
ingestor_broker_port: config.ingestor_broker_port || 5567,
// Kafka configuration
kafka_brokers: config.kafka_brokers || ['localhost:9092'],
kafka_topic: 'market-ohlc',
kafka_ohlc_topic: config.kafka_ohlc_topic || 'market-ohlc',
kafka_tick_topic: config.kafka_tick_topic || 'market-tick',
// Worker configuration
max_concurrent: config.max_concurrent || 10,
poll_interval_ms: config.poll_interval_ms || 10000,
// Symbol metadata configuration
supported_exchanges: config.supported_exchanges || ['binance', 'coinbase', 'kraken'],
symbol_metadata_interval_ms: config.symbol_metadata_interval_ms || 6 * 60 * 60 * 1000, // 6 hours
symbol_metadata_interval_ms: config.symbol_metadata_interval_ms || 6 * 60 * 60 * 1000,
...secrets
};
@@ -76,11 +74,7 @@ class IngestorWorker {
this.logger = logger;
this.zmqClient = new ZmqClient(config, logger.child({ component: 'zmq' }));
this.kafkaProducer = new KafkaProducer(
config,
logger.child({ component: 'kafka' })
);
// Create metadata generator first so ccxtFetcher can use it
this.kafkaProducer = new KafkaProducer(config, logger.child({ component: 'kafka' }));
this.metadataGenerator = new SymbolMetadataGenerator(
config,
this.kafkaProducer,
@@ -94,33 +88,27 @@ class IngestorWorker {
this.realtimePoller = new RealtimePoller(
this.ccxtFetcher,
this.kafkaProducer,
this.zmqClient,
logger.child({ component: 'poller' })
);
// Track active requests
this.activeRequests = new Map();
this.isShutdown = false;
// jobId → active realtime subscription (for stop handling)
this.activeRealtime = new Set();
// Metadata generation interval
this.metadataIntervalMs = config.symbol_metadata_interval_ms;
this.isShutdown = false;
this.metadataInterval = null;
}
/**
* Start the ingestor worker
*/
async start() {
this.logger.info('Starting CCXT ingestor worker');
// Connect to services
await this.kafkaProducer.connect();
await this.zmqClient.connect();
// Start control message listener
this.zmqClient.startControlListener(msg => this.handleControlMessage(msg));
// Wire event callbacks before connecting so we don't miss early messages
this.zmqClient.onWorkAssign = req => this.handleWorkAssign(req);
this.zmqClient.onWorkStop = jobId => this.handleWorkStop(jobId);
// Start work loop
this.workLoop();
await this.zmqClient.connect(); // also sends WorkerReady
// Generate symbol metadata on startup
this.logger.info('Generating initial symbol metadata');
@@ -140,281 +128,126 @@ class IngestorWorker {
} catch (error) {
this.logger.error({ error: error.message }, 'Failed to generate periodic symbol metadata');
}
}, this.metadataIntervalMs);
}, this.config.symbol_metadata_interval_ms);
this.logger.info('Ingestor worker started successfully');
}
/**
* Main work loop - pull and process data requests
* Handle a WorkAssign message dispatched by Flink IngestorBroker.
* Called from the ZmqClient receive loop — do not block.
*/
async workLoop() {
while (!this.isShutdown) {
try {
// Check if we can handle more requests
if (this.activeRequests.size >= this.config.max_concurrent) {
await new Promise(resolve => setTimeout(resolve, 1000));
continue;
}
handleWorkAssign(request) {
const { jobId, requestId, type, ticker } = request;
// Pull next data request
const request = await this.zmqClient.pullDataRequest();
if (!request) {
continue;
}
this.logger.info({ jobId, requestId, type, ticker }, 'Received WorkAssign');
// Handle request asynchronously
this.handleDataRequest(request).catch(error => {
this.logger.error(
{ error: error.message, requestId: request.requestId },
'Error handling data request'
);
});
} catch (error) {
if (!this.isShutdown) {
this.logger.error({ error: error.message }, 'Error in work loop');
await new Promise(resolve => setTimeout(resolve, 1000));
}
}
// HISTORICAL_OHLC = 0 (proto3 default, may appear as undefined or 'HISTORICAL_OHLC')
const isHistorical = !type || type === 'HISTORICAL_OHLC' || type === 0;
const isRealtime = type === 'REALTIME_TICKS' || type === 1;
if (isHistorical) {
this.handleHistoricalRequest(request).catch(err => {
this.logger.error({ jobId, requestId, error: err.message }, 'Unexpected error in historical handler');
});
} else if (isRealtime) {
this.handleRealtimeRequest(request);
} else {
this.logger.warn({ jobId, type }, 'Unknown request type — rejecting');
this.zmqClient.sendReject(jobId, `Unknown request type: ${type}`).catch(() => {});
}
}
/**
* Handle a data request
* Handle WorkStop sent by Flink (e.g., all subscribers left).
*/
async handleDataRequest(request) {
const { requestId: request_id, type, ticker } = request;
this.logger.info({ request_id, type, ticker, fullRequest: request }, 'Handling data request');
this.activeRequests.set(request_id, request);
try {
// HISTORICAL_OHLC = 0 is the proto3 default and is omitted from the wire,
// so protobufjs decodes it as undefined. Treat undefined as HISTORICAL_OHLC.
const isHistorical = type === undefined || type === 'HISTORICAL_OHLC' || type === 0;
const isRealtime = type === 'REALTIME_TICKS' || type === 1;
if (isHistorical) {
await this.handleHistoricalRequest(request);
} else if (isRealtime) {
await this.handleRealtimeRequest(request);
} else {
this.logger.warn({ request_id, type, typeOf: typeof type, fullRequest: request }, 'Unknown request type');
}
} finally {
// For historical requests, remove from active requests when done
const isHistorical = type === undefined || type === 'HISTORICAL_OHLC' || type === 0;
if (isHistorical) {
this.activeRequests.delete(request_id);
}
}
handleWorkStop(jobId) {
this.logger.info({ jobId }, 'Received WorkStop — cancelling realtime subscription');
this.realtimePoller.cancelSubscription(jobId);
this.activeRealtime.delete(jobId);
// No WorkComplete needed — Flink sent the stop, it already knows
}
/**
* Handle historical OHLC request
* ASYNC ARCHITECTURE: No response sent back. Data written to Kafka only.
* Flink will process from Kafka, write to Iceberg, and publish notification.
* Fetch historical OHLC data and write to Kafka.
* Sends WorkComplete when done (success or error).
*/
async handleHistoricalRequest(request) {
const { requestId: request_id, ticker, historical, clientId: client_id } = request;
const { startTime: start_time, endTime: end_time, periodSeconds: period_seconds, limit } = historical;
const { jobId, requestId, ticker, historical, clientId: client_id } = request;
const { startTime: start_time, endTime: end_time, periodSeconds: period_seconds, limit } = historical || {};
this.logger.info(
{ request_id, ticker, period_seconds, client_id },
'Processing historical OHLC request (async mode - write to Kafka only)'
);
this.logger.info({ jobId, requestId, ticker, period_seconds }, 'Processing historical OHLC request');
try {
// Fetch historical data from exchange
const candles = await this.ccxtFetcher.fetchHistoricalOHLC(
ticker,
start_time,
end_time,
period_seconds,
limit
ticker, start_time, end_time, period_seconds, limit
);
this.logger.info(
{ request_id, ticker, count: candles.length },
'Fetched data from exchange'
);
this.logger.info({ jobId, requestId, ticker, count: candles.length }, 'Fetched from exchange');
// Write to Kafka - THIS IS THE ONLY OUTPUT
// Flink will:
// 1. Read from Kafka
// 2. Write to Iceberg
// 3. Publish HistoryReadyNotification
// 4. Client receives notification via relay pub/sub
if (candles.length > 0) {
// Add metadata to first candle for Flink tracking
const enrichedCandles = candles.map((candle, idx) => ({
...candle,
__metadata: idx === 0 ? {
request_id,
client_id,
ticker,
period_seconds,
start_time,
end_time
} : undefined
}));
await this.kafkaProducer.writeOHLCs(this.config.kafka_topic, enrichedCandles);
} else {
// Write a marker message even if no data found
// Flink will see this and publish a NOT_FOUND notification
await this.kafkaProducer.writeMarker(this.config.kafka_topic, {
request_id,
client_id,
ticker,
period_seconds,
start_time,
end_time,
status: 'NOT_FOUND',
message: 'No data available for requested period'
});
}
this.logger.info(
{ request_id, ticker, count: candles.length },
'Completed historical OHLC request - data written to Kafka'
);
// NO RESPONSE SENT - Relay is stateless, client waits for pub/sub notification
} catch (error) {
this.logger.error(
{
errorType: error.constructor?.name,
error: error.message,
errorUrl: error.url,
request_id,
ticker,
stack: error.stack
},
'Failed to process historical request'
);
// Write error marker to Kafka so Flink can notify client
try {
await this.kafkaProducer.writeMarker(this.config.kafka_topic, {
request_id,
client_id,
ticker,
period_seconds,
start_time,
end_time,
status: 'ERROR',
error_message: error.message
});
} catch (kafkaError) {
this.logger.error(
{ error: kafkaError.message, request_id },
'Failed to write error marker to Kafka'
);
}
// Do not throw - request is handled, Flink will notify client of error
}
}
/**
* Handle realtime tick subscription request
*/
async handleRealtimeRequest(request) {
const { requestId: request_id, ticker } = request;
this.logger.info(
{ request_id, ticker },
'Processing realtime subscription request'
);
try {
// Start realtime polling
this.realtimePoller.startSubscription(
request_id,
ticker,
this.config.kafka_topic
);
} catch (error) {
this.logger.error(
{ error: error.message, request_id, ticker },
'Failed to start realtime subscription'
);
this.activeRequests.delete(request_id);
throw error;
}
}
/**
* Handle control messages from Flink
*/
async handleControlMessage(message) {
const { action, requestId: request_id } = message;
this.logger.info({ action, request_id }, 'Received control message');
switch (action) {
case 'CANCEL':
if (request_id) {
// Cancel specific request
this.realtimePoller.cancelSubscription(request_id);
this.activeRequests.delete(request_id);
const metadata = { request_id: requestId, client_id, ticker, period_seconds, start_time, end_time };
const PAGE_SIZE = 1000;
for (let i = 0; i < candles.length; i += PAGE_SIZE) {
const page = candles.slice(i, i + PAGE_SIZE);
const isLastPage = (i + PAGE_SIZE) >= candles.length;
await this.kafkaProducer.writeOHLCs(this.config.kafka_ohlc_topic, page, metadata, isLastPage);
}
break;
this.logger.info({ jobId, requestId, ticker, count: candles.length, pages: Math.ceil(candles.length / PAGE_SIZE) }, 'Wrote all pages to Kafka');
} else {
await this.kafkaProducer.writeMarker(this.config.kafka_ohlc_topic, {
request_id: requestId, client_id, ticker, period_seconds, start_time, end_time,
status: 'NOT_FOUND', message: 'No data available for requested period'
});
}
case 'SHUTDOWN':
this.logger.info('Received shutdown signal');
await this.shutdown();
break;
this.logger.info({ jobId, requestId, ticker }, 'Historical request complete — sending WorkComplete');
await this.zmqClient.sendComplete(jobId, true);
case 'CONFIG_UPDATE':
// Handle config update if needed
this.logger.info('Received config update');
break;
} catch (error) {
this.logger.error({ jobId, requestId, ticker, error: error.message }, 'Historical request failed');
case 'HEARTBEAT':
// Just acknowledge heartbeat
break;
try {
await this.kafkaProducer.writeMarker(this.config.kafka_ohlc_topic, {
request_id: requestId, client_id, ticker, period_seconds, start_time, end_time,
status: 'ERROR', error_message: error.message
});
} catch (kafkaErr) {
this.logger.error({ jobId, error: kafkaErr.message }, 'Failed to write error marker to Kafka');
}
default:
this.logger.warn({ action }, 'Unknown control action');
await this.zmqClient.sendComplete(jobId, false, error.message);
}
}
/**
* Get worker status
* Start realtime tick polling for a job dispatched by Flink.
* Ticks flow: exchange → Kafka market-tick → Flink → OHLC bars → clients.
*/
handleRealtimeRequest(request) {
const { jobId, requestId, ticker } = request;
this.logger.info({ jobId, requestId, ticker }, 'Processing realtime subscription request');
this.activeRealtime.add(jobId);
this.realtimePoller.startSubscription(jobId, requestId, ticker, this.config.kafka_tick_topic);
}
/**
 * Snapshot of the worker's current state, used for periodic status logging
 * in main(). Reads counters and delegates to the poller and metadata
 * generator for their own stats.
 * @returns {object} Active-work counts plus poller and metadata-generator status.
 */
getStatus() {
return {
// NOTE(review): this reports both activeRequests/maxConcurrent (pre-refactor)
// and activeRealtime (post-refactor). If the refactor removed
// this.activeRequests from the constructor, this read will throw —
// confirm against the final source which fields should remain.
activeRequests: this.activeRequests.size,
maxConcurrent: this.config.max_concurrent,
activeRealtime: this.activeRealtime.size,
pollerStats: this.realtimePoller.getStats(),
metadataStatus: this.metadataGenerator.getStatus()
};
}
/**
* Shutdown worker gracefully
*/
async shutdown() {
if (this.isShutdown) {
return;
}
if (this.isShutdown) return;
this.isShutdown = true;
this.logger.info('Shutting down ingestor worker');
// Stop metadata generation interval
if (this.metadataInterval) {
clearInterval(this.metadataInterval);
}
if (this.metadataInterval) clearInterval(this.metadataInterval);
// Stop polling
this.realtimePoller.shutdown();
// Close connections
await this.ccxtFetcher.close();
await this.metadataGenerator.close();
await this.kafkaProducer.disconnect();
@@ -430,31 +263,23 @@ async function main() {
const config = loadConfig();
const worker = new IngestorWorker(config, logger);
// Handle shutdown signals
process.on('SIGINT', () => worker.shutdown());
process.on('SIGTERM', () => worker.shutdown());
// Handle errors
process.on('uncaughtException', error => {
logger.error({ error }, 'Uncaught exception');
worker.shutdown();
});
process.on('unhandledRejection', (reason, promise) => {
process.on('unhandledRejection', reason => {
logger.error({ reason }, 'Unhandled rejection');
});
// Start worker
await worker.start();
// Log status periodically
setInterval(() => {
const status = worker.getStatus();
logger.info({ status }, 'Worker status');
logger.info({ status: worker.getStatus() }, 'Worker status');
}, 60000);
}
// Run
main().catch(error => {
logger.error({ error }, 'Fatal error');
process.exit(1);