data pipeline refactor and fix
This commit is contained in:
@@ -47,24 +47,22 @@ function loadConfig() {
|
||||
logger.warn({ error: error.message }, 'Could not load secrets');
|
||||
}
|
||||
|
||||
// Merge config and secrets
|
||||
return {
|
||||
// Flink ZMQ endpoints
|
||||
flink_hostname: config.flink_hostname || 'localhost',
|
||||
ingestor_work_port: config.ingestor_work_port || 5555,
|
||||
ingestor_control_port: config.ingestor_control_port || 5556,
|
||||
ingestor_broker_port: config.ingestor_broker_port || 5567,
|
||||
|
||||
// Kafka configuration
|
||||
kafka_brokers: config.kafka_brokers || ['localhost:9092'],
|
||||
kafka_topic: 'market-ohlc',
|
||||
kafka_ohlc_topic: config.kafka_ohlc_topic || 'market-ohlc',
|
||||
kafka_tick_topic: config.kafka_tick_topic || 'market-tick',
|
||||
|
||||
// Worker configuration
|
||||
max_concurrent: config.max_concurrent || 10,
|
||||
poll_interval_ms: config.poll_interval_ms || 10000,
|
||||
|
||||
// Symbol metadata configuration
|
||||
supported_exchanges: config.supported_exchanges || ['binance', 'coinbase', 'kraken'],
|
||||
symbol_metadata_interval_ms: config.symbol_metadata_interval_ms || 6 * 60 * 60 * 1000, // 6 hours
|
||||
symbol_metadata_interval_ms: config.symbol_metadata_interval_ms || 6 * 60 * 60 * 1000,
|
||||
|
||||
...secrets
|
||||
};
|
||||
@@ -76,11 +74,7 @@ class IngestorWorker {
|
||||
this.logger = logger;
|
||||
|
||||
this.zmqClient = new ZmqClient(config, logger.child({ component: 'zmq' }));
|
||||
this.kafkaProducer = new KafkaProducer(
|
||||
config,
|
||||
logger.child({ component: 'kafka' })
|
||||
);
|
||||
// Create metadata generator first so ccxtFetcher can use it
|
||||
this.kafkaProducer = new KafkaProducer(config, logger.child({ component: 'kafka' }));
|
||||
this.metadataGenerator = new SymbolMetadataGenerator(
|
||||
config,
|
||||
this.kafkaProducer,
|
||||
@@ -94,33 +88,27 @@ class IngestorWorker {
|
||||
this.realtimePoller = new RealtimePoller(
|
||||
this.ccxtFetcher,
|
||||
this.kafkaProducer,
|
||||
this.zmqClient,
|
||||
logger.child({ component: 'poller' })
|
||||
);
|
||||
|
||||
// Track active requests
|
||||
this.activeRequests = new Map();
|
||||
this.isShutdown = false;
|
||||
// jobId → active realtime subscription (for stop handling)
|
||||
this.activeRealtime = new Set();
|
||||
|
||||
// Metadata generation interval
|
||||
this.metadataIntervalMs = config.symbol_metadata_interval_ms;
|
||||
this.isShutdown = false;
|
||||
this.metadataInterval = null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Start the ingestor worker
|
||||
*/
|
||||
async start() {
|
||||
this.logger.info('Starting CCXT ingestor worker');
|
||||
|
||||
// Connect to services
|
||||
await this.kafkaProducer.connect();
|
||||
await this.zmqClient.connect();
|
||||
|
||||
// Start control message listener
|
||||
this.zmqClient.startControlListener(msg => this.handleControlMessage(msg));
|
||||
// Wire event callbacks before connecting so we don't miss early messages
|
||||
this.zmqClient.onWorkAssign = req => this.handleWorkAssign(req);
|
||||
this.zmqClient.onWorkStop = jobId => this.handleWorkStop(jobId);
|
||||
|
||||
// Start work loop
|
||||
this.workLoop();
|
||||
await this.zmqClient.connect(); // also sends WorkerReady
|
||||
|
||||
// Generate symbol metadata on startup
|
||||
this.logger.info('Generating initial symbol metadata');
|
||||
@@ -140,281 +128,126 @@ class IngestorWorker {
|
||||
} catch (error) {
|
||||
this.logger.error({ error: error.message }, 'Failed to generate periodic symbol metadata');
|
||||
}
|
||||
}, this.metadataIntervalMs);
|
||||
}, this.config.symbol_metadata_interval_ms);
|
||||
|
||||
this.logger.info('Ingestor worker started successfully');
|
||||
}
|
||||
|
||||
/**
|
||||
* Main work loop - pull and process data requests
|
||||
* Handle a WorkAssign message dispatched by Flink IngestorBroker.
|
||||
* Called from the ZmqClient receive loop — do not block.
|
||||
*/
|
||||
async workLoop() {
|
||||
while (!this.isShutdown) {
|
||||
try {
|
||||
// Check if we can handle more requests
|
||||
if (this.activeRequests.size >= this.config.max_concurrent) {
|
||||
await new Promise(resolve => setTimeout(resolve, 1000));
|
||||
continue;
|
||||
}
|
||||
handleWorkAssign(request) {
|
||||
const { jobId, requestId, type, ticker } = request;
|
||||
|
||||
// Pull next data request
|
||||
const request = await this.zmqClient.pullDataRequest();
|
||||
if (!request) {
|
||||
continue;
|
||||
}
|
||||
this.logger.info({ jobId, requestId, type, ticker }, 'Received WorkAssign');
|
||||
|
||||
// Handle request asynchronously
|
||||
this.handleDataRequest(request).catch(error => {
|
||||
this.logger.error(
|
||||
{ error: error.message, requestId: request.requestId },
|
||||
'Error handling data request'
|
||||
);
|
||||
});
|
||||
} catch (error) {
|
||||
if (!this.isShutdown) {
|
||||
this.logger.error({ error: error.message }, 'Error in work loop');
|
||||
await new Promise(resolve => setTimeout(resolve, 1000));
|
||||
}
|
||||
}
|
||||
// HISTORICAL_OHLC = 0 (proto3 default, may appear as undefined or 'HISTORICAL_OHLC')
|
||||
const isHistorical = !type || type === 'HISTORICAL_OHLC' || type === 0;
|
||||
const isRealtime = type === 'REALTIME_TICKS' || type === 1;
|
||||
|
||||
if (isHistorical) {
|
||||
this.handleHistoricalRequest(request).catch(err => {
|
||||
this.logger.error({ jobId, requestId, error: err.message }, 'Unexpected error in historical handler');
|
||||
});
|
||||
} else if (isRealtime) {
|
||||
this.handleRealtimeRequest(request);
|
||||
} else {
|
||||
this.logger.warn({ jobId, type }, 'Unknown request type — rejecting');
|
||||
this.zmqClient.sendReject(jobId, `Unknown request type: ${type}`).catch(() => {});
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Handle a data request
|
||||
* Handle WorkStop sent by Flink (e.g., all subscribers left).
|
||||
*/
|
||||
async handleDataRequest(request) {
|
||||
const { requestId: request_id, type, ticker } = request;
|
||||
|
||||
this.logger.info({ request_id, type, ticker, fullRequest: request }, 'Handling data request');
|
||||
|
||||
this.activeRequests.set(request_id, request);
|
||||
|
||||
try {
|
||||
// HISTORICAL_OHLC = 0 is the proto3 default and is omitted from the wire,
|
||||
// so protobufjs decodes it as undefined. Treat undefined as HISTORICAL_OHLC.
|
||||
const isHistorical = type === undefined || type === 'HISTORICAL_OHLC' || type === 0;
|
||||
const isRealtime = type === 'REALTIME_TICKS' || type === 1;
|
||||
|
||||
if (isHistorical) {
|
||||
await this.handleHistoricalRequest(request);
|
||||
} else if (isRealtime) {
|
||||
await this.handleRealtimeRequest(request);
|
||||
} else {
|
||||
this.logger.warn({ request_id, type, typeOf: typeof type, fullRequest: request }, 'Unknown request type');
|
||||
}
|
||||
} finally {
|
||||
// For historical requests, remove from active requests when done
|
||||
const isHistorical = type === undefined || type === 'HISTORICAL_OHLC' || type === 0;
|
||||
if (isHistorical) {
|
||||
this.activeRequests.delete(request_id);
|
||||
}
|
||||
}
|
||||
handleWorkStop(jobId) {
|
||||
this.logger.info({ jobId }, 'Received WorkStop — cancelling realtime subscription');
|
||||
this.realtimePoller.cancelSubscription(jobId);
|
||||
this.activeRealtime.delete(jobId);
|
||||
// No WorkComplete needed — Flink sent the stop, it already knows
|
||||
}
|
||||
|
||||
/**
|
||||
* Handle historical OHLC request
|
||||
* ASYNC ARCHITECTURE: No response sent back. Data written to Kafka only.
|
||||
* Flink will process from Kafka, write to Iceberg, and publish notification.
|
||||
* Fetch historical OHLC data and write to Kafka.
|
||||
* Sends WorkComplete when done (success or error).
|
||||
*/
|
||||
async handleHistoricalRequest(request) {
|
||||
const { requestId: request_id, ticker, historical, clientId: client_id } = request;
|
||||
const { startTime: start_time, endTime: end_time, periodSeconds: period_seconds, limit } = historical;
|
||||
const { jobId, requestId, ticker, historical, clientId: client_id } = request;
|
||||
const { startTime: start_time, endTime: end_time, periodSeconds: period_seconds, limit } = historical || {};
|
||||
|
||||
this.logger.info(
|
||||
{ request_id, ticker, period_seconds, client_id },
|
||||
'Processing historical OHLC request (async mode - write to Kafka only)'
|
||||
);
|
||||
this.logger.info({ jobId, requestId, ticker, period_seconds }, 'Processing historical OHLC request');
|
||||
|
||||
try {
|
||||
// Fetch historical data from exchange
|
||||
const candles = await this.ccxtFetcher.fetchHistoricalOHLC(
|
||||
ticker,
|
||||
start_time,
|
||||
end_time,
|
||||
period_seconds,
|
||||
limit
|
||||
ticker, start_time, end_time, period_seconds, limit
|
||||
);
|
||||
|
||||
this.logger.info(
|
||||
{ request_id, ticker, count: candles.length },
|
||||
'Fetched data from exchange'
|
||||
);
|
||||
this.logger.info({ jobId, requestId, ticker, count: candles.length }, 'Fetched from exchange');
|
||||
|
||||
// Write to Kafka - THIS IS THE ONLY OUTPUT
|
||||
// Flink will:
|
||||
// 1. Read from Kafka
|
||||
// 2. Write to Iceberg
|
||||
// 3. Publish HistoryReadyNotification
|
||||
// 4. Client receives notification via relay pub/sub
|
||||
if (candles.length > 0) {
|
||||
// Add metadata to first candle for Flink tracking
|
||||
const enrichedCandles = candles.map((candle, idx) => ({
|
||||
...candle,
|
||||
__metadata: idx === 0 ? {
|
||||
request_id,
|
||||
client_id,
|
||||
ticker,
|
||||
period_seconds,
|
||||
start_time,
|
||||
end_time
|
||||
} : undefined
|
||||
}));
|
||||
|
||||
await this.kafkaProducer.writeOHLCs(this.config.kafka_topic, enrichedCandles);
|
||||
} else {
|
||||
// Write a marker message even if no data found
|
||||
// Flink will see this and publish a NOT_FOUND notification
|
||||
await this.kafkaProducer.writeMarker(this.config.kafka_topic, {
|
||||
request_id,
|
||||
client_id,
|
||||
ticker,
|
||||
period_seconds,
|
||||
start_time,
|
||||
end_time,
|
||||
status: 'NOT_FOUND',
|
||||
message: 'No data available for requested period'
|
||||
});
|
||||
}
|
||||
|
||||
this.logger.info(
|
||||
{ request_id, ticker, count: candles.length },
|
||||
'Completed historical OHLC request - data written to Kafka'
|
||||
);
|
||||
|
||||
// NO RESPONSE SENT - Relay is stateless, client waits for pub/sub notification
|
||||
|
||||
} catch (error) {
|
||||
this.logger.error(
|
||||
{
|
||||
errorType: error.constructor?.name,
|
||||
error: error.message,
|
||||
errorUrl: error.url,
|
||||
request_id,
|
||||
ticker,
|
||||
stack: error.stack
|
||||
},
|
||||
'Failed to process historical request'
|
||||
);
|
||||
|
||||
// Write error marker to Kafka so Flink can notify client
|
||||
try {
|
||||
await this.kafkaProducer.writeMarker(this.config.kafka_topic, {
|
||||
request_id,
|
||||
client_id,
|
||||
ticker,
|
||||
period_seconds,
|
||||
start_time,
|
||||
end_time,
|
||||
status: 'ERROR',
|
||||
error_message: error.message
|
||||
});
|
||||
} catch (kafkaError) {
|
||||
this.logger.error(
|
||||
{ error: kafkaError.message, request_id },
|
||||
'Failed to write error marker to Kafka'
|
||||
);
|
||||
}
|
||||
|
||||
// Do not throw - request is handled, Flink will notify client of error
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Handle realtime tick subscription request
|
||||
*/
|
||||
async handleRealtimeRequest(request) {
|
||||
const { requestId: request_id, ticker } = request;
|
||||
|
||||
this.logger.info(
|
||||
{ request_id, ticker },
|
||||
'Processing realtime subscription request'
|
||||
);
|
||||
|
||||
try {
|
||||
// Start realtime polling
|
||||
this.realtimePoller.startSubscription(
|
||||
request_id,
|
||||
ticker,
|
||||
this.config.kafka_topic
|
||||
);
|
||||
} catch (error) {
|
||||
this.logger.error(
|
||||
{ error: error.message, request_id, ticker },
|
||||
'Failed to start realtime subscription'
|
||||
);
|
||||
this.activeRequests.delete(request_id);
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Handle control messages from Flink
|
||||
*/
|
||||
async handleControlMessage(message) {
|
||||
const { action, requestId: request_id } = message;
|
||||
|
||||
this.logger.info({ action, request_id }, 'Received control message');
|
||||
|
||||
switch (action) {
|
||||
case 'CANCEL':
|
||||
if (request_id) {
|
||||
// Cancel specific request
|
||||
this.realtimePoller.cancelSubscription(request_id);
|
||||
this.activeRequests.delete(request_id);
|
||||
const metadata = { request_id: requestId, client_id, ticker, period_seconds, start_time, end_time };
|
||||
const PAGE_SIZE = 1000;
|
||||
for (let i = 0; i < candles.length; i += PAGE_SIZE) {
|
||||
const page = candles.slice(i, i + PAGE_SIZE);
|
||||
const isLastPage = (i + PAGE_SIZE) >= candles.length;
|
||||
await this.kafkaProducer.writeOHLCs(this.config.kafka_ohlc_topic, page, metadata, isLastPage);
|
||||
}
|
||||
break;
|
||||
this.logger.info({ jobId, requestId, ticker, count: candles.length, pages: Math.ceil(candles.length / PAGE_SIZE) }, 'Wrote all pages to Kafka');
|
||||
} else {
|
||||
await this.kafkaProducer.writeMarker(this.config.kafka_ohlc_topic, {
|
||||
request_id: requestId, client_id, ticker, period_seconds, start_time, end_time,
|
||||
status: 'NOT_FOUND', message: 'No data available for requested period'
|
||||
});
|
||||
}
|
||||
|
||||
case 'SHUTDOWN':
|
||||
this.logger.info('Received shutdown signal');
|
||||
await this.shutdown();
|
||||
break;
|
||||
this.logger.info({ jobId, requestId, ticker }, 'Historical request complete — sending WorkComplete');
|
||||
await this.zmqClient.sendComplete(jobId, true);
|
||||
|
||||
case 'CONFIG_UPDATE':
|
||||
// Handle config update if needed
|
||||
this.logger.info('Received config update');
|
||||
break;
|
||||
} catch (error) {
|
||||
this.logger.error({ jobId, requestId, ticker, error: error.message }, 'Historical request failed');
|
||||
|
||||
case 'HEARTBEAT':
|
||||
// Just acknowledge heartbeat
|
||||
break;
|
||||
try {
|
||||
await this.kafkaProducer.writeMarker(this.config.kafka_ohlc_topic, {
|
||||
request_id: requestId, client_id, ticker, period_seconds, start_time, end_time,
|
||||
status: 'ERROR', error_message: error.message
|
||||
});
|
||||
} catch (kafkaErr) {
|
||||
this.logger.error({ jobId, error: kafkaErr.message }, 'Failed to write error marker to Kafka');
|
||||
}
|
||||
|
||||
default:
|
||||
this.logger.warn({ action }, 'Unknown control action');
|
||||
await this.zmqClient.sendComplete(jobId, false, error.message);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Get worker status
|
||||
* Start realtime tick polling for a job dispatched by Flink.
|
||||
* Ticks flow: exchange → Kafka market-tick → Flink → OHLC bars → clients.
|
||||
*/
|
||||
handleRealtimeRequest(request) {
|
||||
const { jobId, requestId, ticker } = request;
|
||||
this.logger.info({ jobId, requestId, ticker }, 'Processing realtime subscription request');
|
||||
|
||||
this.activeRealtime.add(jobId);
|
||||
this.realtimePoller.startSubscription(jobId, requestId, ticker, this.config.kafka_tick_topic);
|
||||
}
|
||||
|
||||
getStatus() {
|
||||
return {
|
||||
activeRequests: this.activeRequests.size,
|
||||
maxConcurrent: this.config.max_concurrent,
|
||||
activeRealtime: this.activeRealtime.size,
|
||||
pollerStats: this.realtimePoller.getStats(),
|
||||
metadataStatus: this.metadataGenerator.getStatus()
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Shutdown worker gracefully
|
||||
*/
|
||||
async shutdown() {
|
||||
if (this.isShutdown) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (this.isShutdown) return;
|
||||
this.isShutdown = true;
|
||||
this.logger.info('Shutting down ingestor worker');
|
||||
|
||||
// Stop metadata generation interval
|
||||
if (this.metadataInterval) {
|
||||
clearInterval(this.metadataInterval);
|
||||
}
|
||||
if (this.metadataInterval) clearInterval(this.metadataInterval);
|
||||
|
||||
// Stop polling
|
||||
this.realtimePoller.shutdown();
|
||||
|
||||
// Close connections
|
||||
await this.ccxtFetcher.close();
|
||||
await this.metadataGenerator.close();
|
||||
await this.kafkaProducer.disconnect();
|
||||
@@ -430,31 +263,23 @@ async function main() {
|
||||
const config = loadConfig();
|
||||
const worker = new IngestorWorker(config, logger);
|
||||
|
||||
// Handle shutdown signals
|
||||
process.on('SIGINT', () => worker.shutdown());
|
||||
process.on('SIGTERM', () => worker.shutdown());
|
||||
|
||||
// Handle errors
|
||||
process.on('uncaughtException', error => {
|
||||
logger.error({ error }, 'Uncaught exception');
|
||||
worker.shutdown();
|
||||
});
|
||||
|
||||
process.on('unhandledRejection', (reason, promise) => {
|
||||
process.on('unhandledRejection', reason => {
|
||||
logger.error({ reason }, 'Unhandled rejection');
|
||||
});
|
||||
|
||||
// Start worker
|
||||
await worker.start();
|
||||
|
||||
// Log status periodically
|
||||
setInterval(() => {
|
||||
const status = worker.getStatus();
|
||||
logger.info({ status }, 'Worker status');
|
||||
logger.info({ status: worker.getStatus() }, 'Worker status');
|
||||
}, 60000);
|
||||
}
|
||||
|
||||
// Run
|
||||
main().catch(error => {
|
||||
logger.error({ error }, 'Fatal error');
|
||||
process.exit(1);
|
||||
|
||||
Reference in New Issue
Block a user