bugfixes; research subproc; higher sandbox limits
This commit is contained in:
@@ -3,6 +3,7 @@ package com.dexorder.flink.ingestor;
|
||||
import com.dexorder.flink.zmq.ZmqChannelManager;
|
||||
import com.dexorder.proto.DataRequest;
|
||||
import com.dexorder.proto.RealtimeParams;
|
||||
import com.dexorder.proto.SlotType;
|
||||
import com.dexorder.proto.SubmitHistoricalRequest;
|
||||
import com.dexorder.proto.WorkComplete;
|
||||
import com.dexorder.proto.WorkHeartbeat;
|
||||
@@ -17,27 +18,27 @@ import java.util.ArrayDeque;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Deque;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Queue;
|
||||
import java.util.Set;
|
||||
import java.util.UUID;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
import java.util.concurrent.ConcurrentLinkedQueue;
|
||||
|
||||
/**
|
||||
* LRU-style work broker for ingestors.
|
||||
* Slot-based work broker for ingestors.
|
||||
*
|
||||
* Ingestors connect via DEALER to the ROUTER socket on port 5567. They register with READY,
|
||||
* are dispatched WORK messages, and respond with COMPLETE (historical) or HEARTBEAT (realtime).
|
||||
* If a heartbeat times out the job is re-queued and dispatched to another available worker.
|
||||
* Each WorkerReady message from an ingestor represents ONE available slot for a
|
||||
* specific exchange and job type (HISTORICAL or REALTIME). Flink consumes the slot
|
||||
* by dispatching a DataRequest to it. The ingestor re-offers the slot (sends another
|
||||
* WorkerReady) once the job completes, subject to any rate-limit backoff.
|
||||
*
|
||||
* Also receives SubmitHistoricalRequest messages forwarded by the relay on the PULL socket (5566).
|
||||
* Also receives SubmitHistoricalRequest messages forwarded by the relay on the PULL
|
||||
* socket (5566), and realtime job requests from RealtimeSubscriptionManager.
|
||||
*
|
||||
* Message type IDs (ZMQ framing, not Kafka):
|
||||
* 0x10 SubmitHistoricalRequest (relay → Flink via PULL, same as client wire type)
|
||||
* 0x20 WorkerReady (ingestor → Flink)
|
||||
* Message type IDs (ZMQ framing):
|
||||
* 0x10 SubmitHistoricalRequest (relay → Flink via PULL)
|
||||
* 0x20 WorkerReady (ingestor → Flink: one slot offer)
|
||||
* 0x21 WorkComplete (ingestor → Flink)
|
||||
* 0x22 WorkHeartbeat (ingestor → Flink)
|
||||
* 0x23 WorkReject (ingestor → Flink)
|
||||
@@ -53,7 +54,7 @@ public class IngestorBroker implements AutoCloseable {
|
||||
private static final byte MSG_TYPE_WORK_COMPLETE = 0x21;
|
||||
private static final byte MSG_TYPE_WORK_HEARTBEAT = 0x22;
|
||||
private static final byte MSG_TYPE_WORK_REJECT = 0x23;
|
||||
private static final byte MSG_TYPE_WORK_ASSIGN = 0x01; // DataRequest type on wire
|
||||
private static final byte MSG_TYPE_WORK_ASSIGN = 0x01;
|
||||
private static final byte MSG_TYPE_WORK_STOP = 0x25;
|
||||
|
||||
/** Re-queue realtime job if no heartbeat received within this window (ms) */
|
||||
@@ -65,20 +66,20 @@ public class IngestorBroker implements AutoCloseable {
|
||||
private volatile boolean running;
|
||||
private Thread brokerThread;
|
||||
|
||||
// ── Worker tracking ──────────────────────────────────────────────────────
|
||||
// ── Slot tracking ─────────────────────────────────────────────────────────
|
||||
|
||||
/** Workers ready to accept a job, in LRU order (head = least recently used) */
|
||||
private final Deque<WorkerInfo> freeWorkers = new ArrayDeque<>();
|
||||
/**
|
||||
* Available slots, in LRU order (head = least recently used).
|
||||
* Each entry is one WorkerReady slot offer from an ingestor.
|
||||
*/
|
||||
private final Deque<WorkerSlot> freeSlots = new ArrayDeque<>();
|
||||
|
||||
/** Jobs waiting for a compatible free worker */
|
||||
/** Jobs waiting for a compatible free slot */
|
||||
private final Queue<DataRequest> pendingJobs = new ArrayDeque<>();
|
||||
|
||||
/** Jobs currently executing on a worker */
|
||||
/** Jobs currently executing on a slot */
|
||||
private final Map<String, ActiveJob> activeJobs = new ConcurrentHashMap<>();
|
||||
|
||||
/** Worker identity → supported exchanges (set once on READY) */
|
||||
private final Map<String, WorkerInfo> knownWorkers = new ConcurrentHashMap<>();
|
||||
|
||||
// ── Thread-safe inbound queue from RealtimeSubscriptionManager ───────────
|
||||
|
||||
private final Queue<DataRequest> externalSubmissions = new ConcurrentLinkedQueue<>();
|
||||
@@ -134,8 +135,7 @@ public class IngestorBroker implements AutoCloseable {
|
||||
|
||||
/**
|
||||
* Stop all realtime jobs for a ticker (called when last subscriber leaves).
|
||||
* Thread-safe — posts a stop marker via externalSubmissions is complex; instead we
|
||||
* directly find and stop active jobs. Protected by ConcurrentHashMap.
|
||||
* Thread-safe via ConcurrentHashMap.
|
||||
*/
|
||||
public void stopRealtimeJobsForTicker(String ticker) {
|
||||
List<String> toStop = new ArrayList<>();
|
||||
@@ -154,7 +154,7 @@ public class IngestorBroker implements AutoCloseable {
|
||||
}
|
||||
}
|
||||
|
||||
// ── Broker loop ──────────────────────────────────────────────────────────
|
||||
// ── Broker loop ───────────────────────────────────────────────────────────
|
||||
|
||||
private void brokerLoop() {
|
||||
ZMQ.Socket pullSocket = zmqManager.getSocket(ZmqChannelManager.Channel.CLIENT_REQUEST);
|
||||
@@ -174,18 +174,15 @@ public class IngestorBroker implements AutoCloseable {
|
||||
enqueueJob(ext);
|
||||
}
|
||||
|
||||
// Poll sockets (100ms timeout)
|
||||
poller.poll(100);
|
||||
|
||||
if (poller.pollin(0)) {
|
||||
handleClientRequest(pullSocket);
|
||||
}
|
||||
|
||||
if (poller.pollin(1)) {
|
||||
handleWorkerMessage(routerSocket);
|
||||
}
|
||||
|
||||
// Check for heartbeat / completion timeouts
|
||||
checkTimeouts();
|
||||
|
||||
} catch (Exception e) {
|
||||
@@ -235,7 +232,8 @@ public class IngestorBroker implements AutoCloseable {
|
||||
.setClientId(req.hasClientId() ? req.getClientId() : "")
|
||||
.build();
|
||||
enqueueJob(dataRequest);
|
||||
LOG.info("Received historical request from relay: request_id={}, ticker={}", req.getRequestId(), req.getTicker());
|
||||
LOG.info("Received historical request from relay: request_id={}, ticker={}",
|
||||
req.getRequestId(), req.getTicker());
|
||||
} catch (Exception e) {
|
||||
LOG.error("Failed to parse SubmitHistoricalRequest from relay", e);
|
||||
}
|
||||
@@ -277,23 +275,28 @@ public class IngestorBroker implements AutoCloseable {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* A WorkerReady message is a slot offer for one job type; one WorkerSlot is created
|
||||
* per listed exchange and appended to freeSlots — no deduplication (multiple slots per ingestor are expected).
|
||||
*/
|
||||
private void handleWorkerReady(byte[] identity, String identityKey, byte[] payload) throws Exception {
|
||||
WorkerReady ready = WorkerReady.parseFrom(payload);
|
||||
Set<String> exchanges = new HashSet<>(ready.getExchangesList());
|
||||
SlotType slotType = ready.getJobType();
|
||||
|
||||
WorkerInfo worker = knownWorkers.computeIfAbsent(identityKey,
|
||||
k -> new WorkerInfo(identity, identityKey, exchanges));
|
||||
worker.exchanges = exchanges; // update in case re-READY with different config
|
||||
worker.identity = identity;
|
||||
|
||||
if (!freeWorkers.contains(worker)) {
|
||||
freeWorkers.addLast(worker);
|
||||
for (String exchange : ready.getExchangesList()) {
|
||||
WorkerSlot slot = new WorkerSlot(identity, identityKey, exchange.toUpperCase(), slotType);
|
||||
freeSlots.addLast(slot);
|
||||
LOG.info("Worker slot READY: id={}, exchange={}, type={}, totalFreeSlots={}",
|
||||
identityKey, exchange, slotType, freeSlots.size());
|
||||
}
|
||||
LOG.info("Ingestor READY: id={}, exchanges={}, freeWorkers={}", identityKey, exchanges, freeWorkers.size());
|
||||
|
||||
dispatchPending();
|
||||
}
|
||||
|
||||
/**
|
||||
* Historical job completed. Remove from activeJobs.
|
||||
* The ingestor will send a new typed WorkerReady to re-offer the slot.
|
||||
*/
|
||||
private void handleWorkComplete(String identityKey, byte[] payload) throws Exception {
|
||||
WorkComplete complete = WorkComplete.parseFrom(payload);
|
||||
String jobId = complete.getJobId();
|
||||
@@ -304,13 +307,7 @@ public class IngestorBroker implements AutoCloseable {
|
||||
} else {
|
||||
LOG.info("Job COMPLETE: jobId={}, ticker={}, success={}", jobId, job.ticker, complete.getSuccess());
|
||||
}
|
||||
|
||||
// Worker is free again
|
||||
WorkerInfo worker = knownWorkers.get(identityKey);
|
||||
if (worker != null) {
|
||||
freeWorkers.addLast(worker);
|
||||
dispatchPending();
|
||||
}
|
||||
// Slot re-registration is driven by the ingestor via a new WorkerReady.
|
||||
}
|
||||
|
||||
private void handleWorkHeartbeat(String identityKey, byte[] payload) throws Exception {
|
||||
@@ -325,6 +322,10 @@ public class IngestorBroker implements AutoCloseable {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Ingestor rejected the job. Re-queue it with a new ID.
|
||||
* The ingestor will send a new typed WorkerReady when it's ready again.
|
||||
*/
|
||||
private void handleWorkReject(String identityKey, byte[] payload) throws Exception {
|
||||
WorkReject reject = WorkReject.parseFrom(payload);
|
||||
String jobId = reject.getJobId();
|
||||
@@ -332,31 +333,23 @@ public class IngestorBroker implements AutoCloseable {
|
||||
|
||||
ActiveJob job = activeJobs.remove(jobId);
|
||||
if (job != null) {
|
||||
// Re-queue with fresh job_id so a different ingestor may pick it up
|
||||
DataRequest requeued = job.request.toBuilder()
|
||||
.setJobId(UUID.randomUUID().toString())
|
||||
.build();
|
||||
pendingJobs.add(requeued);
|
||||
}
|
||||
|
||||
// Worker is still free (it rejected, not crashed)
|
||||
WorkerInfo worker = knownWorkers.get(identityKey);
|
||||
if (worker != null) {
|
||||
freeWorkers.addLast(worker);
|
||||
dispatchPending();
|
||||
}
|
||||
// Slot re-registration is driven by the ingestor via a new WorkerReady.
|
||||
}
|
||||
|
||||
// ── Dispatch ─────────────────────────────────────────────────────────────
|
||||
// ── Dispatch ──────────────────────────────────────────────────────────────
|
||||
|
||||
private void enqueueJob(DataRequest request) {
|
||||
// Check if we can immediately dispatch
|
||||
WorkerInfo worker = findFreeWorker(exchangeOf(request.getTicker()));
|
||||
if (worker != null) {
|
||||
dispatch(worker, request);
|
||||
WorkerSlot slot = findFreeSlot(exchangeOf(request.getTicker()), request.getType());
|
||||
if (slot != null) {
|
||||
dispatch(slot, request);
|
||||
} else {
|
||||
pendingJobs.add(request);
|
||||
LOG.debug("No free worker for {}, queued (pendingJobs={})", request.getTicker(), pendingJobs.size());
|
||||
LOG.debug("No free slot for {}, queued (pendingJobs={})", request.getTicker(), pendingJobs.size());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -364,9 +357,9 @@ public class IngestorBroker implements AutoCloseable {
|
||||
Queue<DataRequest> remaining = new ArrayDeque<>();
|
||||
DataRequest job;
|
||||
while ((job = pendingJobs.poll()) != null) {
|
||||
WorkerInfo worker = findFreeWorker(exchangeOf(job.getTicker()));
|
||||
if (worker != null) {
|
||||
dispatch(worker, job);
|
||||
WorkerSlot slot = findFreeSlot(exchangeOf(job.getTicker()), job.getType());
|
||||
if (slot != null) {
|
||||
dispatch(slot, job);
|
||||
} else {
|
||||
remaining.add(job);
|
||||
}
|
||||
@@ -374,28 +367,30 @@ public class IngestorBroker implements AutoCloseable {
|
||||
pendingJobs.addAll(remaining);
|
||||
}
|
||||
|
||||
private void dispatch(WorkerInfo worker, DataRequest request) {
|
||||
freeWorkers.remove(worker);
|
||||
|
||||
private void dispatch(WorkerSlot slot, DataRequest request) {
|
||||
try {
|
||||
byte[] protoBytes = request.toByteArray();
|
||||
boolean sent = zmqManager.sendToWorker(worker.identity, PROTOCOL_VERSION, MSG_TYPE_WORK_ASSIGN, protoBytes);
|
||||
boolean sent = zmqManager.sendToWorker(slot.identity, PROTOCOL_VERSION, MSG_TYPE_WORK_ASSIGN, protoBytes);
|
||||
if (!sent) {
|
||||
LOG.error("Failed to dispatch job to worker={}, re-queuing", worker.identityKey);
|
||||
freeWorkers.addLast(worker);
|
||||
// ROUTER_MANDATORY: identity is disconnected — purge all stale slots for this
|
||||
// worker and re-queue the job so dispatchPending() can try a live slot.
|
||||
int purged = purgeWorkerSlots(slot.identityKey);
|
||||
LOG.warn("Worker {} unreachable, purged {} stale free slots, re-queuing job={}",
|
||||
slot.identityKey, purged, request.getJobId());
|
||||
pendingJobs.add(request);
|
||||
return;
|
||||
}
|
||||
|
||||
ActiveJob active = new ActiveJob(worker.identity, worker.identityKey,
|
||||
ActiveJob active = new ActiveJob(slot.identity, slot.identityKey,
|
||||
request, request.getTicker(), request.getType());
|
||||
activeJobs.put(request.getJobId(), active);
|
||||
|
||||
LOG.info("Dispatched job: jobId={}, ticker={}, type={}, worker={}",
|
||||
request.getJobId(), request.getTicker(), request.getType(), worker.identityKey);
|
||||
LOG.info("Dispatched job: jobId={}, ticker={}, type={}, worker={}, slotType={}",
|
||||
request.getJobId(), request.getTicker(), request.getType(),
|
||||
slot.identityKey, slot.slotType);
|
||||
} catch (Exception e) {
|
||||
LOG.error("Error dispatching job", e);
|
||||
freeWorkers.addLast(worker);
|
||||
freeSlots.addLast(slot);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -408,7 +403,7 @@ public class IngestorBroker implements AutoCloseable {
|
||||
}
|
||||
}
|
||||
|
||||
// ── Timeout checking ─────────────────────────────────────────────────────
|
||||
// ── Timeout checking ──────────────────────────────────────────────────────
|
||||
|
||||
private void checkTimeouts() {
|
||||
long now = System.currentTimeMillis();
|
||||
@@ -426,10 +421,9 @@ public class IngestorBroker implements AutoCloseable {
|
||||
for (String jobId : timedOut) {
|
||||
ActiveJob job = activeJobs.remove(jobId);
|
||||
if (job == null) continue;
|
||||
LOG.warn("Job timed out (no heartbeat/completion): jobId={}, ticker={}, type={}, worker={}",
|
||||
LOG.warn("Job timed out: jobId={}, ticker={}, type={}, worker={}",
|
||||
jobId, job.ticker, job.type, job.workerIdentityKey);
|
||||
|
||||
// Re-queue with a new job_id
|
||||
DataRequest requeued = job.request.toBuilder()
|
||||
.setJobId(UUID.randomUUID().toString())
|
||||
.build();
|
||||
@@ -438,7 +432,7 @@ public class IngestorBroker implements AutoCloseable {
|
||||
}
|
||||
}
|
||||
|
||||
// ── Helpers ──────────────────────────────────────────────────────────────
|
||||
// ── Helpers ───────────────────────────────────────────────────────────────
|
||||
|
||||
/** Extract exchange name from ticker, e.g. "BTC/USDT.BINANCE" → "BINANCE" */
|
||||
private static String exchangeOf(String ticker) {
|
||||
@@ -446,12 +440,32 @@ public class IngestorBroker implements AutoCloseable {
|
||||
return dot >= 0 ? ticker.substring(dot + 1).toUpperCase() : "";
|
||||
}
|
||||
|
||||
/** Find and remove a free worker that supports the given exchange. */
|
||||
private WorkerInfo findFreeWorker(String exchange) {
|
||||
for (WorkerInfo w : freeWorkers) {
|
||||
if (exchange.isEmpty() || w.exchanges.contains(exchange)) {
|
||||
freeWorkers.remove(w);
|
||||
return w;
|
||||
/**
|
||||
* Remove all free slots offered by a given worker identity.
|
||||
* Called when a dispatch to that identity fails (ROUTER_MANDATORY unreachable).
|
||||
* Returns the number of slots removed.
|
||||
*/
|
||||
private int purgeWorkerSlots(String identityKey) {
|
||||
int before = freeSlots.size();
|
||||
freeSlots.removeIf(slot -> slot.identityKey.equals(identityKey));
|
||||
return before - freeSlots.size();
|
||||
}
|
||||
|
||||
/**
|
||||
* Find and remove the first free slot that matches the given exchange and request type.
|
||||
* An empty exchange matches any slot; a slot with SlotType.ANY matches any request type.
|
||||
*/
|
||||
private WorkerSlot findFreeSlot(String exchange, DataRequest.RequestType requestType) {
|
||||
for (WorkerSlot slot : freeSlots) {
|
||||
boolean exchangeMatch = exchange.isEmpty() || slot.exchange.equals(exchange);
|
||||
boolean typeMatch = slot.slotType == SlotType.ANY
|
||||
|| (slot.slotType == SlotType.HISTORICAL
|
||||
&& requestType == DataRequest.RequestType.HISTORICAL_OHLC)
|
||||
|| (slot.slotType == SlotType.REALTIME
|
||||
&& requestType == DataRequest.RequestType.REALTIME_TICKS);
|
||||
if (exchangeMatch && typeMatch) {
|
||||
freeSlots.remove(slot);
|
||||
return slot;
|
||||
}
|
||||
}
|
||||
return null;
|
||||
@@ -468,17 +482,20 @@ public class IngestorBroker implements AutoCloseable {
|
||||
stop();
|
||||
}
|
||||
|
||||
// ── Inner types ──────────────────────────────────────────────────────────
|
||||
// ── Inner types ───────────────────────────────────────────────────────────
|
||||
|
||||
private static class WorkerInfo {
|
||||
byte[] identity;
|
||||
/** One available work slot offered by an ingestor via WorkerReady. */
|
||||
private static class WorkerSlot {
|
||||
final byte[] identity;
|
||||
final String identityKey;
|
||||
Set<String> exchanges;
|
||||
final String exchange;
|
||||
final SlotType slotType;
|
||||
|
||||
WorkerInfo(byte[] identity, String identityKey, Set<String> exchanges) {
|
||||
WorkerSlot(byte[] identity, String identityKey, String exchange, SlotType slotType) {
|
||||
this.identity = identity;
|
||||
this.identityKey = identityKey;
|
||||
this.exchanges = exchanges;
|
||||
this.exchange = exchange;
|
||||
this.slotType = slotType;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -87,6 +87,11 @@ public class ZmqChannelManager implements Closeable {
|
||||
socket.setLinger(1000);
|
||||
socket.setSndHWM(10000);
|
||||
socket.setRcvHWM(10000);
|
||||
if (socketType == SocketType.ROUTER) {
|
||||
// Return false (EHOSTUNREACH) instead of silently dropping messages to
|
||||
// unknown/disconnected peer identities. Enables immediate stale-slot detection.
|
||||
socket.setRouterMandatory(true);
|
||||
}
|
||||
socket.bind(endpoint);
|
||||
sockets.put(channel.name(), socket);
|
||||
LOG.info("Bound {} to {}", description, endpoint);
|
||||
|
||||
Reference in New Issue
Block a user