bugfixes; research subproc; higher sandbox limits

This commit is contained in:
2026-04-16 18:11:26 -04:00
parent f80c943dc3
commit 3153e89d4f
54 changed files with 1947 additions and 498 deletions

View File

@@ -3,6 +3,7 @@ package com.dexorder.flink.ingestor;
import com.dexorder.flink.zmq.ZmqChannelManager;
import com.dexorder.proto.DataRequest;
import com.dexorder.proto.RealtimeParams;
import com.dexorder.proto.SlotType;
import com.dexorder.proto.SubmitHistoricalRequest;
import com.dexorder.proto.WorkComplete;
import com.dexorder.proto.WorkHeartbeat;
@@ -17,27 +18,27 @@ import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Deque;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Queue;
import java.util.Set;
import java.util.UUID;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentLinkedQueue;
/**
* LRU-style work broker for ingestors.
* Slot-based work broker for ingestors.
*
* Ingestors connect via DEALER to the ROUTER socket on port 5567. They register with READY,
* are dispatched WORK messages, and respond with COMPLETE (historical) or HEARTBEAT (realtime).
* If a heartbeat times out the job is re-queued and dispatched to another available worker.
* Each WorkerReady message from an ingestor represents ONE available slot for a
* specific exchange and job type (HISTORICAL or REALTIME). Flink consumes the slot
* by dispatching a DataRequest to it. The ingestor re-offers the slot (sends another
* WorkerReady) once the job completes, subject to any rate-limit backoff.
*
* Also receives SubmitHistoricalRequest messages forwarded by the relay on the PULL socket (5566).
* Also receives SubmitHistoricalRequest messages forwarded by the relay on the PULL
* socket (5566), and realtime job requests from RealtimeSubscriptionManager.
*
* Message type IDs (ZMQ framing, not Kafka):
* 0x10 SubmitHistoricalRequest (relay → Flink via PULL, same as client wire type)
* 0x20 WorkerReady (ingestor → Flink)
* Message type IDs (ZMQ framing):
* 0x10 SubmitHistoricalRequest (relay → Flink via PULL)
* 0x20 WorkerReady (ingestor → Flink: one slot offer)
* 0x21 WorkComplete (ingestor → Flink)
* 0x22 WorkHeartbeat (ingestor → Flink)
* 0x23 WorkReject (ingestor → Flink)
@@ -53,7 +54,7 @@ public class IngestorBroker implements AutoCloseable {
private static final byte MSG_TYPE_WORK_COMPLETE = 0x21;
private static final byte MSG_TYPE_WORK_HEARTBEAT = 0x22;
private static final byte MSG_TYPE_WORK_REJECT = 0x23;
private static final byte MSG_TYPE_WORK_ASSIGN = 0x01; // DataRequest type on wire
private static final byte MSG_TYPE_WORK_ASSIGN = 0x01;
private static final byte MSG_TYPE_WORK_STOP = 0x25;
/** Re-queue realtime job if no heartbeat received within this window (ms) */
@@ -65,20 +66,20 @@ public class IngestorBroker implements AutoCloseable {
private volatile boolean running;
private Thread brokerThread;
// ── Worker tracking ──────────────────────────────────────────────────────
// ── Slot tracking ─────────────────────────────────────────────────────────
/** Workers ready to accept a job, in LRU order (head = least recently used) */
private final Deque<WorkerInfo> freeWorkers = new ArrayDeque<>();
/**
* Available slots, in LRU order (head = least recently used).
* Each entry is one WorkerReady slot offer from an ingestor.
*/
private final Deque<WorkerSlot> freeSlots = new ArrayDeque<>();
/** Jobs waiting for a compatible free worker */
/** Jobs waiting for a compatible free slot */
private final Queue<DataRequest> pendingJobs = new ArrayDeque<>();
/** Jobs currently executing on a worker */
/** Jobs currently executing on a slot */
private final Map<String, ActiveJob> activeJobs = new ConcurrentHashMap<>();
/** Worker identity → supported exchanges (set once on READY) */
private final Map<String, WorkerInfo> knownWorkers = new ConcurrentHashMap<>();
// ── Thread-safe inbound queue from RealtimeSubscriptionManager ───────────
private final Queue<DataRequest> externalSubmissions = new ConcurrentLinkedQueue<>();
@@ -134,8 +135,7 @@ public class IngestorBroker implements AutoCloseable {
/**
* Stop all realtime jobs for a ticker (called when last subscriber leaves).
* Thread-safe — posts a stop marker via externalSubmissions is complex; instead we
* directly find and stop active jobs. Protected by ConcurrentHashMap.
* Thread-safe via ConcurrentHashMap.
*/
public void stopRealtimeJobsForTicker(String ticker) {
List<String> toStop = new ArrayList<>();
@@ -154,7 +154,7 @@ public class IngestorBroker implements AutoCloseable {
}
}
// ── Broker loop ──────────────────────────────────────────────────────────
// ── Broker loop ──────────────────────────────────────────────────────────
private void brokerLoop() {
ZMQ.Socket pullSocket = zmqManager.getSocket(ZmqChannelManager.Channel.CLIENT_REQUEST);
@@ -174,18 +174,15 @@ public class IngestorBroker implements AutoCloseable {
enqueueJob(ext);
}
// Poll sockets (100ms timeout)
poller.poll(100);
if (poller.pollin(0)) {
handleClientRequest(pullSocket);
}
if (poller.pollin(1)) {
handleWorkerMessage(routerSocket);
}
// Check for heartbeat / completion timeouts
checkTimeouts();
} catch (Exception e) {
@@ -235,7 +232,8 @@ public class IngestorBroker implements AutoCloseable {
.setClientId(req.hasClientId() ? req.getClientId() : "")
.build();
enqueueJob(dataRequest);
LOG.info("Received historical request from relay: request_id={}, ticker={}", req.getRequestId(), req.getTicker());
LOG.info("Received historical request from relay: request_id={}, ticker={}",
req.getRequestId(), req.getTicker());
} catch (Exception e) {
LOG.error("Failed to parse SubmitHistoricalRequest from relay", e);
}
@@ -277,23 +275,28 @@ public class IngestorBroker implements AutoCloseable {
}
}
/**
* A WorkerReady message represents ONE slot offer for one exchange and job type.
* Add it directly to freeSlots — no deduplication (multiple slots per ingestor are expected).
*/
private void handleWorkerReady(byte[] identity, String identityKey, byte[] payload) throws Exception {
WorkerReady ready = WorkerReady.parseFrom(payload);
Set<String> exchanges = new HashSet<>(ready.getExchangesList());
SlotType slotType = ready.getJobType();
WorkerInfo worker = knownWorkers.computeIfAbsent(identityKey,
k -> new WorkerInfo(identity, identityKey, exchanges));
worker.exchanges = exchanges; // update in case re-READY with different config
worker.identity = identity;
if (!freeWorkers.contains(worker)) {
freeWorkers.addLast(worker);
for (String exchange : ready.getExchangesList()) {
WorkerSlot slot = new WorkerSlot(identity, identityKey, exchange.toUpperCase(), slotType);
freeSlots.addLast(slot);
LOG.info("Worker slot READY: id={}, exchange={}, type={}, totalFreeSlots={}",
identityKey, exchange, slotType, freeSlots.size());
}
LOG.info("Ingestor READY: id={}, exchanges={}, freeWorkers={}", identityKey, exchanges, freeWorkers.size());
dispatchPending();
}
/**
* Historical job completed. Remove from activeJobs.
* The ingestor will send a new typed WorkerReady to re-offer the slot.
*/
private void handleWorkComplete(String identityKey, byte[] payload) throws Exception {
WorkComplete complete = WorkComplete.parseFrom(payload);
String jobId = complete.getJobId();
@@ -304,13 +307,7 @@ public class IngestorBroker implements AutoCloseable {
} else {
LOG.info("Job COMPLETE: jobId={}, ticker={}, success={}", jobId, job.ticker, complete.getSuccess());
}
// Worker is free again
WorkerInfo worker = knownWorkers.get(identityKey);
if (worker != null) {
freeWorkers.addLast(worker);
dispatchPending();
}
// Slot re-registration is driven by the ingestor via a new WorkerReady.
}
private void handleWorkHeartbeat(String identityKey, byte[] payload) throws Exception {
@@ -325,6 +322,10 @@ public class IngestorBroker implements AutoCloseable {
}
}
/**
* Ingestor rejected the job. Re-queue it with a new ID.
* The ingestor will send a new typed WorkerReady when it's ready again.
*/
private void handleWorkReject(String identityKey, byte[] payload) throws Exception {
WorkReject reject = WorkReject.parseFrom(payload);
String jobId = reject.getJobId();
@@ -332,31 +333,23 @@ public class IngestorBroker implements AutoCloseable {
ActiveJob job = activeJobs.remove(jobId);
if (job != null) {
// Re-queue with fresh job_id so a different ingestor may pick it up
DataRequest requeued = job.request.toBuilder()
.setJobId(UUID.randomUUID().toString())
.build();
pendingJobs.add(requeued);
}
// Worker is still free (it rejected, not crashed)
WorkerInfo worker = knownWorkers.get(identityKey);
if (worker != null) {
freeWorkers.addLast(worker);
dispatchPending();
}
// Slot re-registration is driven by the ingestor via a new WorkerReady.
}
// ── Dispatch ─────────────────────────────────────────────────────────────
// ── Dispatch ─────────────────────────────────────────────────────────────
private void enqueueJob(DataRequest request) {
// Check if we can immediately dispatch
WorkerInfo worker = findFreeWorker(exchangeOf(request.getTicker()));
if (worker != null) {
dispatch(worker, request);
WorkerSlot slot = findFreeSlot(exchangeOf(request.getTicker()), request.getType());
if (slot != null) {
dispatch(slot, request);
} else {
pendingJobs.add(request);
LOG.debug("No free worker for {}, queued (pendingJobs={})", request.getTicker(), pendingJobs.size());
LOG.debug("No free slot for {}, queued (pendingJobs={})", request.getTicker(), pendingJobs.size());
}
}
@@ -364,9 +357,9 @@ public class IngestorBroker implements AutoCloseable {
Queue<DataRequest> remaining = new ArrayDeque<>();
DataRequest job;
while ((job = pendingJobs.poll()) != null) {
WorkerInfo worker = findFreeWorker(exchangeOf(job.getTicker()));
if (worker != null) {
dispatch(worker, job);
WorkerSlot slot = findFreeSlot(exchangeOf(job.getTicker()), job.getType());
if (slot != null) {
dispatch(slot, job);
} else {
remaining.add(job);
}
@@ -374,28 +367,30 @@ public class IngestorBroker implements AutoCloseable {
pendingJobs.addAll(remaining);
}
private void dispatch(WorkerInfo worker, DataRequest request) {
freeWorkers.remove(worker);
private void dispatch(WorkerSlot slot, DataRequest request) {
try {
byte[] protoBytes = request.toByteArray();
boolean sent = zmqManager.sendToWorker(worker.identity, PROTOCOL_VERSION, MSG_TYPE_WORK_ASSIGN, protoBytes);
boolean sent = zmqManager.sendToWorker(slot.identity, PROTOCOL_VERSION, MSG_TYPE_WORK_ASSIGN, protoBytes);
if (!sent) {
LOG.error("Failed to dispatch job to worker={}, re-queuing", worker.identityKey);
freeWorkers.addLast(worker);
// ROUTER_MANDATORY: identity is disconnected — purge all stale slots for this
// worker and re-queue the job so dispatchPending() can try a live slot.
int purged = purgeWorkerSlots(slot.identityKey);
LOG.warn("Worker {} unreachable, purged {} stale free slots, re-queuing job={}",
slot.identityKey, purged, request.getJobId());
pendingJobs.add(request);
return;
}
ActiveJob active = new ActiveJob(worker.identity, worker.identityKey,
ActiveJob active = new ActiveJob(slot.identity, slot.identityKey,
request, request.getTicker(), request.getType());
activeJobs.put(request.getJobId(), active);
LOG.info("Dispatched job: jobId={}, ticker={}, type={}, worker={}",
request.getJobId(), request.getTicker(), request.getType(), worker.identityKey);
LOG.info("Dispatched job: jobId={}, ticker={}, type={}, worker={}, slotType={}",
request.getJobId(), request.getTicker(), request.getType(),
slot.identityKey, slot.slotType);
} catch (Exception e) {
LOG.error("Error dispatching job", e);
freeWorkers.addLast(worker);
freeSlots.addLast(slot);
}
}
@@ -408,7 +403,7 @@ public class IngestorBroker implements AutoCloseable {
}
}
// ── Timeout checking ─────────────────────────────────────────────────────
// ── Timeout checking ─────────────────────────────────────────────────────
private void checkTimeouts() {
long now = System.currentTimeMillis();
@@ -426,10 +421,9 @@ public class IngestorBroker implements AutoCloseable {
for (String jobId : timedOut) {
ActiveJob job = activeJobs.remove(jobId);
if (job == null) continue;
LOG.warn("Job timed out (no heartbeat/completion): jobId={}, ticker={}, type={}, worker={}",
LOG.warn("Job timed out: jobId={}, ticker={}, type={}, worker={}",
jobId, job.ticker, job.type, job.workerIdentityKey);
// Re-queue with a new job_id
DataRequest requeued = job.request.toBuilder()
.setJobId(UUID.randomUUID().toString())
.build();
@@ -438,7 +432,7 @@ public class IngestorBroker implements AutoCloseable {
}
}
// ── Helpers ──────────────────────────────────────────────────────────────
// ── Helpers ──────────────────────────────────────────────────────────────
/** Extract exchange name from ticker, e.g. "BTC/USDT.BINANCE" → "BINANCE" */
private static String exchangeOf(String ticker) {
@@ -446,12 +440,32 @@ public class IngestorBroker implements AutoCloseable {
return dot >= 0 ? ticker.substring(dot + 1).toUpperCase() : "";
}
/** Find and remove a free worker that supports the given exchange. */
private WorkerInfo findFreeWorker(String exchange) {
for (WorkerInfo w : freeWorkers) {
if (exchange.isEmpty() || w.exchanges.contains(exchange)) {
freeWorkers.remove(w);
return w;
/**
 * Drops every free slot that was offered by the given worker identity.
 * Used after a dispatch to that identity failed (ROUTER_MANDATORY reported it unreachable),
 * so no further jobs are routed to its stale offers.
 *
 * @param identityKey string identity of the worker whose free slots should be discarded
 * @return how many slots were removed from {@code freeSlots}
 */
private int purgeWorkerSlots(String identityKey) {
    final int sizeBeforePurge = freeSlots.size();
    freeSlots.removeIf(offered -> offered.identityKey.equals(identityKey));
    final int sizeAfterPurge = freeSlots.size();
    return sizeBeforePurge - sizeAfterPurge;
}
/**
* Find and remove a free slot that supports the given exchange and request type.
* A slot with SlotType.ANY matches any request type.
*/
private WorkerSlot findFreeSlot(String exchange, DataRequest.RequestType requestType) {
for (WorkerSlot slot : freeSlots) {
boolean exchangeMatch = exchange.isEmpty() || slot.exchange.equals(exchange);
boolean typeMatch = slot.slotType == SlotType.ANY
|| (slot.slotType == SlotType.HISTORICAL
&& requestType == DataRequest.RequestType.HISTORICAL_OHLC)
|| (slot.slotType == SlotType.REALTIME
&& requestType == DataRequest.RequestType.REALTIME_TICKS);
if (exchangeMatch && typeMatch) {
freeSlots.remove(slot);
return slot;
}
}
return null;
@@ -468,17 +482,20 @@ public class IngestorBroker implements AutoCloseable {
stop();
}
// ── Inner types ──────────────────────────────────────────────────────────
// ── Inner types ──────────────────────────────────────────────────────────
private static class WorkerInfo {
byte[] identity;
/** One available work slot offered by an ingestor via WorkerReady. */
private static class WorkerSlot {
final byte[] identity;
final String identityKey;
Set<String> exchanges;
final String exchange;
final SlotType slotType;
WorkerInfo(byte[] identity, String identityKey, Set<String> exchanges) {
WorkerSlot(byte[] identity, String identityKey, String exchange, SlotType slotType) {
this.identity = identity;
this.identityKey = identityKey;
this.exchanges = exchanges;
this.exchange = exchange;
this.slotType = slotType;
}
}

View File

@@ -87,6 +87,11 @@ public class ZmqChannelManager implements Closeable {
socket.setLinger(1000);
socket.setSndHWM(10000);
socket.setRcvHWM(10000);
if (socketType == SocketType.ROUTER) {
// Return false (EHOSTUNREACH) instead of silently dropping messages to
// unknown/disconnected peer identities. Enables immediate stale-slot detection.
socket.setRouterMandatory(true);
}
socket.bind(endpoint);
sockets.put(channel.name(), socket);
LOG.info("Bound {} to {}", description, endpoint);