container lifecycle management
This commit is contained in:
1
.idea/ai.iml
generated
1
.idea/ai.iml
generated
@@ -8,6 +8,7 @@
|
|||||||
<excludeFolder url="file://$MODULE_DIR$/.venv" />
|
<excludeFolder url="file://$MODULE_DIR$/.venv" />
|
||||||
<excludeFolder url="file://$MODULE_DIR$/backend.old/data" />
|
<excludeFolder url="file://$MODULE_DIR$/backend.old/data" />
|
||||||
<excludeFolder url="file://$MODULE_DIR$/doc.old" />
|
<excludeFolder url="file://$MODULE_DIR$/doc.old" />
|
||||||
|
<excludeFolder url="file://$MODULE_DIR$/backend.old" />
|
||||||
</content>
|
</content>
|
||||||
<orderEntry type="jdk" jdkName="Python 3.12 (ai)" jdkType="Python SDK" />
|
<orderEntry type="jdk" jdkName="Python 3.12 (ai)" jdkType="Python SDK" />
|
||||||
<orderEntry type="sourceFolder" forTests="false" />
|
<orderEntry type="sourceFolder" forTests="false" />
|
||||||
|
|||||||
15
AGENT.md
Normal file
15
AGENT.md
Normal file
@@ -0,0 +1,15 @@
|
|||||||
|
We're building an AI-first trading platform by integrating user-facing TradingView charts and chat with an AI assistant that helps do research, develop indicators (signals), and write strategies, using the Dexorder trading framework we provide.
|
||||||
|
|
||||||
|
This monorepo has:
|
||||||
|
bin/ scripts, mostly build and deploy
|
||||||
|
deploy/ kubernetes deployment and configuration
|
||||||
|
doc/ documentation
|
||||||
|
flink/ Apache Flink application mode processes data from Kafka
|
||||||
|
iceberg/ Apache Iceberg for historical OHLC etc
|
||||||
|
ingestor/ Data sources publish to Kafka
|
||||||
|
kafka/ Apache Kafka
|
||||||
|
protobuf/ Messaging entities
|
||||||
|
relay/ Rust+ZeroMQ stateless router
|
||||||
|
web/ Vue 3 / Pinia / PrimeVue / TradingView
|
||||||
|
|
||||||
|
See doc/protocol.md for messaging architecture
|
||||||
@@ -4,6 +4,7 @@
|
|||||||
set -e
|
set -e
|
||||||
|
|
||||||
DIR="$(cd "$(dirname "$0")" && pwd)"
|
DIR="$(cd "$(dirname "$0")" && pwd)"
|
||||||
|
ROOT_DIR="$(cd "$DIR/.." && pwd)"
|
||||||
|
|
||||||
echo "Building all container images..."
|
echo "Building all container images..."
|
||||||
echo
|
echo
|
||||||
@@ -13,5 +14,31 @@ echo
|
|||||||
"$DIR/build" ingestor "$@"
|
"$DIR/build" ingestor "$@"
|
||||||
"$DIR/build" web "$@"
|
"$DIR/build" web "$@"
|
||||||
|
|
||||||
|
# Build lifecycle-sidecar (Go binary, no protobuf sync needed)
|
||||||
|
echo "Building lifecycle-sidecar..."
|
||||||
|
cd "$ROOT_DIR/lifecycle-sidecar"
|
||||||
|
|
||||||
|
# Determine tag
|
||||||
|
if [ "$1" == "dev" ]; then
|
||||||
|
TAG="dev$(date +%Y%m%d%H%M%S)"
|
||||||
|
else
|
||||||
|
# Check for uncommitted changes
|
||||||
|
DIRTY="$(git status | grep 'Changes ' || true)"
|
||||||
|
if [ "$DIRTY" != "" ]; then
|
||||||
|
echo "lifecycle-sidecar has uncommitted changes."
|
||||||
|
echo "Use '$0 dev' to build a development-tagged version instead."
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
TAG="$(git log --oneline | head -1 | cut -d ' ' -f 1)"
|
||||||
|
fi
|
||||||
|
|
||||||
|
REMOTE=${REMOTE:-ghcr.io/dexorder}
|
||||||
|
|
||||||
|
docker build -t lifecycle-sidecar:latest -t lifecycle-sidecar:$TAG .
|
||||||
|
docker tag lifecycle-sidecar:$TAG $REMOTE/lifecycle-sidecar:$TAG
|
||||||
|
docker tag $REMOTE/lifecycle-sidecar:$TAG $REMOTE/lifecycle-sidecar:latest
|
||||||
|
|
||||||
|
echo "$(date)" built $REMOTE/lifecycle-sidecar:$TAG
|
||||||
|
|
||||||
echo
|
echo
|
||||||
echo "All images built successfully!"
|
echo "All images built successfully!"
|
||||||
|
|||||||
17
bin/dev
17
bin/dev
@@ -19,7 +19,7 @@ usage() {
|
|||||||
echo "Commands:"
|
echo "Commands:"
|
||||||
echo " start Start minikube and deploy all services"
|
echo " start Start minikube and deploy all services"
|
||||||
echo " stop Stop minikube"
|
echo " stop Stop minikube"
|
||||||
echo " restart [svc] Rebuild and redeploy all services, or just one (relay|ingestor|flink)"
|
echo " restart [svc] Rebuild and redeploy all services, or just one (relay|ingestor|flink|sidecar)"
|
||||||
echo " rebuild [svc] Rebuild all custom images, or just one"
|
echo " rebuild [svc] Rebuild all custom images, or just one"
|
||||||
echo " deploy [svc] Deploy/update all services, or just one"
|
echo " deploy [svc] Deploy/update all services, or just one"
|
||||||
echo " status Show status of all services"
|
echo " status Show status of all services"
|
||||||
@@ -127,12 +127,23 @@ rebuild_images() {
|
|||||||
docker tag "dexorder/ai-flink:$FLINK_TAG" "dexorder/flink:$FLINK_TAG"
|
docker tag "dexorder/ai-flink:$FLINK_TAG" "dexorder/flink:$FLINK_TAG"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Save the tags for deployment (all three, preserving any we didn't rebuild)
|
# Build lifecycle-sidecar (Go binary)
|
||||||
|
if [ "$service" == "all" ] || [ "$service" == "lifecycle-sidecar" ] || [ "$service" == "sidecar" ]; then
|
||||||
|
echo -e "${GREEN}→${NC} Building lifecycle-sidecar..."
|
||||||
|
cd "$ROOT_DIR/lifecycle-sidecar"
|
||||||
|
SIDECAR_TAG="dev$(date +%Y%m%d%H%M%S)"
|
||||||
|
docker build -t lifecycle-sidecar:latest -t lifecycle-sidecar:$SIDECAR_TAG . || exit 1
|
||||||
|
echo -e "${GREEN}✓ Built lifecycle-sidecar:$SIDECAR_TAG${NC}"
|
||||||
|
cd "$ROOT_DIR"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Save the tags for deployment (all services, preserving any we didn't rebuild)
|
||||||
echo "RELAY_TAG=$RELAY_TAG" > "$ROOT_DIR/.dev-image-tag"
|
echo "RELAY_TAG=$RELAY_TAG" > "$ROOT_DIR/.dev-image-tag"
|
||||||
echo "INGEST_TAG=$INGEST_TAG" >> "$ROOT_DIR/.dev-image-tag"
|
echo "INGEST_TAG=$INGEST_TAG" >> "$ROOT_DIR/.dev-image-tag"
|
||||||
echo "FLINK_TAG=$FLINK_TAG" >> "$ROOT_DIR/.dev-image-tag"
|
echo "FLINK_TAG=$FLINK_TAG" >> "$ROOT_DIR/.dev-image-tag"
|
||||||
|
echo "SIDECAR_TAG=$SIDECAR_TAG" >> "$ROOT_DIR/.dev-image-tag"
|
||||||
|
|
||||||
echo -e "${GREEN}✓ Images built: relay=$RELAY_TAG, ingestor=$INGEST_TAG, flink=$FLINK_TAG${NC}"
|
echo -e "${GREEN}✓ Images built: relay=$RELAY_TAG, ingestor=$INGEST_TAG, flink=$FLINK_TAG, sidecar=$SIDECAR_TAG${NC}"
|
||||||
}
|
}
|
||||||
|
|
||||||
deploy_services() {
|
deploy_services() {
|
||||||
|
|||||||
230
client-py/dexorder/lifecycle_manager.py
Normal file
230
client-py/dexorder/lifecycle_manager.py
Normal file
@@ -0,0 +1,230 @@
|
|||||||
|
"""
|
||||||
|
Container lifecycle manager for agent containers.
|
||||||
|
|
||||||
|
Tracks activity and triggers to determine when the container should shut down.
|
||||||
|
Exits with code 42 to signal clean idle shutdown to the lifecycle sidecar.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import signal
|
||||||
|
import sys
|
||||||
|
from datetime import datetime, timedelta
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Optional, Set
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Exit code to signal clean idle shutdown to sidecar
|
||||||
|
EXIT_CODE_IDLE_SHUTDOWN = 42
|
||||||
|
|
||||||
|
# File to write exit code for sidecar to read
|
||||||
|
EXIT_CODE_FILE = Path("/var/run/agent/exit_code")
|
||||||
|
|
||||||
|
|
||||||
|
class LifecycleManager:
|
||||||
|
"""
|
||||||
|
Manages container lifecycle based on activity and triggers.
|
||||||
|
|
||||||
|
The container shuts itself down when:
|
||||||
|
1. No active triggers (data subscriptions, CEP patterns, etc.)
|
||||||
|
2. No recent user activity (MCP calls)
|
||||||
|
3. Idle timeout has elapsed
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
idle_timeout_minutes: int = 15,
|
||||||
|
check_interval_seconds: int = 60,
|
||||||
|
enable_shutdown: bool = True,
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
Initialize lifecycle manager.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
idle_timeout_minutes: Minutes of inactivity before shutdown
|
||||||
|
check_interval_seconds: Interval between idle checks
|
||||||
|
enable_shutdown: If False, only log idle state without exiting (for testing)
|
||||||
|
"""
|
||||||
|
self.idle_timeout = timedelta(minutes=idle_timeout_minutes)
|
||||||
|
self.check_interval = check_interval_seconds
|
||||||
|
self.enable_shutdown = enable_shutdown
|
||||||
|
|
||||||
|
self.last_activity: datetime = datetime.now()
|
||||||
|
self.active_triggers: Set[str] = set()
|
||||||
|
self._running = False
|
||||||
|
self._check_task: Optional[asyncio.Task] = None
|
||||||
|
|
||||||
|
logger.info(
|
||||||
|
"Lifecycle manager initialized: idle_timeout=%dm, check_interval=%ds, shutdown_enabled=%s",
|
||||||
|
idle_timeout_minutes,
|
||||||
|
check_interval_seconds,
|
||||||
|
enable_shutdown,
|
||||||
|
)
|
||||||
|
|
||||||
|
def record_activity(self) -> None:
|
||||||
|
"""
|
||||||
|
Record user activity (called on MCP tool/resource/prompt invocations).
|
||||||
|
Resets the idle timer.
|
||||||
|
"""
|
||||||
|
self.last_activity = datetime.now()
|
||||||
|
logger.debug("Activity recorded, idle timer reset")
|
||||||
|
|
||||||
|
def update_triggers(self, triggers: Set[str]) -> None:
|
||||||
|
"""
|
||||||
|
Update the set of active triggers.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
triggers: Set of active trigger IDs (data subscriptions, CEP patterns, etc.)
|
||||||
|
"""
|
||||||
|
if triggers != self.active_triggers:
|
||||||
|
added = triggers - self.active_triggers
|
||||||
|
removed = self.active_triggers - triggers
|
||||||
|
|
||||||
|
if added:
|
||||||
|
logger.info("Triggers added: %s", added)
|
||||||
|
if removed:
|
||||||
|
logger.info("Triggers removed: %s", removed)
|
||||||
|
|
||||||
|
self.active_triggers = triggers
|
||||||
|
logger.info("Active triggers: %d", len(self.active_triggers))
|
||||||
|
|
||||||
|
def add_trigger(self, trigger_id: str) -> None:
|
||||||
|
"""Add a single trigger."""
|
||||||
|
if trigger_id not in self.active_triggers:
|
||||||
|
self.active_triggers.add(trigger_id)
|
||||||
|
logger.info("Trigger added: %s (total: %d)", trigger_id, len(self.active_triggers))
|
||||||
|
|
||||||
|
def remove_trigger(self, trigger_id: str) -> None:
|
||||||
|
"""Remove a single trigger."""
|
||||||
|
if trigger_id in self.active_triggers:
|
||||||
|
self.active_triggers.remove(trigger_id)
|
||||||
|
logger.info("Trigger removed: %s (total: %d)", trigger_id, len(self.active_triggers))
|
||||||
|
|
||||||
|
def is_idle(self) -> bool:
|
||||||
|
"""
|
||||||
|
Check if container is idle and should shut down.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
True if no triggers and idle timeout exceeded
|
||||||
|
"""
|
||||||
|
has_triggers = len(self.active_triggers) > 0
|
||||||
|
idle_time = datetime.now() - self.last_activity
|
||||||
|
is_past_timeout = idle_time > self.idle_timeout
|
||||||
|
|
||||||
|
if has_triggers:
|
||||||
|
logger.debug("Not idle: has %d active triggers", len(self.active_triggers))
|
||||||
|
return False
|
||||||
|
|
||||||
|
if not is_past_timeout:
|
||||||
|
logger.debug(
|
||||||
|
"Not idle: last activity %s ago (timeout: %s)",
|
||||||
|
idle_time,
|
||||||
|
self.idle_timeout,
|
||||||
|
)
|
||||||
|
return False
|
||||||
|
|
||||||
|
logger.info(
|
||||||
|
"Container is idle: no triggers and %s since last activity", idle_time
|
||||||
|
)
|
||||||
|
return True
|
||||||
|
|
||||||
|
async def start(self) -> None:
|
||||||
|
"""Start the lifecycle manager background task."""
|
||||||
|
if self._running:
|
||||||
|
logger.warning("Lifecycle manager already running")
|
||||||
|
return
|
||||||
|
|
||||||
|
self._running = True
|
||||||
|
self._check_task = asyncio.create_task(self._check_loop())
|
||||||
|
logger.info("Lifecycle manager started")
|
||||||
|
|
||||||
|
async def stop(self) -> None:
|
||||||
|
"""Stop the lifecycle manager."""
|
||||||
|
self._running = False
|
||||||
|
if self._check_task:
|
||||||
|
self._check_task.cancel()
|
||||||
|
try:
|
||||||
|
await self._check_task
|
||||||
|
except asyncio.CancelledError:
|
||||||
|
pass
|
||||||
|
logger.info("Lifecycle manager stopped")
|
||||||
|
|
||||||
|
async def _check_loop(self) -> None:
|
||||||
|
"""Background task that periodically checks if container should shut down."""
|
||||||
|
while self._running:
|
||||||
|
try:
|
||||||
|
await asyncio.sleep(self.check_interval)
|
||||||
|
|
||||||
|
if self.is_idle():
|
||||||
|
if self.enable_shutdown:
|
||||||
|
logger.info("Initiating idle shutdown (exit code %d)", EXIT_CODE_IDLE_SHUTDOWN)
|
||||||
|
self._write_exit_code(EXIT_CODE_IDLE_SHUTDOWN)
|
||||||
|
|
||||||
|
# Give sidecar a moment to see the exit code file
|
||||||
|
await asyncio.sleep(1)
|
||||||
|
|
||||||
|
# Exit with special code
|
||||||
|
os._exit(EXIT_CODE_IDLE_SHUTDOWN)
|
||||||
|
else:
|
||||||
|
logger.info(
|
||||||
|
"Container is idle but shutdown is disabled (testing mode)"
|
||||||
|
)
|
||||||
|
|
||||||
|
except asyncio.CancelledError:
|
||||||
|
logger.info("Check loop cancelled")
|
||||||
|
raise
|
||||||
|
except Exception as e:
|
||||||
|
logger.error("Error in lifecycle check loop: %s", e, exc_info=True)
|
||||||
|
|
||||||
|
def _write_exit_code(self, code: int) -> None:
|
||||||
|
"""Write exit code to shared file for sidecar to read."""
|
||||||
|
try:
|
||||||
|
EXIT_CODE_FILE.parent.mkdir(parents=True, exist_ok=True)
|
||||||
|
EXIT_CODE_FILE.write_text(str(code))
|
||||||
|
logger.debug("Wrote exit code %d to %s", code, EXIT_CODE_FILE)
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning("Failed to write exit code file: %s", e)
|
||||||
|
|
||||||
|
def setup_signal_handlers(self) -> None:
|
||||||
|
"""
|
||||||
|
Setup signal handlers for graceful shutdown.
|
||||||
|
On SIGTERM/SIGINT, exit normally (not with code 42) to allow restart.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def signal_handler(signum, frame):
|
||||||
|
logger.info("Received signal %d, exiting normally", signum)
|
||||||
|
sys.exit(0)
|
||||||
|
|
||||||
|
signal.signal(signal.SIGTERM, signal_handler)
|
||||||
|
signal.signal(signal.SIGINT, signal_handler)
|
||||||
|
|
||||||
|
|
||||||
|
# Singleton instance for easy access across the application
|
||||||
|
_lifecycle_manager: Optional[LifecycleManager] = None
|
||||||
|
|
||||||
|
|
||||||
|
def get_lifecycle_manager() -> LifecycleManager:
|
||||||
|
"""Get or create the global lifecycle manager instance."""
|
||||||
|
global _lifecycle_manager
|
||||||
|
if _lifecycle_manager is None:
|
||||||
|
# Load configuration from environment
|
||||||
|
idle_timeout = int(os.environ.get("IDLE_TIMEOUT_MINUTES", "15"))
|
||||||
|
check_interval = int(os.environ.get("IDLE_CHECK_INTERVAL_SECONDS", "60"))
|
||||||
|
enable_shutdown = os.environ.get("ENABLE_IDLE_SHUTDOWN", "true").lower() == "true"
|
||||||
|
|
||||||
|
_lifecycle_manager = LifecycleManager(
|
||||||
|
idle_timeout_minutes=idle_timeout,
|
||||||
|
check_interval_seconds=check_interval,
|
||||||
|
enable_shutdown=enable_shutdown,
|
||||||
|
)
|
||||||
|
return _lifecycle_manager
|
||||||
|
|
||||||
|
|
||||||
|
async def start_lifecycle_manager() -> LifecycleManager:
|
||||||
|
"""Initialize and start the lifecycle manager."""
|
||||||
|
manager = get_lifecycle_manager()
|
||||||
|
manager.setup_signal_handlers()
|
||||||
|
await manager.start()
|
||||||
|
return manager
|
||||||
43
client-py/dexorder/mcp_auth_middleware.py
Normal file
43
client-py/dexorder/mcp_auth_middleware.py
Normal file
@@ -0,0 +1,43 @@
|
|||||||
|
# openclaw/auth.py
|
||||||
|
|
||||||
|
class MCPAuthMiddleware:
|
||||||
|
"""Authenticates incoming MCP connections based on configured mode."""
|
||||||
|
|
||||||
|
def __init__(self, config: AuthConfig):
|
||||||
|
self.config = config
|
||||||
|
self._jwks_client = None # lazy-loaded for platform mode
|
||||||
|
|
||||||
|
async def authenticate(self, request) -> AuthContext:
|
||||||
|
match self.config.mode:
|
||||||
|
case "local":
|
||||||
|
# stdio transport or localhost-only binding
|
||||||
|
# No auth needed — if you can exec into the container,
|
||||||
|
# you're the user
|
||||||
|
return AuthContext(user_id=self.config.local_user_id,
|
||||||
|
source="local")
|
||||||
|
|
||||||
|
case "token":
|
||||||
|
# User-generated API key (standalone remote access)
|
||||||
|
token = extract_bearer_token(request)
|
||||||
|
if not verify_token_hash(token, self.config.tokens):
|
||||||
|
raise AuthError("Invalid API token")
|
||||||
|
return AuthContext(user_id=self.config.local_user_id,
|
||||||
|
source="api_key")
|
||||||
|
|
||||||
|
case "platform":
|
||||||
|
# JWT signed by the OpenClaw platform
|
||||||
|
token = extract_bearer_token(request)
|
||||||
|
claims = await self._verify_platform_jwt(token)
|
||||||
|
if claims["sub"] != self.config.expected_user_id:
|
||||||
|
raise AuthError("User ID mismatch")
|
||||||
|
return AuthContext(user_id=claims["sub"],
|
||||||
|
source="platform",
|
||||||
|
scopes=claims.get("scopes", []))
|
||||||
|
|
||||||
|
async def _verify_platform_jwt(self, token: str) -> dict:
|
||||||
|
if not self._jwks_client:
|
||||||
|
self._jwks_client = JWKSClient(self.config.platform_jwks_url)
|
||||||
|
signing_key = await self._jwks_client.get_signing_key_from_jwt(token)
|
||||||
|
return jwt.decode(token, signing_key.key,
|
||||||
|
algorithms=["RS256"],
|
||||||
|
audience="openclaw-mcp")
|
||||||
110
deploy/k8s/base/admission-policy.yaml
Normal file
110
deploy/k8s/base/admission-policy.yaml
Normal file
@@ -0,0 +1,110 @@
|
|||||||
|
# ValidatingAdmissionPolicy to restrict images in dexorder-agents namespace
|
||||||
|
# Requires Kubernetes 1.30+ (or 1.28+ with feature gate)
|
||||||
|
# This is the critical security control that prevents arbitrary image execution
|
||||||
|
# even if the gateway is compromised.
|
||||||
|
---
|
||||||
|
apiVersion: admissionregistration.k8s.io/v1
|
||||||
|
kind: ValidatingAdmissionPolicy
|
||||||
|
metadata:
|
||||||
|
name: dexorder-agent-image-policy
|
||||||
|
spec:
|
||||||
|
failurePolicy: Fail
|
||||||
|
matchConstraints:
|
||||||
|
namespaceSelector:
|
||||||
|
matchLabels:
|
||||||
|
dexorder.io/type: agents
|
||||||
|
resourceRules:
|
||||||
|
- apiGroups: ["apps"]
|
||||||
|
apiVersions: ["v1"]
|
||||||
|
resources: ["deployments"]
|
||||||
|
operations: ["CREATE", "UPDATE"]
|
||||||
|
validations:
|
||||||
|
# Only allow images from our approved registry with agent prefix
|
||||||
|
- expression: |
|
||||||
|
object.spec.template.spec.containers.all(c,
|
||||||
|
c.image.startsWith('ghcr.io/dexorder/agent:') ||
|
||||||
|
c.image.startsWith('ghcr.io/dexorder/agent-'))
|
||||||
|
message: "Only approved dexorder agent images are allowed in the agents namespace"
|
||||||
|
reason: Forbidden
|
||||||
|
|
||||||
|
# No privileged containers
|
||||||
|
- expression: |
|
||||||
|
object.spec.template.spec.containers.all(c,
|
||||||
|
!has(c.securityContext) ||
|
||||||
|
!has(c.securityContext.privileged) ||
|
||||||
|
c.securityContext.privileged == false)
|
||||||
|
message: "Privileged containers are not allowed"
|
||||||
|
reason: Forbidden
|
||||||
|
|
||||||
|
# No hostPath volumes
|
||||||
|
- expression: |
|
||||||
|
!has(object.spec.template.spec.volumes) ||
|
||||||
|
object.spec.template.spec.volumes.all(v,
|
||||||
|
!has(v.hostPath))
|
||||||
|
message: "hostPath volumes are not allowed"
|
||||||
|
reason: Forbidden
|
||||||
|
|
||||||
|
# No hostNetwork
|
||||||
|
- expression: |
|
||||||
|
!has(object.spec.template.spec.hostNetwork) ||
|
||||||
|
object.spec.template.spec.hostNetwork == false
|
||||||
|
message: "hostNetwork is not allowed"
|
||||||
|
reason: Forbidden
|
||||||
|
|
||||||
|
# No hostPID
|
||||||
|
- expression: |
|
||||||
|
!has(object.spec.template.spec.hostPID) ||
|
||||||
|
object.spec.template.spec.hostPID == false
|
||||||
|
message: "hostPID is not allowed"
|
||||||
|
reason: Forbidden
|
||||||
|
|
||||||
|
# Containers must run as non-root
|
||||||
|
- expression: |
|
||||||
|
object.spec.template.spec.containers.all(c,
|
||||||
|
has(c.securityContext) &&
|
||||||
|
has(c.securityContext.runAsNonRoot) &&
|
||||||
|
c.securityContext.runAsNonRoot == true)
|
||||||
|
message: "Containers must run as non-root"
|
||||||
|
reason: Forbidden
|
||||||
|
|
||||||
|
# Must drop all capabilities
|
||||||
|
- expression: |
|
||||||
|
object.spec.template.spec.containers.all(c,
|
||||||
|
has(c.securityContext) &&
|
||||||
|
has(c.securityContext.capabilities) &&
|
||||||
|
has(c.securityContext.capabilities.drop) &&
|
||||||
|
c.securityContext.capabilities.drop.exists(cap, cap == 'ALL'))
|
||||||
|
message: "Containers must drop all capabilities"
|
||||||
|
reason: Forbidden
|
||||||
|
|
||||||
|
# Read-only root filesystem
|
||||||
|
- expression: |
|
||||||
|
object.spec.template.spec.containers.all(c,
|
||||||
|
has(c.securityContext) &&
|
||||||
|
has(c.securityContext.readOnlyRootFilesystem) &&
|
||||||
|
c.securityContext.readOnlyRootFilesystem == true)
|
||||||
|
message: "Containers must have read-only root filesystem"
|
||||||
|
reason: Forbidden
|
||||||
|
|
||||||
|
# Resource limits must be set
|
||||||
|
- expression: |
|
||||||
|
object.spec.template.spec.containers.all(c,
|
||||||
|
has(c.resources) &&
|
||||||
|
has(c.resources.limits) &&
|
||||||
|
has(c.resources.limits.memory) &&
|
||||||
|
has(c.resources.limits.cpu))
|
||||||
|
message: "Containers must have resource limits set"
|
||||||
|
reason: Forbidden
|
||||||
|
---
|
||||||
|
apiVersion: admissionregistration.k8s.io/v1
|
||||||
|
kind: ValidatingAdmissionPolicyBinding
|
||||||
|
metadata:
|
||||||
|
name: dexorder-agent-image-policy-binding
|
||||||
|
spec:
|
||||||
|
policyName: dexorder-agent-image-policy
|
||||||
|
validationActions:
|
||||||
|
- Deny
|
||||||
|
matchResources:
|
||||||
|
namespaceSelector:
|
||||||
|
matchLabels:
|
||||||
|
dexorder.io/type: agents
|
||||||
221
deploy/k8s/base/agent-deployment-example.yaml
Normal file
221
deploy/k8s/base/agent-deployment-example.yaml
Normal file
@@ -0,0 +1,221 @@
|
|||||||
|
# Example agent deployment with lifecycle sidecar
|
||||||
|
# This would be created by the gateway for each user
|
||||||
|
---
|
||||||
|
apiVersion: apps/v1
|
||||||
|
kind: Deployment
|
||||||
|
metadata:
|
||||||
|
name: agent-user-abc123
|
||||||
|
namespace: dexorder-agents
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/name: agent
|
||||||
|
app.kubernetes.io/component: user-agent
|
||||||
|
dexorder.io/component: agent
|
||||||
|
dexorder.io/user-id: user-abc123
|
||||||
|
dexorder.io/deployment: agent-user-abc123
|
||||||
|
spec:
|
||||||
|
replicas: 1
|
||||||
|
selector:
|
||||||
|
matchLabels:
|
||||||
|
dexorder.io/user-id: user-abc123
|
||||||
|
template:
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
dexorder.io/component: agent
|
||||||
|
dexorder.io/user-id: user-abc123
|
||||||
|
dexorder.io/deployment: agent-user-abc123
|
||||||
|
spec:
|
||||||
|
serviceAccountName: agent-lifecycle
|
||||||
|
|
||||||
|
# Share PID namespace so sidecar can monitor main container
|
||||||
|
shareProcessNamespace: true
|
||||||
|
|
||||||
|
# Security context
|
||||||
|
securityContext:
|
||||||
|
runAsNonRoot: true
|
||||||
|
runAsUser: 1000
|
||||||
|
fsGroup: 1000
|
||||||
|
seccompProfile:
|
||||||
|
type: RuntimeDefault
|
||||||
|
|
||||||
|
containers:
|
||||||
|
# Main agent container
|
||||||
|
- name: agent
|
||||||
|
image: ghcr.io/dexorder/agent:latest
|
||||||
|
imagePullPolicy: Always
|
||||||
|
|
||||||
|
# Security context (required by admission policy)
|
||||||
|
securityContext:
|
||||||
|
allowPrivilegeEscalation: false
|
||||||
|
runAsNonRoot: true
|
||||||
|
runAsUser: 1000
|
||||||
|
readOnlyRootFilesystem: true
|
||||||
|
capabilities:
|
||||||
|
drop:
|
||||||
|
- ALL
|
||||||
|
|
||||||
|
# Resource limits (required by admission policy)
|
||||||
|
resources:
|
||||||
|
requests:
|
||||||
|
memory: "256Mi"
|
||||||
|
cpu: "100m"
|
||||||
|
limits:
|
||||||
|
memory: "1Gi"
|
||||||
|
cpu: "1000m"
|
||||||
|
|
||||||
|
# Environment variables
|
||||||
|
env:
|
||||||
|
- name: USER_ID
|
||||||
|
value: "user-abc123"
|
||||||
|
- name: IDLE_TIMEOUT_MINUTES
|
||||||
|
value: "15"
|
||||||
|
- name: IDLE_CHECK_INTERVAL_SECONDS
|
||||||
|
value: "60"
|
||||||
|
- name: ENABLE_IDLE_SHUTDOWN
|
||||||
|
value: "true"
|
||||||
|
- name: MCP_SERVER_PORT
|
||||||
|
value: "3000"
|
||||||
|
- name: ZMQ_CONTROL_PORT
|
||||||
|
value: "5555"
|
||||||
|
|
||||||
|
# Ports
|
||||||
|
ports:
|
||||||
|
- name: mcp
|
||||||
|
containerPort: 3000
|
||||||
|
protocol: TCP
|
||||||
|
- name: zmq-control
|
||||||
|
containerPort: 5555
|
||||||
|
protocol: TCP
|
||||||
|
|
||||||
|
# Volume mounts
|
||||||
|
volumeMounts:
|
||||||
|
- name: agent-data
|
||||||
|
mountPath: /app/data
|
||||||
|
- name: tmp
|
||||||
|
mountPath: /tmp
|
||||||
|
- name: shared-run
|
||||||
|
mountPath: /var/run/agent
|
||||||
|
|
||||||
|
# Liveness probe (agent's MCP server)
|
||||||
|
livenessProbe:
|
||||||
|
httpGet:
|
||||||
|
path: /health
|
||||||
|
port: mcp
|
||||||
|
initialDelaySeconds: 10
|
||||||
|
periodSeconds: 30
|
||||||
|
timeoutSeconds: 5
|
||||||
|
|
||||||
|
# Readiness probe
|
||||||
|
readinessProbe:
|
||||||
|
httpGet:
|
||||||
|
path: /ready
|
||||||
|
port: mcp
|
||||||
|
initialDelaySeconds: 5
|
||||||
|
periodSeconds: 10
|
||||||
|
|
||||||
|
# Lifecycle sidecar
|
||||||
|
- name: lifecycle-sidecar
|
||||||
|
image: ghcr.io/dexorder/lifecycle-sidecar:latest
|
||||||
|
imagePullPolicy: Always
|
||||||
|
|
||||||
|
# Security context
|
||||||
|
securityContext:
|
||||||
|
allowPrivilegeEscalation: false
|
||||||
|
runAsNonRoot: true
|
||||||
|
runAsUser: 1000
|
||||||
|
readOnlyRootFilesystem: true
|
||||||
|
capabilities:
|
||||||
|
drop:
|
||||||
|
- ALL
|
||||||
|
|
||||||
|
# Resource limits
|
||||||
|
resources:
|
||||||
|
requests:
|
||||||
|
memory: "32Mi"
|
||||||
|
cpu: "10m"
|
||||||
|
limits:
|
||||||
|
memory: "64Mi"
|
||||||
|
cpu: "50m"
|
||||||
|
|
||||||
|
# Environment variables (injected via downward API)
|
||||||
|
env:
|
||||||
|
- name: NAMESPACE
|
||||||
|
valueFrom:
|
||||||
|
fieldRef:
|
||||||
|
fieldPath: metadata.namespace
|
||||||
|
- name: DEPLOYMENT_NAME
|
||||||
|
valueFrom:
|
||||||
|
fieldRef:
|
||||||
|
fieldPath: metadata.labels['dexorder.io/deployment']
|
||||||
|
- name: USER_TYPE
|
||||||
|
value: "free" # Gateway sets this based on license
|
||||||
|
- name: MAIN_CONTAINER_PID
|
||||||
|
value: "1" # In shared PID namespace, main container is typically PID 1
|
||||||
|
|
||||||
|
# Volume mounts
|
||||||
|
volumeMounts:
|
||||||
|
- name: shared-run
|
||||||
|
mountPath: /var/run/agent
|
||||||
|
readOnly: true
|
||||||
|
|
||||||
|
# Volumes
|
||||||
|
volumes:
|
||||||
|
# Persistent data (user files, state)
|
||||||
|
- name: agent-data
|
||||||
|
persistentVolumeClaim:
|
||||||
|
claimName: agent-user-abc123-data
|
||||||
|
|
||||||
|
# Temporary writable filesystem (read-only rootfs)
|
||||||
|
- name: tmp
|
||||||
|
emptyDir:
|
||||||
|
medium: Memory
|
||||||
|
sizeLimit: 128Mi
|
||||||
|
|
||||||
|
# Shared between main container and sidecar
|
||||||
|
- name: shared-run
|
||||||
|
emptyDir:
|
||||||
|
medium: Memory
|
||||||
|
sizeLimit: 1Mi
|
||||||
|
|
||||||
|
# Restart policy
|
||||||
|
restartPolicy: Always
|
||||||
|
|
||||||
|
# Termination grace period
|
||||||
|
terminationGracePeriodSeconds: 30
|
||||||
|
---
|
||||||
|
# PVC for agent persistent data
|
||||||
|
apiVersion: v1
|
||||||
|
kind: PersistentVolumeClaim
|
||||||
|
metadata:
|
||||||
|
name: agent-user-abc123-data
|
||||||
|
namespace: dexorder-agents
|
||||||
|
labels:
|
||||||
|
dexorder.io/user-id: user-abc123
|
||||||
|
spec:
|
||||||
|
accessModes:
|
||||||
|
- ReadWriteOnce
|
||||||
|
resources:
|
||||||
|
requests:
|
||||||
|
storage: 1Gi
|
||||||
|
storageClassName: standard # Or your preferred storage class
|
||||||
|
---
|
||||||
|
# Service to expose agent MCP endpoint
|
||||||
|
apiVersion: v1
|
||||||
|
kind: Service
|
||||||
|
metadata:
|
||||||
|
name: agent-user-abc123
|
||||||
|
namespace: dexorder-agents
|
||||||
|
labels:
|
||||||
|
dexorder.io/user-id: user-abc123
|
||||||
|
spec:
|
||||||
|
type: ClusterIP
|
||||||
|
selector:
|
||||||
|
dexorder.io/user-id: user-abc123
|
||||||
|
ports:
|
||||||
|
- name: mcp
|
||||||
|
port: 3000
|
||||||
|
targetPort: mcp
|
||||||
|
protocol: TCP
|
||||||
|
- name: zmq-control
|
||||||
|
port: 5555
|
||||||
|
targetPort: zmq-control
|
||||||
|
protocol: TCP
|
||||||
53
deploy/k8s/base/agent-quotas.yaml
Normal file
53
deploy/k8s/base/agent-quotas.yaml
Normal file
@@ -0,0 +1,53 @@
|
|||||||
|
# Resource constraints for the dexorder-agents namespace
|
||||||
|
# These limits apply regardless of what the gateway requests
|
||||||
|
---
|
||||||
|
# LimitRange: per-container defaults and maximums
|
||||||
|
apiVersion: v1
|
||||||
|
kind: LimitRange
|
||||||
|
metadata:
|
||||||
|
name: agent-limits
|
||||||
|
namespace: dexorder-agents
|
||||||
|
spec:
|
||||||
|
limits:
|
||||||
|
# Default limits applied if deployment doesn't specify
|
||||||
|
- type: Container
|
||||||
|
default:
|
||||||
|
memory: "512Mi"
|
||||||
|
cpu: "500m"
|
||||||
|
defaultRequest:
|
||||||
|
memory: "256Mi"
|
||||||
|
cpu: "100m"
|
||||||
|
# Maximum any single container can request
|
||||||
|
max:
|
||||||
|
memory: "2Gi"
|
||||||
|
cpu: "2000m"
|
||||||
|
min:
|
||||||
|
memory: "64Mi"
|
||||||
|
cpu: "50m"
|
||||||
|
# PVC size limits
|
||||||
|
- type: PersistentVolumeClaim
|
||||||
|
max:
|
||||||
|
storage: "10Gi"
|
||||||
|
min:
|
||||||
|
storage: "100Mi"
|
||||||
|
---
|
||||||
|
# ResourceQuota: total namespace limits
|
||||||
|
# Prevents a compromised gateway from exhausting cluster resources
|
||||||
|
apiVersion: v1
|
||||||
|
kind: ResourceQuota
|
||||||
|
metadata:
|
||||||
|
name: agent-quota
|
||||||
|
namespace: dexorder-agents
|
||||||
|
spec:
|
||||||
|
hard:
|
||||||
|
# Total compute limits for all agents combined
|
||||||
|
requests.cpu: "20"
|
||||||
|
requests.memory: "40Gi"
|
||||||
|
limits.cpu: "40"
|
||||||
|
limits.memory: "80Gi"
|
||||||
|
# Object count limits
|
||||||
|
pods: "100"
|
||||||
|
persistentvolumeclaims: "100"
|
||||||
|
services: "100"
|
||||||
|
# Storage limits
|
||||||
|
requests.storage: "500Gi"
|
||||||
65
deploy/k8s/base/gateway-rbac.yaml
Normal file
65
deploy/k8s/base/gateway-rbac.yaml
Normal file
@@ -0,0 +1,65 @@
|
|||||||
|
# RBAC for gateway to CREATE agent deployments only
|
||||||
|
# Principle of least privilege: gateway can ONLY create deployments/services/PVCs
|
||||||
|
# in the dexorder-agents namespace. Deletion is handled by the lifecycle sidecar.
|
||||||
|
# No pods, secrets, exec, or cross-namespace access.
|
||||||
|
---
|
||||||
|
apiVersion: v1
|
||||||
|
kind: ServiceAccount
|
||||||
|
metadata:
|
||||||
|
name: gateway
|
||||||
|
namespace: dexorder-system
|
||||||
|
---
|
||||||
|
# Role scoped to dexorder-agents namespace only
|
||||||
|
apiVersion: rbac.authorization.k8s.io/v1
|
||||||
|
kind: Role
|
||||||
|
metadata:
|
||||||
|
name: agent-creator
|
||||||
|
namespace: dexorder-agents
|
||||||
|
rules:
|
||||||
|
# Deployments: create and read only (deletion handled by sidecar)
|
||||||
|
- apiGroups: ["apps"]
|
||||||
|
resources: ["deployments"]
|
||||||
|
verbs: ["create", "get", "list", "watch", "patch", "update"]
|
||||||
|
|
||||||
|
# PVCs: create and read (deletion handled by sidecar)
|
||||||
|
- apiGroups: [""]
|
||||||
|
resources: ["persistentvolumeclaims"]
|
||||||
|
verbs: ["create", "get", "list", "watch"]
|
||||||
|
|
||||||
|
# Services: create and manage agent MCP endpoints
|
||||||
|
- apiGroups: [""]
|
||||||
|
resources: ["services"]
|
||||||
|
verbs: ["create", "get", "list", "watch", "patch", "update"]
|
||||||
|
|
||||||
|
# Read-only pod access for status checks (no exec!)
|
||||||
|
- apiGroups: [""]
|
||||||
|
resources: ["pods"]
|
||||||
|
verbs: ["get", "list", "watch"]
|
||||||
|
|
||||||
|
# Pod logs for debugging (read-only)
|
||||||
|
- apiGroups: [""]
|
||||||
|
resources: ["pods/log"]
|
||||||
|
verbs: ["get"]
|
||||||
|
|
||||||
|
# Explicitly NOT included:
|
||||||
|
# - deployments/delete - handled by lifecycle sidecar
|
||||||
|
# - pvc/delete - handled by lifecycle sidecar
|
||||||
|
# - services/delete - handled by lifecycle sidecar
|
||||||
|
# - pods (create/delete) - must go through deployments
|
||||||
|
# - pods/exec, pods/attach - no shell access
|
||||||
|
# - secrets, configmaps - no credential access
|
||||||
|
# - any resources in other namespaces
|
||||||
|
---
|
||||||
|
apiVersion: rbac.authorization.k8s.io/v1
|
||||||
|
kind: RoleBinding
|
||||||
|
metadata:
|
||||||
|
name: gateway-agent-creator
|
||||||
|
namespace: dexorder-agents
|
||||||
|
subjects:
|
||||||
|
- kind: ServiceAccount
|
||||||
|
name: gateway
|
||||||
|
namespace: dexorder-system
|
||||||
|
roleRef:
|
||||||
|
kind: Role
|
||||||
|
name: agent-creator
|
||||||
|
apiGroup: rbac.authorization.k8s.io
|
||||||
@@ -1,3 +1,6 @@
|
|||||||
|
# Runtime and security initialization for dexorder AI platform
|
||||||
|
# Apply this first: kubectl apply -f init.yaml
|
||||||
|
---
|
||||||
apiVersion: node.k8s.io/v1
|
apiVersion: node.k8s.io/v1
|
||||||
kind: RuntimeClass
|
kind: RuntimeClass
|
||||||
metadata:
|
metadata:
|
||||||
|
|||||||
@@ -1,5 +1,26 @@
|
|||||||
apiVersion: kustomize.config.k8s.io/v1beta1
|
apiVersion: kustomize.config.k8s.io/v1beta1
|
||||||
kind: Kustomization
|
kind: Kustomization
|
||||||
|
|
||||||
resources: []
|
resources:
|
||||||
# ingress.yaml - removed until we have services to expose
|
# Core initialization (runtime classes)
|
||||||
|
- init.yaml
|
||||||
|
# Namespace definitions with PodSecurity labels
|
||||||
|
- namespaces.yaml
|
||||||
|
# RBAC for gateway to create agents (creation only)
|
||||||
|
- gateway-rbac.yaml
|
||||||
|
# RBAC for lifecycle sidecar (self-deletion)
|
||||||
|
- lifecycle-sidecar-rbac.yaml
|
||||||
|
# Admission policies (image restriction, security requirements)
|
||||||
|
- admission-policy.yaml
|
||||||
|
# Resource quotas and limits for agents namespace
|
||||||
|
- agent-quotas.yaml
|
||||||
|
# Network isolation policies
|
||||||
|
- network-policies.yaml
|
||||||
|
# Gateway service (uncomment when ready)
|
||||||
|
# - gateway.yaml
|
||||||
|
# Example agent deployment (for reference, not applied by default)
|
||||||
|
# - agent-deployment-example.yaml
|
||||||
|
# Services (uncomment as needed)
|
||||||
|
# - backend.yaml
|
||||||
|
# - web.yaml
|
||||||
|
# - ingress.yaml
|
||||||
|
|||||||
53
deploy/k8s/base/lifecycle-sidecar-rbac.yaml
Normal file
53
deploy/k8s/base/lifecycle-sidecar-rbac.yaml
Normal file
@@ -0,0 +1,53 @@
|
|||||||
|
# RBAC for lifecycle sidecar - allows self-deletion only
|
||||||
|
# Each agent pod gets this ServiceAccount and can only delete its own deployment
|
||||||
|
---
|
||||||
|
apiVersion: v1
|
||||||
|
kind: ServiceAccount
|
||||||
|
metadata:
|
||||||
|
name: agent-lifecycle
|
||||||
|
namespace: dexorder-agents
|
||||||
|
---
|
||||||
|
# Role allowing deletion of deployments and PVCs
|
||||||
|
# This is scoped to the dexorder-agents namespace
|
||||||
|
apiVersion: rbac.authorization.k8s.io/v1
|
||||||
|
kind: Role
|
||||||
|
metadata:
|
||||||
|
name: agent-self-delete
|
||||||
|
namespace: dexorder-agents
|
||||||
|
rules:
|
||||||
|
# Allow getting and deleting deployments
|
||||||
|
- apiGroups: ["apps"]
|
||||||
|
resources: ["deployments"]
|
||||||
|
verbs: ["get", "delete"]
|
||||||
|
|
||||||
|
# Allow getting and deleting PVCs (for anonymous users)
|
||||||
|
- apiGroups: [""]
|
||||||
|
resources: ["persistentvolumeclaims"]
|
||||||
|
verbs: ["get", "delete"]
|
||||||
|
|
||||||
|
# Read-only access to pods (for status checking)
|
||||||
|
- apiGroups: [""]
|
||||||
|
resources: ["pods"]
|
||||||
|
verbs: ["get", "list"]
|
||||||
|
---
|
||||||
|
apiVersion: rbac.authorization.k8s.io/v1
|
||||||
|
kind: RoleBinding
|
||||||
|
metadata:
|
||||||
|
name: agent-self-delete
|
||||||
|
namespace: dexorder-agents
|
||||||
|
subjects:
|
||||||
|
- kind: ServiceAccount
|
||||||
|
name: agent-lifecycle
|
||||||
|
namespace: dexorder-agents
|
||||||
|
roleRef:
|
||||||
|
kind: Role
|
||||||
|
name: agent-self-delete
|
||||||
|
apiGroup: rbac.authorization.k8s.io
|
||||||
|
---
|
||||||
|
# Additional security: ValidatingWebhookConfiguration to restrict deletion
|
||||||
|
# This ensures sidecars can only delete their own deployment
|
||||||
|
# Requires a validating webhook server (can be added later)
|
||||||
|
# For now, we rely on:
|
||||||
|
# 1. Sidecar only knowing its own deployment name (from env)
|
||||||
|
# 2. RBAC limiting to dexorder-agents namespace
|
||||||
|
# 3. Admission policy restricting deployment creation (already defined)
|
||||||
24
deploy/k8s/base/namespaces.yaml
Normal file
24
deploy/k8s/base/namespaces.yaml
Normal file
@@ -0,0 +1,24 @@
|
|||||||
|
# Namespace definitions for dexorder AI platform
|
||||||
|
# - dexorder-system: gateway, flink, kafka, and other infrastructure
|
||||||
|
# - dexorder-agents: user agent containers (isolated, restricted)
|
||||||
|
---
|
||||||
|
apiVersion: v1
|
||||||
|
kind: Namespace
|
||||||
|
metadata:
|
||||||
|
name: dexorder-system
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/part-of: dexorder
|
||||||
|
dexorder.io/type: system
|
||||||
|
---
|
||||||
|
apiVersion: v1
|
||||||
|
kind: Namespace
|
||||||
|
metadata:
|
||||||
|
name: dexorder-agents
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/part-of: dexorder
|
||||||
|
dexorder.io/type: agents
|
||||||
|
# Enforce restricted pod security standards
|
||||||
|
pod-security.kubernetes.io/enforce: restricted
|
||||||
|
pod-security.kubernetes.io/enforce-version: latest
|
||||||
|
pod-security.kubernetes.io/audit: restricted
|
||||||
|
pod-security.kubernetes.io/warn: restricted
|
||||||
121
deploy/k8s/base/network-policies.yaml
Normal file
121
deploy/k8s/base/network-policies.yaml
Normal file
@@ -0,0 +1,121 @@
|
|||||||
|
# Network policies for agent isolation
|
||||||
|
# Agents can only communicate with specific services, not with each other
|
||||||
|
# or with the Kubernetes API
|
||||||
|
---
|
||||||
|
# Default deny all ingress and egress in agents namespace
|
||||||
|
apiVersion: networking.k8s.io/v1
|
||||||
|
kind: NetworkPolicy
|
||||||
|
metadata:
|
||||||
|
name: default-deny-all
|
||||||
|
namespace: dexorder-agents
|
||||||
|
spec:
|
||||||
|
podSelector: {}
|
||||||
|
policyTypes:
|
||||||
|
- Ingress
|
||||||
|
- Egress
|
||||||
|
---
|
||||||
|
# Allow agents to receive connections from gateway (MCP)
|
||||||
|
apiVersion: networking.k8s.io/v1
|
||||||
|
kind: NetworkPolicy
|
||||||
|
metadata:
|
||||||
|
name: allow-gateway-ingress
|
||||||
|
namespace: dexorder-agents
|
||||||
|
spec:
|
||||||
|
podSelector:
|
||||||
|
matchLabels:
|
||||||
|
dexorder.io/component: agent
|
||||||
|
policyTypes:
|
||||||
|
- Ingress
|
||||||
|
ingress:
|
||||||
|
- from:
|
||||||
|
- namespaceSelector:
|
||||||
|
matchLabels:
|
||||||
|
dexorder.io/type: system
|
||||||
|
podSelector:
|
||||||
|
matchLabels:
|
||||||
|
app: gateway
|
||||||
|
ports:
|
||||||
|
- protocol: TCP
|
||||||
|
port: 3000 # MCP server port
|
||||||
|
- protocol: TCP
|
||||||
|
port: 5555 # ZeroMQ control channel
|
||||||
|
---
|
||||||
|
# Allow agents to connect to required services
|
||||||
|
apiVersion: networking.k8s.io/v1
|
||||||
|
kind: NetworkPolicy
|
||||||
|
metadata:
|
||||||
|
name: allow-agent-egress
|
||||||
|
namespace: dexorder-agents
|
||||||
|
spec:
|
||||||
|
podSelector:
|
||||||
|
matchLabels:
|
||||||
|
dexorder.io/component: agent
|
||||||
|
policyTypes:
|
||||||
|
- Egress
|
||||||
|
egress:
|
||||||
|
# DNS resolution (required)
|
||||||
|
- to:
|
||||||
|
- namespaceSelector: {}
|
||||||
|
podSelector:
|
||||||
|
matchLabels:
|
||||||
|
k8s-app: kube-dns
|
||||||
|
ports:
|
||||||
|
- protocol: UDP
|
||||||
|
port: 53
|
||||||
|
- protocol: TCP
|
||||||
|
port: 53
|
||||||
|
# Gateway in system namespace (for callbacks)
|
||||||
|
- to:
|
||||||
|
- namespaceSelector:
|
||||||
|
matchLabels:
|
||||||
|
dexorder.io/type: system
|
||||||
|
podSelector:
|
||||||
|
matchLabels:
|
||||||
|
app: gateway
|
||||||
|
ports:
|
||||||
|
- protocol: TCP
|
||||||
|
port: 8080
|
||||||
|
# Kafka/Redpanda for data subscriptions
|
||||||
|
- to:
|
||||||
|
- namespaceSelector:
|
||||||
|
matchLabels:
|
||||||
|
dexorder.io/type: system
|
||||||
|
podSelector:
|
||||||
|
matchLabels:
|
||||||
|
app: redpanda
|
||||||
|
ports:
|
||||||
|
- protocol: TCP
|
||||||
|
port: 9092
|
||||||
|
# External HTTPS (for exchange APIs, LLM APIs)
|
||||||
|
- to:
|
||||||
|
- ipBlock:
|
||||||
|
cidr: 0.0.0.0/0
|
||||||
|
except:
|
||||||
|
# Block access to k8s API server (common ranges)
|
||||||
|
- 10.0.0.0/8
|
||||||
|
- 172.16.0.0/12
|
||||||
|
- 192.168.0.0/16
|
||||||
|
ports:
|
||||||
|
- protocol: TCP
|
||||||
|
port: 443
|
||||||
|
---
|
||||||
|
# System namespace: allow ingress from agents
|
||||||
|
apiVersion: networking.k8s.io/v1
|
||||||
|
kind: NetworkPolicy
|
||||||
|
metadata:
|
||||||
|
name: allow-agent-callbacks
|
||||||
|
namespace: dexorder-system
|
||||||
|
spec:
|
||||||
|
podSelector:
|
||||||
|
matchLabels:
|
||||||
|
app: gateway
|
||||||
|
policyTypes:
|
||||||
|
- Ingress
|
||||||
|
ingress:
|
||||||
|
- from:
|
||||||
|
- namespaceSelector:
|
||||||
|
matchLabels:
|
||||||
|
dexorder.io/type: agents
|
||||||
|
ports:
|
||||||
|
- protocol: TCP
|
||||||
|
port: 8080
|
||||||
97
deploy/k8s/dev/admission-policy-patch.yaml
Normal file
97
deploy/k8s/dev/admission-policy-patch.yaml
Normal file
@@ -0,0 +1,97 @@
|
|||||||
|
# Dev admission policy: allow local registry images
|
||||||
|
# In dev, we also allow images from localhost/minikube registry
|
||||||
|
---
|
||||||
|
apiVersion: admissionregistration.k8s.io/v1
|
||||||
|
kind: ValidatingAdmissionPolicy
|
||||||
|
metadata:
|
||||||
|
name: dexorder-agent-image-policy
|
||||||
|
spec:
|
||||||
|
failurePolicy: Fail
|
||||||
|
matchConstraints:
|
||||||
|
namespaceSelector:
|
||||||
|
matchLabels:
|
||||||
|
dexorder.io/type: agents
|
||||||
|
resourceRules:
|
||||||
|
- apiGroups: ["apps"]
|
||||||
|
apiVersions: ["v1"]
|
||||||
|
resources: ["deployments"]
|
||||||
|
operations: ["CREATE", "UPDATE"]
|
||||||
|
validations:
|
||||||
|
# Allow local dev images in addition to production registry
|
||||||
|
- expression: |
|
||||||
|
object.spec.template.spec.containers.all(c,
|
||||||
|
c.image.startsWith('ghcr.io/dexorder/agent:') ||
|
||||||
|
c.image.startsWith('ghcr.io/dexorder/agent-') ||
|
||||||
|
c.image.startsWith('localhost:5000/dexorder/agent') ||
|
||||||
|
c.image.startsWith('dexorder/agent'))
|
||||||
|
message: "Only approved dexorder agent images are allowed"
|
||||||
|
reason: Forbidden
|
||||||
|
|
||||||
|
# No privileged containers
|
||||||
|
- expression: |
|
||||||
|
object.spec.template.spec.containers.all(c,
|
||||||
|
!has(c.securityContext) ||
|
||||||
|
!has(c.securityContext.privileged) ||
|
||||||
|
c.securityContext.privileged == false)
|
||||||
|
message: "Privileged containers are not allowed"
|
||||||
|
reason: Forbidden
|
||||||
|
|
||||||
|
# No hostPath volumes
|
||||||
|
- expression: |
|
||||||
|
!has(object.spec.template.spec.volumes) ||
|
||||||
|
object.spec.template.spec.volumes.all(v,
|
||||||
|
!has(v.hostPath))
|
||||||
|
message: "hostPath volumes are not allowed"
|
||||||
|
reason: Forbidden
|
||||||
|
|
||||||
|
# No hostNetwork
|
||||||
|
- expression: |
|
||||||
|
!has(object.spec.template.spec.hostNetwork) ||
|
||||||
|
object.spec.template.spec.hostNetwork == false
|
||||||
|
message: "hostNetwork is not allowed"
|
||||||
|
reason: Forbidden
|
||||||
|
|
||||||
|
# No hostPID
|
||||||
|
- expression: |
|
||||||
|
!has(object.spec.template.spec.hostPID) ||
|
||||||
|
object.spec.template.spec.hostPID == false
|
||||||
|
message: "hostPID is not allowed"
|
||||||
|
reason: Forbidden
|
||||||
|
|
||||||
|
# Containers must run as non-root
|
||||||
|
- expression: |
|
||||||
|
object.spec.template.spec.containers.all(c,
|
||||||
|
has(c.securityContext) &&
|
||||||
|
has(c.securityContext.runAsNonRoot) &&
|
||||||
|
c.securityContext.runAsNonRoot == true)
|
||||||
|
message: "Containers must run as non-root"
|
||||||
|
reason: Forbidden
|
||||||
|
|
||||||
|
# Must drop all capabilities
|
||||||
|
- expression: |
|
||||||
|
object.spec.template.spec.containers.all(c,
|
||||||
|
has(c.securityContext) &&
|
||||||
|
has(c.securityContext.capabilities) &&
|
||||||
|
has(c.securityContext.capabilities.drop) &&
|
||||||
|
c.securityContext.capabilities.drop.exists(cap, cap == 'ALL'))
|
||||||
|
message: "Containers must drop all capabilities"
|
||||||
|
reason: Forbidden
|
||||||
|
|
||||||
|
# Read-only root filesystem
|
||||||
|
- expression: |
|
||||||
|
object.spec.template.spec.containers.all(c,
|
||||||
|
has(c.securityContext) &&
|
||||||
|
has(c.securityContext.readOnlyRootFilesystem) &&
|
||||||
|
c.securityContext.readOnlyRootFilesystem == true)
|
||||||
|
message: "Containers must have read-only root filesystem"
|
||||||
|
reason: Forbidden
|
||||||
|
|
||||||
|
# Resource limits must be set
|
||||||
|
- expression: |
|
||||||
|
object.spec.template.spec.containers.all(c,
|
||||||
|
has(c.resources) &&
|
||||||
|
has(c.resources.limits) &&
|
||||||
|
has(c.resources.limits.memory) &&
|
||||||
|
has(c.resources.limits.cpu))
|
||||||
|
message: "Containers must have resource limits set"
|
||||||
|
reason: Forbidden
|
||||||
19
deploy/k8s/dev/agent-quotas-patch.yaml
Normal file
19
deploy/k8s/dev/agent-quotas-patch.yaml
Normal file
@@ -0,0 +1,19 @@
|
|||||||
|
# Dev/minikube resource quota overrides
|
||||||
|
# Smaller limits appropriate for local development
|
||||||
|
---
|
||||||
|
apiVersion: v1
|
||||||
|
kind: ResourceQuota
|
||||||
|
metadata:
|
||||||
|
name: agent-quota
|
||||||
|
namespace: dexorder-agents
|
||||||
|
spec:
|
||||||
|
hard:
|
||||||
|
# Reduced for minikube
|
||||||
|
requests.cpu: "4"
|
||||||
|
requests.memory: "8Gi"
|
||||||
|
limits.cpu: "8"
|
||||||
|
limits.memory: "16Gi"
|
||||||
|
pods: "20"
|
||||||
|
persistentvolumeclaims: "20"
|
||||||
|
services: "20"
|
||||||
|
requests.storage: "50Gi"
|
||||||
@@ -1,16 +1,20 @@
|
|||||||
apiVersion: kustomize.config.k8s.io/v1beta1
|
apiVersion: kustomize.config.k8s.io/v1beta1
|
||||||
kind: Kustomization
|
kind: Kustomization
|
||||||
|
|
||||||
namespace: default
|
# Note: namespaces are defined in base; workloads go to dexorder-system
|
||||||
|
namespace: dexorder-system
|
||||||
|
|
||||||
# Base resources
|
# Base resources (includes security policies)
|
||||||
resources:
|
resources:
|
||||||
- ../base
|
- ../base
|
||||||
- infrastructure.yaml
|
- infrastructure.yaml
|
||||||
|
|
||||||
# No patches needed currently
|
# Dev-specific patches
|
||||||
patches: []
|
patches:
|
||||||
# ingress-dev.yaml - removed until we have services to expose
|
# Reduced resource quotas for minikube
|
||||||
|
- path: agent-quotas-patch.yaml
|
||||||
|
# Allow local registry images
|
||||||
|
- path: admission-policy-patch.yaml
|
||||||
|
|
||||||
# ConfigMaps for service configs
|
# ConfigMaps for service configs
|
||||||
configMapGenerator:
|
configMapGenerator:
|
||||||
|
|||||||
@@ -1,9 +1,10 @@
|
|||||||
apiVersion: kustomize.config.k8s.io/v1beta1
|
apiVersion: kustomize.config.k8s.io/v1beta1
|
||||||
kind: Kustomization
|
kind: Kustomization
|
||||||
|
|
||||||
namespace: default
|
# Note: namespaces are defined in base; workloads go to dexorder-system
|
||||||
|
namespace: dexorder-system
|
||||||
|
|
||||||
# Base resources (backend, web, ingress, init/gVisor)
|
# Base resources (includes all security policies)
|
||||||
resources:
|
resources:
|
||||||
- ../base
|
- ../base
|
||||||
|
|
||||||
@@ -38,3 +39,10 @@ images:
|
|||||||
newTag: latest
|
newTag: latest
|
||||||
- name: dexorder/ai-web
|
- name: dexorder/ai-web
|
||||||
newTag: latest
|
newTag: latest
|
||||||
|
- name: ghcr.io/dexorder/gateway
|
||||||
|
newTag: latest
|
||||||
|
- name: lifecycle-sidecar
|
||||||
|
newName: ghcr.io/dexorder/lifecycle-sidecar
|
||||||
|
newTag: latest
|
||||||
|
- name: ghcr.io/dexorder/agent
|
||||||
|
newTag: latest
|
||||||
|
|||||||
21
doc/agent_harness_flow.md
Normal file
21
doc/agent_harness_flow.md
Normal file
@@ -0,0 +1,21 @@
|
|||||||
|
┌─────────────────────────────────────────────────┐
|
||||||
|
│ Agent Harness (your servers) │
|
||||||
|
│ │
|
||||||
|
│ on_message(user_id, message): │
|
||||||
|
│ 1. Look up user's MCP endpoint from Postgres │
|
||||||
|
│ 2. mcp.call("get_context_summary") │
|
||||||
|
│ 3. mcp.call("get_conversation_history", 20) │
|
||||||
|
│ 4. Build prompt: │
|
||||||
|
│ system = BASE_PROMPT │
|
||||||
|
│ + context_summary │
|
||||||
|
│ + user_agent_prompt (from MCP) │
|
||||||
|
│ messages = history + new message │
|
||||||
|
│ 5. LLM call (your API key) │
|
||||||
|
│ 6. While LLM wants tool calls: │
|
||||||
|
│ - Platform tools → handle locally │
|
||||||
|
│ - User tools → proxy to MCP │
|
||||||
|
│ - LLM call again with results │
|
||||||
|
│ 7. mcp.call("save_message", ...) │
|
||||||
|
│ 8. Return response to user │
|
||||||
|
│ │
|
||||||
|
└─────────────────────────────────────────────────┘
|
||||||
@@ -1,9 +1,11 @@
|
|||||||
Generally use skills instead of subagents, except for the analysis subagent.
|
Generally use skills instead of subagents, except for the analysis subagent.
|
||||||
|
|
||||||
## User-specific files
|
## User-specific files and tools
|
||||||
* Indicators
|
* Indicators
|
||||||
* Strategies
|
* Strategies
|
||||||
* Watchlists
|
* Watchlists
|
||||||
* Preferences
|
* Preferences
|
||||||
* Trading style
|
* Trading style
|
||||||
* Charting / colors
|
* Charting / colors
|
||||||
|
* Executors (really just sub-strategies)
|
||||||
|
* tactical-level order generators e.g. TWAP, iceberg, etc.
|
||||||
|
|||||||
@@ -1,18 +0,0 @@
|
|||||||
This file describes all the configuration options used by all components. All configuration is divided into regular config and secrets, and k8s will mount either or both as a yaml file accessible to the process.
|
|
||||||
|
|
||||||
# Configuration
|
|
||||||
|
|
||||||
* `flink_hostname`
|
|
||||||
* ... various zmq ports for flink ...
|
|
||||||
* `iceberg_catalog_hostname`
|
|
||||||
* `iceberg_catalog_port`
|
|
||||||
* `iceberg_catalog_database`
|
|
||||||
* etc
|
|
||||||
|
|
||||||
|
|
||||||
# Secrets
|
|
||||||
|
|
||||||
* `iceberg_catalog_username`
|
|
||||||
* `iceberg_catalog_password`
|
|
||||||
* etc.
|
|
||||||
|
|
||||||
313
doc/container_lifecycle_management.md
Normal file
313
doc/container_lifecycle_management.md
Normal file
@@ -0,0 +1,313 @@
|
|||||||
|
# Container Lifecycle Management
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
User agent containers self-manage their lifecycle to optimize resource usage. Containers automatically shut down when idle (no triggers + no recent activity) and clean themselves up using a lifecycle sidecar.
|
||||||
|
|
||||||
|
## Architecture
|
||||||
|
|
||||||
|
```
|
||||||
|
┌──────────────────────────────────────────────────────────┐
|
||||||
|
│ Agent Pod │
|
||||||
|
│ ┌───────────────────┐ ┌──────────────────────┐ │
|
||||||
|
│ │ Agent Container │ │ Lifecycle Sidecar │ │
|
||||||
|
│ │ ─────────────── │ │ ────────────────── │ │
|
||||||
|
│ │ │ │ │ │
|
||||||
|
│ │ Lifecycle Manager │ │ Watches exit code │ │
|
||||||
|
│ │ - Track activity │ │ - Detects exit 42 │ │
|
||||||
|
│ │ - Track triggers │ │ - Calls k8s API │ │
|
||||||
|
│ │ - Exit 42 if idle │ │ - Deletes deployment │ │
|
||||||
|
│ └───────────────────┘ └──────────────────────┘ │
|
||||||
|
│ │ │ │
|
||||||
|
│ │ writes exit_code │ │
|
||||||
|
│ └────►/var/run/agent/exit_code │
|
||||||
|
│ │ │
|
||||||
|
└───────────────────────────────────────┼──────────────────┘
|
||||||
|
│
|
||||||
|
▼ k8s API (RBAC)
|
||||||
|
┌─────────────────────┐
|
||||||
|
│ Delete Deployment │
|
||||||
|
│ Delete PVC (if anon)│
|
||||||
|
└─────────────────────┘
|
||||||
|
```
|
||||||
|
|
||||||
|
## Components
|
||||||
|
|
||||||
|
### 1. Lifecycle Manager (Python)
|
||||||
|
|
||||||
|
**Location**: `client-py/dexorder/lifecycle_manager.py`
|
||||||
|
|
||||||
|
Runs inside the agent container and tracks:
|
||||||
|
- **Activity**: MCP tool/resource/prompt calls reset the idle timer
|
||||||
|
- **Triggers**: Data subscriptions, CEP patterns, etc.
|
||||||
|
- **Idle state**: No triggers + idle timeout exceeded
|
||||||
|
|
||||||
|
**Configuration** (via environment variables):
|
||||||
|
- `IDLE_TIMEOUT_MINUTES`: Minutes before shutdown (default: 15)
|
||||||
|
- `IDLE_CHECK_INTERVAL_SECONDS`: Check frequency (default: 60)
|
||||||
|
- `ENABLE_IDLE_SHUTDOWN`: Enable/disable shutdown (default: true)
|
||||||
|
|
||||||
|
**Usage in agent code**:
|
||||||
|
```python
|
||||||
|
from dexorder.lifecycle_manager import get_lifecycle_manager
|
||||||
|
|
||||||
|
# On startup
|
||||||
|
manager = get_lifecycle_manager()
|
||||||
|
await manager.start()
|
||||||
|
|
||||||
|
# On MCP calls (tool/resource/prompt)
|
||||||
|
manager.record_activity()
|
||||||
|
|
||||||
|
# When triggers change
|
||||||
|
manager.add_trigger("data_sub_BTC_USDT")
|
||||||
|
manager.remove_trigger("data_sub_BTC_USDT")
|
||||||
|
|
||||||
|
# Or batch update
|
||||||
|
manager.update_triggers({"trigger_1", "trigger_2"})
|
||||||
|
```
|
||||||
|
|
||||||
|
**Exit behavior**:
|
||||||
|
- Idle shutdown: Exit with code `42`
|
||||||
|
- Signal (SIGTERM/SIGINT): Exit with code `0` (allows restart)
|
||||||
|
- Errors/crashes: Exit with error code (allows restart)
|
||||||
|
|
||||||
|
### 2. Lifecycle Sidecar (Go)
|
||||||
|
|
||||||
|
**Location**: `lifecycle-sidecar/`
|
||||||
|
|
||||||
|
Runs alongside the agent container with shared PID namespace. Monitors the main container process and:
|
||||||
|
- On exit code `42`: Deletes deployment (and PVC if anonymous user)
|
||||||
|
- On any other exit code: Exits with same code (k8s restarts pod)
|
||||||
|
|
||||||
|
**Configuration** (via environment, injected by downward API):
|
||||||
|
- `NAMESPACE`: Pod's namespace
|
||||||
|
- `DEPLOYMENT_NAME`: Deployment name (from pod label)
|
||||||
|
- `USER_TYPE`: License tier (`anonymous`, `free`, `paid`, `enterprise`)
|
||||||
|
- `MAIN_CONTAINER_PID`: PID of main container (default: 1)
|
||||||
|
|
||||||
|
**RBAC**: Has permission to delete deployments and PVCs **only in dexorder-agents namespace**. Cannot delete other deployments due to:
|
||||||
|
1. Only knows its own deployment name (from env)
|
||||||
|
2. RBAC scoped to namespace
|
||||||
|
3. No cross-pod communication
|
||||||
|
|
||||||
|
### 3. Gateway (TypeScript)
|
||||||
|
|
||||||
|
**Location**: `gateway/src/harness/agent-harness.ts`
|
||||||
|
|
||||||
|
Creates agent deployments when users connect. Has permissions to:
|
||||||
|
- ✅ Create deployments, services, PVCs
|
||||||
|
- ✅ Read pod status and logs
|
||||||
|
- ✅ Update deployments (e.g., resource limits)
|
||||||
|
- ❌ Delete deployments (handled by sidecar)
|
||||||
|
- ❌ Exec into pods
|
||||||
|
- ❌ Access secrets
|
||||||
|
|
||||||
|
## Lifecycle States
|
||||||
|
|
||||||
|
```
|
||||||
|
┌─────────────┐
|
||||||
|
│ CREATED │ ← Gateway creates deployment
|
||||||
|
└──────┬──────┘
|
||||||
|
│
|
||||||
|
▼
|
||||||
|
┌─────────────┐
|
||||||
|
│ RUNNING │ ← User interacts, has triggers
|
||||||
|
└──────┬──────┘
|
||||||
|
│
|
||||||
|
▼
|
||||||
|
┌─────────────┐
|
||||||
|
│ IDLE │ ← No triggers + timeout exceeded
|
||||||
|
└──────┬──────┘
|
||||||
|
│
|
||||||
|
▼
|
||||||
|
┌─────────────┐
|
||||||
|
│ SHUTDOWN │ ← Exit code 42
|
||||||
|
└──────┬──────┘
|
||||||
|
│
|
||||||
|
▼
|
||||||
|
┌─────────────┐
|
||||||
|
│ DELETED │ ← Sidecar deletes deployment
|
||||||
|
└─────────────┘
|
||||||
|
```
|
||||||
|
|
||||||
|
## Idle Detection Logic
|
||||||
|
|
||||||
|
Container is **IDLE** when:
|
||||||
|
1. `active_triggers.isEmpty()` AND
|
||||||
|
2. `(now - last_activity) > idle_timeout`
|
||||||
|
|
||||||
|
Container is **ACTIVE** when:
|
||||||
|
1. Has any active triggers (data subscriptions, CEP patterns, etc.) OR
|
||||||
|
2. Recent user activity (MCP calls within timeout)
|
||||||
|
|
||||||
|
## Cleanup Policies by License Tier
|
||||||
|
|
||||||
|
| User Type | Idle Timeout | PVC Policy | Notes |
|
||||||
|
|--------------|--------------|------------|-------|
|
||||||
|
| Anonymous | 15 minutes | Delete | Ephemeral, no data retention |
|
||||||
|
| Free | 15 minutes | Retain | Can resume session |
|
||||||
|
| Paid | 60 minutes | Retain | Longer grace period |
|
||||||
|
| Enterprise | No shutdown | Retain | Always-on containers |
|
||||||
|
|
||||||
|
Configured via `USER_TYPE` env var in deployment.
|
||||||
|
|
||||||
|
## Security
|
||||||
|
|
||||||
|
### Principle of Least Privilege
|
||||||
|
|
||||||
|
**Gateway**:
|
||||||
|
- Can create agent resources
|
||||||
|
- Cannot delete agent resources
|
||||||
|
- Cannot access other namespaces
|
||||||
|
- Cannot exec into pods
|
||||||
|
|
||||||
|
**Lifecycle Sidecar**:
|
||||||
|
- Can delete its own deployment only
|
||||||
|
- Cannot delete other deployments
|
||||||
|
- Scoped to dexorder-agents namespace
|
||||||
|
- No exec, no secrets access
|
||||||
|
|
||||||
|
### Admission Control
|
||||||
|
|
||||||
|
All deployments in `dexorder-agents` namespace are subject to:
|
||||||
|
- Image allowlist (only approved images)
|
||||||
|
- Security context enforcement (non-root, drop caps, read-only rootfs)
|
||||||
|
- Resource limits required
|
||||||
|
- PodSecurity standards (restricted profile)
|
||||||
|
|
||||||
|
See `deploy/k8s/base/admission-policy.yaml`
|
||||||
|
|
||||||
|
### Network Isolation
|
||||||
|
|
||||||
|
Agents are network-isolated via NetworkPolicy:
|
||||||
|
- Can connect to gateway (MCP)
|
||||||
|
- Can connect to Redpanda (data streams)
|
||||||
|
- Can make outbound HTTPS (exchanges, LLM APIs)
|
||||||
|
- Cannot access k8s API
|
||||||
|
- Cannot access system namespace
|
||||||
|
- Cannot access other agent pods
|
||||||
|
|
||||||
|
See `deploy/k8s/base/network-policies.yaml`
|
||||||
|
|
||||||
|
## Deployment
|
||||||
|
|
||||||
|
### 1. Apply Security Policies
|
||||||
|
|
||||||
|
```bash
|
||||||
|
kubectl apply -k deploy/k8s/dev # or prod
|
||||||
|
```
|
||||||
|
|
||||||
|
This creates:
|
||||||
|
- Namespaces (`dexorder-system`, `dexorder-agents`)
|
||||||
|
- RBAC (gateway, lifecycle sidecar)
|
||||||
|
- Admission policies
|
||||||
|
- Network policies
|
||||||
|
- Resource quotas
|
||||||
|
|
||||||
|
### 2. Build and Push Lifecycle Sidecar
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd lifecycle-sidecar
|
||||||
|
docker build -t ghcr.io/dexorder/lifecycle-sidecar:latest .
|
||||||
|
docker push ghcr.io/dexorder/lifecycle-sidecar:latest
|
||||||
|
```
|
||||||
|
|
||||||
|
### 3. Gateway Creates Agent Deployments
|
||||||
|
|
||||||
|
When a user connects, the gateway creates:
|
||||||
|
- Deployment with agent + sidecar
|
||||||
|
- PVC for persistent data
|
||||||
|
- Service for MCP endpoint
|
||||||
|
|
||||||
|
See `deploy/k8s/base/agent-deployment-example.yaml` for template.
|
||||||
|
|
||||||
|
## Testing
|
||||||
|
|
||||||
|
### Test Lifecycle Manager Locally
|
||||||
|
|
||||||
|
```python
|
||||||
|
from dexorder.lifecycle_manager import LifecycleManager
|
||||||
|
|
||||||
|
# Disable actual shutdown for testing
|
||||||
|
manager = LifecycleManager(
|
||||||
|
idle_timeout_minutes=1,
|
||||||
|
check_interval_seconds=10,
|
||||||
|
enable_shutdown=False # Only log, don't exit
|
||||||
|
)
|
||||||
|
|
||||||
|
await manager.start()
|
||||||
|
|
||||||
|
# Simulate activity
|
||||||
|
manager.record_activity()
|
||||||
|
|
||||||
|
# Simulate triggers
|
||||||
|
manager.add_trigger("test_trigger")
|
||||||
|
await asyncio.sleep(70) # Wait past timeout
|
||||||
|
manager.remove_trigger("test_trigger")
|
||||||
|
await asyncio.sleep(70) # Should detect idle
|
||||||
|
|
||||||
|
await manager.stop()
|
||||||
|
```
|
||||||
|
|
||||||
|
### Test Sidecar Locally
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Build
|
||||||
|
cd lifecycle-sidecar
|
||||||
|
go build -o lifecycle-sidecar main.go
|
||||||
|
|
||||||
|
# Run (requires k8s config)
|
||||||
|
export NAMESPACE=dexorder-agents
|
||||||
|
export DEPLOYMENT_NAME=agent-test
|
||||||
|
export USER_TYPE=free
|
||||||
|
./lifecycle-sidecar
|
||||||
|
```
|
||||||
|
|
||||||
|
### Integration Test
|
||||||
|
|
||||||
|
1. Deploy test agent with sidecar
|
||||||
|
2. Verify agent starts and is healthy
|
||||||
|
3. Stop sending MCP calls and remove all triggers
|
||||||
|
4. Wait for idle timeout + check interval
|
||||||
|
5. Verify deployment is deleted
|
||||||
|
|
||||||
|
## Troubleshooting
|
||||||
|
|
||||||
|
### Container not shutting down when idle
|
||||||
|
|
||||||
|
Check logs:
|
||||||
|
```bash
|
||||||
|
kubectl logs -n dexorder-agents agent-user-abc123 -c agent
|
||||||
|
```
|
||||||
|
|
||||||
|
Verify:
|
||||||
|
- `ENABLE_IDLE_SHUTDOWN=true`
|
||||||
|
- No active triggers: `manager.active_triggers` should be empty
|
||||||
|
- Idle timeout exceeded
|
||||||
|
|
||||||
|
### Sidecar not deleting deployment
|
||||||
|
|
||||||
|
Check sidecar logs:
|
||||||
|
```bash
|
||||||
|
kubectl logs -n dexorder-agents agent-user-abc123 -c lifecycle-sidecar
|
||||||
|
```
|
||||||
|
|
||||||
|
Verify:
|
||||||
|
- Exit code file exists: `/var/run/agent/exit_code` contains `42`
|
||||||
|
- RBAC permissions: `kubectl auth can-i delete deployments --as=system:serviceaccount:dexorder-agents:agent-lifecycle -n dexorder-agents`
|
||||||
|
- Deployment name matches: Check `DEPLOYMENT_NAME` env var
|
||||||
|
|
||||||
|
### Gateway can't create deployments
|
||||||
|
|
||||||
|
Check gateway logs and verify:
|
||||||
|
- ServiceAccount exists: `kubectl get sa gateway -n dexorder-system`
|
||||||
|
- RoleBinding exists: `kubectl get rolebinding gateway-agent-creator -n dexorder-agents`
|
||||||
|
- Admission policy allows image: Check image name matches allowlist in `admission-policy.yaml`
|
||||||
|
|
||||||
|
## Future Enhancements
|
||||||
|
|
||||||
|
1. **Graceful shutdown notifications**: Warn users before shutdown via websocket
|
||||||
|
2. **Predictive scaling**: Keep frequently-used containers warm
|
||||||
|
3. **Tiered storage**: Move old PVCs to cheaper storage class
|
||||||
|
4. **Metrics**: Expose lifecycle metrics (idle rate, shutdown count, etc.)
|
||||||
|
5. **Cost allocation**: Track resource usage per user/license tier
|
||||||
286
doc/gateway_container_creation.md
Normal file
286
doc/gateway_container_creation.md
Normal file
@@ -0,0 +1,286 @@
|
|||||||
|
# Gateway Container Creation
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
The gateway automatically provisions user agent containers when users authenticate. This ensures each user has their own isolated environment running their MCP server with persistent storage.
|
||||||
|
|
||||||
|
## Authentication Flow with Container Creation
|
||||||
|
|
||||||
|
```
|
||||||
|
User connects (WebSocket/Telegram)
|
||||||
|
↓
|
||||||
|
Send "Authenticating..." status
|
||||||
|
↓
|
||||||
|
Verify token/channel link
|
||||||
|
↓
|
||||||
|
Lookup user license from DB
|
||||||
|
↓
|
||||||
|
Send "Starting workspace..." status
|
||||||
|
↓
|
||||||
|
┌────────────────────────────────────┐
|
||||||
|
│ ContainerManager.ensureRunning() │
|
||||||
|
│ ┌──────────────────────────────┐ │
|
||||||
|
│ │ Check if deployment exists │ │
|
||||||
|
│ └──────────────────────────────┘ │
|
||||||
|
│ ↓ │
|
||||||
|
│ Does it exist? │
|
||||||
|
│ ↙ ↘ │
|
||||||
|
│ Yes No │
|
||||||
|
│ │ │ │
|
||||||
|
│ │ ┌──────────────────┐ │
|
||||||
|
│ │ │ Create deployment│ │
|
||||||
|
│ │ │ Create PVC │ │
|
||||||
|
│ │ │ Create service │ │
|
||||||
|
│ │ └──────────────────┘ │
|
||||||
|
│ │ │ │
|
||||||
|
│ └────────────┘ │
|
||||||
|
│ ↓ │
|
||||||
|
│ Wait for deployment ready │
|
||||||
|
│ (polls every 2s, timeout 2min) │
|
||||||
|
│ ↓ │
|
||||||
|
│ Compute MCP endpoint URL │
|
||||||
|
│ (internal k8s service DNS) │
|
||||||
|
└────────────────────────────────────┘
|
||||||
|
↓
|
||||||
|
Update license.mcpServerUrl
|
||||||
|
↓
|
||||||
|
Send "Connected" status
|
||||||
|
↓
|
||||||
|
Initialize AgentHarness
|
||||||
|
↓
|
||||||
|
Connect to user's MCP server
|
||||||
|
↓
|
||||||
|
Ready for messages
|
||||||
|
```
|
||||||
|
|
||||||
|
## Container Naming Convention
|
||||||
|
|
||||||
|
All resources follow a consistent naming pattern based on `userId`:
|
||||||
|
|
||||||
|
```typescript
|
||||||
|
userId: "user-abc123"
|
||||||
|
↓
|
||||||
|
deploymentName: "agent-user-abc123"
|
||||||
|
serviceName: "agent-user-abc123"
|
||||||
|
pvcName: "agent-user-abc123-data"
|
||||||
|
mcpEndpoint: "http://agent-user-abc123.dexorder-agents.svc.cluster.local:3000"
|
||||||
|
```
|
||||||
|
|
||||||
|
User IDs are sanitized to be Kubernetes-compliant (lowercase alphanumeric + hyphens).
|
||||||
|
|
||||||
|
## Templates by License Tier
|
||||||
|
|
||||||
|
Templates are located in `gateway/src/k8s/templates/`:
|
||||||
|
- `free-tier.yaml`
|
||||||
|
- `pro-tier.yaml`
|
||||||
|
- `enterprise-tier.yaml`
|
||||||
|
|
||||||
|
### Variable Substitution
|
||||||
|
|
||||||
|
Templates use simple string replacement:
|
||||||
|
- `{{userId}}` - User ID
|
||||||
|
- `{{deploymentName}}` - Computed deployment name
|
||||||
|
- `{{serviceName}}` - Computed service name
|
||||||
|
- `{{pvcName}}` - Computed PVC name
|
||||||
|
- `{{agentImage}}` - Agent container image (from env)
|
||||||
|
- `{{sidecarImage}}` - Lifecycle sidecar image (from env)
|
||||||
|
- `{{storageClass}}` - Kubernetes storage class (from env)
|
||||||
|
|
||||||
|
### Resource Limits
|
||||||
|
|
||||||
|
| Tier | Memory Request | Memory Limit | CPU Request | CPU Limit | Storage | Idle Timeout |
|
||||||
|
|------|----------------|--------------|-------------|-----------|---------|--------------|
|
||||||
|
| **Free** | 256Mi | 512Mi | 100m | 500m | 1Gi | 15min |
|
||||||
|
| **Pro** | 512Mi | 2Gi | 250m | 2000m | 10Gi | 60min |
|
||||||
|
| **Enterprise** | 1Gi | 4Gi | 500m | 4000m | 50Gi | Never (shutdown disabled) |
|
||||||
|
|
||||||
|
## Components
|
||||||
|
|
||||||
|
### KubernetesClient (`gateway/src/k8s/client.ts`)
|
||||||
|
|
||||||
|
Low-level k8s API wrapper:
|
||||||
|
- `deploymentExists(name)` - Check if deployment exists
|
||||||
|
- `createAgentDeployment(spec)` - Create deployment/service/PVC from template
|
||||||
|
- `waitForDeploymentReady(name, timeout)` - Poll until ready
|
||||||
|
- `getServiceEndpoint(name)` - Get service URL
|
||||||
|
- `deleteAgentDeployment(userId)` - Cleanup (for testing)
|
||||||
|
|
||||||
|
Static helpers:
|
||||||
|
- `getDeploymentName(userId)` - Generate deployment name
|
||||||
|
- `getServiceName(userId)` - Generate service name
|
||||||
|
- `getPvcName(userId)` - Generate PVC name
|
||||||
|
- `getMcpEndpoint(userId, namespace)` - Compute internal service URL
|
||||||
|
|
||||||
|
### ContainerManager (`gateway/src/k8s/container-manager.ts`)
|
||||||
|
|
||||||
|
High-level orchestration:
|
||||||
|
- `ensureContainerRunning(userId, license)` - Main entry point
|
||||||
|
- Returns: `{ mcpEndpoint, wasCreated }`
|
||||||
|
- Creates deployment if missing
|
||||||
|
- Waits for ready state
|
||||||
|
- Returns endpoint URL
|
||||||
|
- `getContainerStatus(userId)` - Check status without creating
|
||||||
|
- `deleteContainer(userId)` - Manual cleanup
|
||||||
|
|
||||||
|
### Authenticator (`gateway/src/auth/authenticator.ts`)
|
||||||
|
|
||||||
|
Updated to call container manager:
|
||||||
|
- `authenticateWebSocket()` - Calls `ensureContainerRunning()` before returning `AuthContext`
|
||||||
|
- `authenticateTelegram()` - Same for Telegram webhooks
|
||||||
|
|
||||||
|
### WebSocketHandler (`gateway/src/channels/websocket-handler.ts`)
|
||||||
|
|
||||||
|
Multi-phase connection protocol:
|
||||||
|
1. Send `{type: 'status', status: 'authenticating'}`
|
||||||
|
2. Authenticate (may take 30-120s if creating container)
|
||||||
|
3. Send `{type: 'status', status: 'initializing'}`
|
||||||
|
4. Initialize agent harness
|
||||||
|
5. Send `{type: 'connected', ...}`
|
||||||
|
|
||||||
|
This gives the client visibility into the startup process.
|
||||||
|
|
||||||
|
## Configuration
|
||||||
|
|
||||||
|
Environment variables:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Kubernetes
|
||||||
|
KUBERNETES_NAMESPACE=dexorder-agents
|
||||||
|
KUBERNETES_IN_CLUSTER=true # false for local dev
|
||||||
|
KUBERNETES_CONTEXT=minikube # for local dev only
|
||||||
|
|
||||||
|
# Container images
|
||||||
|
AGENT_IMAGE=ghcr.io/dexorder/agent:latest
|
||||||
|
SIDECAR_IMAGE=ghcr.io/dexorder/lifecycle-sidecar:latest
|
||||||
|
|
||||||
|
# Storage
|
||||||
|
AGENT_STORAGE_CLASS=standard
|
||||||
|
```
|
||||||
|
|
||||||
|
## Security
|
||||||
|
|
||||||
|
The gateway uses a restricted ServiceAccount with RBAC:
|
||||||
|
|
||||||
|
**Can do:**
|
||||||
|
- ✅ Create deployments in `dexorder-agents` namespace
|
||||||
|
- ✅ Create services in `dexorder-agents` namespace
|
||||||
|
- ✅ Create PVCs in `dexorder-agents` namespace
|
||||||
|
- ✅ Read pod status and logs (debugging)
|
||||||
|
- ✅ Update deployments (future: resource scaling)
|
||||||
|
|
||||||
|
**Cannot do:**
|
||||||
|
- ❌ Delete deployments (handled by lifecycle sidecar)
|
||||||
|
- ❌ Delete PVCs (handled by lifecycle sidecar)
|
||||||
|
- ❌ Exec into pods
|
||||||
|
- ❌ Access secrets or configmaps
|
||||||
|
- ❌ Create resources in other namespaces
|
||||||
|
- ❌ Access Kubernetes API from agent containers (blocked by NetworkPolicy)
|
||||||
|
|
||||||
|
See `deploy/k8s/base/gateway-rbac.yaml` for full configuration.
|
||||||
|
|
||||||
|
## Lifecycle
|
||||||
|
|
||||||
|
### Container Creation (Gateway)
|
||||||
|
- User authenticates
|
||||||
|
- Gateway checks if deployment exists
|
||||||
|
- If missing, creates from template
|
||||||
|
- Waits for ready (2min timeout)
|
||||||
|
- Returns MCP endpoint
|
||||||
|
|
||||||
|
### Container Deletion (Lifecycle Sidecar)
|
||||||
|
- Container tracks activity and triggers
|
||||||
|
- When idle (no triggers + timeout), exits with code 42
|
||||||
|
- Sidecar detects exit code 42
|
||||||
|
- Sidecar deletes deployment + optional PVC via k8s API
|
||||||
|
- Gateway creates fresh container on next authentication
|
||||||
|
|
||||||
|
See `doc/container_lifecycle_management.md` for full lifecycle details.
|
||||||
|
|
||||||
|
## Error Handling
|
||||||
|
|
||||||
|
| Error | Gateway Action | User Experience |
|
||||||
|
|-------|----------------|-----------------|
|
||||||
|
| Deployment creation fails | Log error, return auth failure | "Authentication failed" |
|
||||||
|
| Wait timeout (image pull, etc.) | Log warning, return 503 | "Service unavailable, retry" |
|
||||||
|
| Service not found | Retry with backoff | Transparent retry |
|
||||||
|
| MCP connection fails | Return error | "Failed to connect to workspace" |
|
||||||
|
| Existing deployment not ready | Wait 30s, continue if still not ready | May connect to partially-ready container |
|
||||||
|
|
||||||
|
## Local Development
|
||||||
|
|
||||||
|
For local development (outside k8s):
|
||||||
|
|
||||||
|
1. Start minikube:
|
||||||
|
```bash
|
||||||
|
minikube start
|
||||||
|
minikube addons enable storage-provisioner
|
||||||
|
```
|
||||||
|
|
||||||
|
2. Apply security policies:
|
||||||
|
```bash
|
||||||
|
kubectl apply -k deploy/k8s/dev
|
||||||
|
```
|
||||||
|
|
||||||
|
3. Configure gateway for local k8s:
|
||||||
|
```bash
|
||||||
|
# .env
|
||||||
|
KUBERNETES_IN_CLUSTER=false
|
||||||
|
KUBERNETES_CONTEXT=minikube
|
||||||
|
KUBERNETES_NAMESPACE=dexorder-agents
|
||||||
|
```
|
||||||
|
|
||||||
|
4. Run gateway:
|
||||||
|
```bash
|
||||||
|
cd gateway
|
||||||
|
npm run dev
|
||||||
|
```
|
||||||
|
|
||||||
|
5. Connect via WebSocket:
|
||||||
|
```bash
|
||||||
|
wscat -c "ws://localhost:3000/ws/chat" -H "Authorization: Bearer your-jwt"
|
||||||
|
```
|
||||||
|
|
||||||
|
The gateway will create deployments in minikube. View with:
|
||||||
|
```bash
|
||||||
|
kubectl get deployments -n dexorder-agents
|
||||||
|
kubectl get pods -n dexorder-agents
|
||||||
|
kubectl logs -n dexorder-agents agent-user-abc123 -c agent
|
||||||
|
```
|
||||||
|
|
||||||
|
## Production Deployment
|
||||||
|
|
||||||
|
1. Build and push gateway image:
|
||||||
|
```bash
|
||||||
|
cd gateway
|
||||||
|
docker build -t ghcr.io/dexorder/gateway:latest .
|
||||||
|
docker push ghcr.io/dexorder/gateway:latest
|
||||||
|
```
|
||||||
|
|
||||||
|
2. Deploy to k8s:
|
||||||
|
```bash
|
||||||
|
kubectl apply -k deploy/k8s/prod
|
||||||
|
```
|
||||||
|
|
||||||
|
3. Gateway runs in `dexorder-system` namespace
|
||||||
|
4. Creates agent containers in `dexorder-agents` namespace
|
||||||
|
5. Admission policies enforce image allowlist and security constraints
|
||||||
|
|
||||||
|
## Monitoring
|
||||||
|
|
||||||
|
Useful metrics to track:
|
||||||
|
- Container creation latency (time from auth to ready)
|
||||||
|
- Container creation failure rate
|
||||||
|
- Active containers by license tier
|
||||||
|
- Resource usage per tier
|
||||||
|
- Idle shutdown rate
|
||||||
|
|
||||||
|
These can be exported via Prometheus or logged to monitoring service.
|
||||||
|
|
||||||
|
## Future Enhancements
|
||||||
|
|
||||||
|
1. **Pre-warming**: Create containers for active users before they connect
|
||||||
|
2. **Image updates**: Handle agent image version migrations with user consent
|
||||||
|
3. **Multi-region**: Geo-distributed container placement
|
||||||
|
4. **Cost tracking**: Per-user resource usage and billing
|
||||||
|
5. **Auto-scaling**: Scale down to 0 replicas instead of deletion (faster restart)
|
||||||
|
6. **Container pools**: Shared warm containers for anonymous users
|
||||||
80
doc/m_c_p_client_authentication_modes.md
Normal file
80
doc/m_c_p_client_authentication_modes.md
Normal file
@@ -0,0 +1,80 @@
|
|||||||
|
Mode A: Platform Harness → Hosted Container (internal)
|
||||||
|
Auth: mTLS + platform-signed user claim
|
||||||
|
Network: k8s internal, never hits the internet
|
||||||
|
|
||||||
|
Mode B: Platform Harness → External User Container (remote)
|
||||||
|
Auth: OAuth2 token issued by your platform
|
||||||
|
Network: public internet, TLS required
|
||||||
|
|
||||||
|
Mode C: Third-party MCP Client → External User Container (standalone)
|
||||||
|
Auth: User-managed API key or local-only (no network)
|
||||||
|
Network: localhost or user's own network
|
||||||
|
|
||||||
|
┌──────────────────────────────────────────────────────────┐
|
||||||
|
│ Platform (Postgres) │
|
||||||
|
│ │
|
||||||
|
│ users │
|
||||||
|
│ ├── id, email, password_hash, plan_tier │
|
||||||
|
│ │ │
|
||||||
|
│ containers │
|
||||||
|
│ ├── user_id │
|
||||||
|
│ ├── type: "hosted" | "external" │
|
||||||
|
│ ├── mcp_endpoint: "internal-svc:3100" | "https://..." │
|
||||||
|
│ ├── auth_method: "mtls" | "platform_token" | "api_key" │
|
||||||
|
│ └── public_key_fingerprint (for pinning external certs) │
|
||||||
|
│ │
|
||||||
|
│ api_tokens │
|
||||||
|
│ ├── user_id │
|
||||||
|
│ ├── token_hash │
|
||||||
|
│ ├── scopes: ["mcp:tools", "mcp:resources", "data:read"] │
|
||||||
|
│ ├── expires_at │
|
||||||
|
│ └── issued_for: "platform_harness" | "user_direct" │
|
||||||
|
│ │
|
||||||
|
└──────────────────────────────────────────────────────────┘
|
||||||
|
|
||||||
|
## Mode A
|
||||||
|
|
||||||
|
Harness ──mTLS──▶ k8s Service ──▶ User Container MCP
|
||||||
|
Validates: source is platform namespace
|
||||||
|
Extracts: user_id from forwarded header
|
||||||
|
|
||||||
|
## Mode B
|
||||||
|
|
||||||
|
Registration flow (one-time):
|
||||||
|
1. User provides their MCP endpoint URL in platform settings
|
||||||
|
2. Platform generates a scoped token (JWT, short-lived, auto-refreshed)
|
||||||
|
3. User configures their MCP server to accept tokens signed by your platform
|
||||||
|
4. Platform stores the endpoint + auth method
|
||||||
|
|
||||||
|
Runtime:
|
||||||
|
┌──────────┐ HTTPS + Bearer token ┌────────────────────┐
|
||||||
|
│ Harness │ ─────────────────────────▶ │ External MCP Server│
|
||||||
|
│ │ Authorization: │ │
|
||||||
|
│ │ Bearer <platform_jwt> │ Validates: │
|
||||||
|
│ │ │ - JWT signature │
|
||||||
|
│ │ │ (your public │
|
||||||
|
│ │ │ key, JWKS) │
|
||||||
|
│ │ │ - user_id claim │
|
||||||
|
│ │ │ matches self │
|
||||||
|
│ │ │ - not expired │
|
||||||
|
└──────────┘ └────────────────────┘
|
||||||
|
|
||||||
|
## Mode C
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
# openclaw/config.yaml
|
||||||
|
auth:
|
||||||
|
# For local-only use (Claude Desktop, Cursor, etc via stdio)
|
||||||
|
mode: "local" # no network auth needed
|
||||||
|
|
||||||
|
# OR for remote access
|
||||||
|
mode: "token"
|
||||||
|
tokens:
|
||||||
|
- name: "my-laptop"
|
||||||
|
hash: "sha256:..." # generated by `openclaw token create`
|
||||||
|
|
||||||
|
# OR for platform integration
|
||||||
|
mode: "platform"
|
||||||
|
platform_jwks_url: "https://api.openclaw.io/.well-known/jwks.json"
|
||||||
|
expected_user_id: "user_abc123"
|
||||||
|
```
|
||||||
29
doc/m_c_p_tools_architecture.md
Normal file
29
doc/m_c_p_tools_architecture.md
Normal file
@@ -0,0 +1,29 @@
|
|||||||
|
MCP Tools (User Container)
|
||||||
|
├── Memory
|
||||||
|
│ ├── get_conversation_history(limit)
|
||||||
|
│ ├── save_message(role, content)
|
||||||
|
│ ├── search_memory(query) ← semantic search over past conversations
|
||||||
|
│ └── get_context_summary() ← "who is this user, what do they care about"
|
||||||
|
│
|
||||||
|
├── Strategies & Indicators
|
||||||
|
│ ├── list_strategies()
|
||||||
|
│ ├── read_strategy(name)
|
||||||
|
│ ├── write_strategy(name, code)
|
||||||
|
│ ├── list_indicators()
|
||||||
|
│ ├── read_indicator(name)
|
||||||
|
│ ├── write_indicator(name, code)
|
||||||
|
│ └── run_backtest(strategy, params)
|
||||||
|
│
|
||||||
|
├── Preferences
|
||||||
|
│ ├── get_preferences()
|
||||||
|
│ ├── set_preference(key, value)
|
||||||
|
│ └── get_agent_prompt() ← user's custom system prompt additions
|
||||||
|
│
|
||||||
|
├── Trading
|
||||||
|
│ ├── get_watchlist()
|
||||||
|
│ ├── execute_trade(params)
|
||||||
|
│ ├── get_positions()
|
||||||
|
│ └── get_trade_history()
|
||||||
|
│
|
||||||
|
└── Sandbox
|
||||||
|
└── run_python(code) ← data-science toolset, matplotlib, etc.
|
||||||
472
doc/user_mcp_resources.md
Normal file
472
doc/user_mcp_resources.md
Normal file
@@ -0,0 +1,472 @@
|
|||||||
|
# User MCP Server - Resource Architecture
|
||||||
|
|
||||||
|
The user's MCP server container owns **all** conversation history, RAG, and contextual data. The platform gateway is a thin, stateless orchestrator that only holds the Anthropic API key.
|
||||||
|
|
||||||
|
## Architecture Principle
|
||||||
|
|
||||||
|
**User Container = Fat Context**
|
||||||
|
- Conversation history (PostgreSQL/SQLite)
|
||||||
|
- RAG system (embeddings, vector search)
|
||||||
|
- User preferences and custom prompts
|
||||||
|
- Trading context (positions, watchlists, alerts)
|
||||||
|
- All user-specific data
|
||||||
|
|
||||||
|
**Platform Gateway = Thin Orchestrator**
|
||||||
|
- Anthropic API key (platform pays for LLM)
|
||||||
|
- Session management (WebSocket/Telegram connections)
|
||||||
|
- MCP client connection pooling
|
||||||
|
- Tool routing (platform vs user tools)
|
||||||
|
- **Zero conversation state stored**
|
||||||
|
|
||||||
|
## MCP Resources for Context Injection
|
||||||
|
|
||||||
|
Resources are **read-only** data sources that provide context to the LLM. They're fetched before each Claude API call and embedded in the conversation.
|
||||||
|
|
||||||
|
### Standard Context Resources
|
||||||
|
|
||||||
|
#### 1. `context://user-profile`
|
||||||
|
**Purpose:** User's trading background and preferences
|
||||||
|
|
||||||
|
**MIME Type:** `text/plain`
|
||||||
|
|
||||||
|
**Example Content:**
|
||||||
|
```
|
||||||
|
User Profile:
|
||||||
|
- Trading experience: Intermediate
|
||||||
|
- Preferred timeframes: 1h, 4h, 1d
|
||||||
|
- Risk tolerance: Medium
|
||||||
|
- Focus: Swing trading with technical indicators
|
||||||
|
- Favorite indicators: RSI, MACD, Bollinger Bands
|
||||||
|
- Active pairs: BTC/USDT, ETH/USDT, SOL/USDT
|
||||||
|
```
|
||||||
|
|
||||||
|
**Implementation Notes:**
|
||||||
|
- Stored in user's database `user_preferences` table
|
||||||
|
- Updated via preference management tools
|
||||||
|
- Includes inferred data from usage patterns
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
#### 2. `context://conversation-summary`
|
||||||
|
**Purpose:** Semantic summary of recent conversation with RAG-enhanced context
|
||||||
|
|
||||||
|
**MIME Type:** `text/plain`
|
||||||
|
|
||||||
|
**Example Content:**
|
||||||
|
```
|
||||||
|
Recent Conversation Summary:
|
||||||
|
|
||||||
|
Last 10 messages (summarized):
|
||||||
|
- User asked about moving average crossover strategies
|
||||||
|
- Discussed backtesting parameters for BTC/USDT
|
||||||
|
- Reviewed risk management with 2% position sizing
|
||||||
|
- Explored adding RSI filter to reduce false signals
|
||||||
|
|
||||||
|
Relevant past discussions (RAG search):
|
||||||
|
- 2 weeks ago: Similar strategy development on ETH/USDT
|
||||||
|
- 1 month ago: User prefers simple strategies over complex ones
|
||||||
|
- Past preference: Avoid strategies with >5 indicators
|
||||||
|
|
||||||
|
Current focus: Optimizing MA crossover with momentum filter
|
||||||
|
```
|
||||||
|
|
||||||
|
**Implementation Notes:**
|
||||||
|
- Last N messages stored in `conversation_history` table
|
||||||
|
- RAG search against embeddings of past conversations
|
||||||
|
- Semantic search using user's current message as query
|
||||||
|
- ChromaDB/pgvector for embedding storage
|
||||||
|
- Summary generated on-demand (can be cached for 1-5 minutes)
|
||||||
|
|
||||||
|
**RAG Integration:**
|
||||||
|
```python
|
||||||
|
async def get_conversation_summary() -> str:
|
||||||
|
# Get recent messages
|
||||||
|
recent = await db.get_recent_messages(limit=50)
|
||||||
|
|
||||||
|
# Semantic search for relevant context
|
||||||
|
relevant = await rag.search_conversation_history(
|
||||||
|
query=recent[-1].content,  # most recent message (assumed to be the user's — verify ordering)
|
||||||
|
limit=5,
|
||||||
|
min_score=0.7
|
||||||
|
)
|
||||||
|
|
||||||
|
# Build summary
|
||||||
|
return build_summary(recent[-10:], relevant)
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
#### 3. `context://workspace-state`
|
||||||
|
**Purpose:** Current trading workspace (chart, positions, watchlist)
|
||||||
|
|
||||||
|
**MIME Type:** `application/json`
|
||||||
|
|
||||||
|
**Example Content:**
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"currentChart": {
|
||||||
|
"ticker": "BINANCE:BTC/USDT",
|
||||||
|
"timeframe": "1h",
|
||||||
|
"indicators": ["SMA(20)", "RSI(14)", "MACD(12,26,9)"]
|
||||||
|
},
|
||||||
|
"watchlist": ["BTC/USDT", "ETH/USDT", "SOL/USDT"],
|
||||||
|
"openPositions": [
|
||||||
|
{
|
||||||
|
"ticker": "BTC/USDT",
|
||||||
|
"side": "long",
|
||||||
|
"size": 0.1,
|
||||||
|
"entryPrice": 45000,
|
||||||
|
"currentPrice": 46500,
|
||||||
|
"unrealizedPnL": 150
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"recentAlerts": [
|
||||||
|
{
|
||||||
|
"type": "price_alert",
|
||||||
|
"message": "BTC/USDT crossed above $46,000",
|
||||||
|
"timestamp": "2025-01-15T10:30:00Z"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Implementation Notes:**
|
||||||
|
- Synced from web client chart state
|
||||||
|
- Updated via WebSocket sync protocol
|
||||||
|
- Includes active indicators on current chart
|
||||||
|
- Position data from trading system
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
#### 4. `context://system-prompt`
|
||||||
|
**Purpose:** User's custom instructions and preferences for AI behavior
|
||||||
|
|
||||||
|
**MIME Type:** `text/plain`
|
||||||
|
|
||||||
|
**Example Content:**
|
||||||
|
```
|
||||||
|
Custom Instructions:
|
||||||
|
- Be concise and data-driven
|
||||||
|
- Always show risk/reward ratios
|
||||||
|
- Prefer simple strategies over complex ones
|
||||||
|
- When suggesting trades, include stop-loss and take-profit levels
|
||||||
|
- Explain your reasoning in trading decisions
|
||||||
|
```
|
||||||
|
|
||||||
|
**Implementation Notes:**
|
||||||
|
- User-editable in preferences UI
|
||||||
|
- Appended **last** to system prompt (highest priority)
|
||||||
|
- Can override platform defaults
|
||||||
|
- Stored in `user_preferences.custom_prompt` field
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## MCP Tools for Actions
|
||||||
|
|
||||||
|
Tools are for **actions** that have side effects. These are **not** used for context fetching.
|
||||||
|
|
||||||
|
### Conversation Management
|
||||||
|
- `save_message(role, content, timestamp)` - Save message to history
|
||||||
|
- `search_conversation(query, limit)` - Explicit semantic search (for user queries like "what did we discuss about BTC?")
|
||||||
|
|
||||||
|
### Strategy & Indicators
|
||||||
|
- `list_strategies()` - List user's strategies
|
||||||
|
- `read_strategy(name)` - Get strategy code
|
||||||
|
- `write_strategy(name, code)` - Save strategy
|
||||||
|
- `run_backtest(strategy, params)` - Execute backtest
|
||||||
|
|
||||||
|
### Trading
|
||||||
|
- `get_watchlist()` - Get watchlist (action that may trigger sync)
|
||||||
|
- `execute_trade(params)` - Execute trade order
|
||||||
|
- `get_positions()` - Fetch current positions from exchange
|
||||||
|
|
||||||
|
### Sandbox
|
||||||
|
- `run_python(code)` - Execute Python code with data science libraries
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Gateway Harness Flow
|
||||||
|
|
||||||
|
```typescript
|
||||||
|
// gateway/src/harness/agent-harness.ts
|
||||||
|
|
||||||
|
async handleMessage(message: InboundMessage): Promise<OutboundMessage> {
|
||||||
|
// 1. Fetch context resources from user's MCP
|
||||||
|
const contextResources = await fetchContextResources([
|
||||||
|
'context://user-profile',
|
||||||
|
'context://conversation-summary', // <-- RAG happens here
|
||||||
|
'context://workspace-state',
|
||||||
|
'context://system-prompt',
|
||||||
|
]);
|
||||||
|
|
||||||
|
// 2. Build system prompt from resources
|
||||||
|
const systemPrompt = buildSystemPrompt(contextResources);
|
||||||
|
|
||||||
|
// 3. Build messages with embedded conversation context
|
||||||
|
const messages = buildMessages(message, contextResources);
|
||||||
|
|
||||||
|
// 4. Get tools from MCP
|
||||||
|
const tools = await mcpClient.listTools();
|
||||||
|
|
||||||
|
// 5. Call Claude with embedded context
|
||||||
|
const response = await anthropic.messages.create({
|
||||||
|
model: 'claude-3-5-sonnet-20241022',
|
||||||
|
system: systemPrompt, // <-- User profile + workspace + custom prompt
|
||||||
|
messages, // <-- Conversation summary from RAG
|
||||||
|
tools,
|
||||||
|
});
|
||||||
|
|
||||||
|
// 6. Save to user's MCP (tool call)
|
||||||
|
await mcpClient.callTool('save_message', { role: 'user', content: message.content });
|
||||||
|
await mcpClient.callTool('save_message', { role: 'assistant', content: response });
|
||||||
|
|
||||||
|
return response;
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## User MCP Server Implementation (Python)
|
||||||
|
|
||||||
|
### Resource Handler
|
||||||
|
|
||||||
|
```python
|
||||||
|
# user-mcp/src/resources.py
|
||||||
|
|
||||||
|
from mcp.server import Server
|
||||||
|
from mcp.types import Resource, ResourceTemplate
|
||||||
|
import asyncpg
|
||||||
|
|
||||||
|
server = Server("dexorder-user")
|
||||||
|
|
||||||
|
@server.list_resources()
|
||||||
|
async def list_resources() -> list[Resource]:
|
||||||
|
return [
|
||||||
|
Resource(
|
||||||
|
uri="context://user-profile",
|
||||||
|
name="User Profile",
|
||||||
|
description="Trading style, preferences, and background",
|
||||||
|
mimeType="text/plain",
|
||||||
|
),
|
||||||
|
Resource(
|
||||||
|
uri="context://conversation-summary",
|
||||||
|
name="Conversation Summary",
|
||||||
|
description="Recent conversation with RAG-enhanced context",
|
||||||
|
mimeType="text/plain",
|
||||||
|
),
|
||||||
|
Resource(
|
||||||
|
uri="context://workspace-state",
|
||||||
|
name="Workspace State",
|
||||||
|
description="Current chart, watchlist, positions",
|
||||||
|
mimeType="application/json",
|
||||||
|
),
|
||||||
|
Resource(
|
||||||
|
uri="context://system-prompt",
|
||||||
|
name="Custom System Prompt",
|
||||||
|
description="User's custom AI instructions",
|
||||||
|
mimeType="text/plain",
|
||||||
|
),
|
||||||
|
]
|
||||||
|
|
||||||
|
@server.read_resource()
|
||||||
|
async def read_resource(uri: str) -> str:
|
||||||
|
if uri == "context://user-profile":
|
||||||
|
return await build_user_profile()
|
||||||
|
elif uri == "context://conversation-summary":
|
||||||
|
return await build_conversation_summary()
|
||||||
|
elif uri == "context://workspace-state":
|
||||||
|
return await build_workspace_state()
|
||||||
|
elif uri == "context://system-prompt":
|
||||||
|
return await get_custom_prompt()
|
||||||
|
else:
|
||||||
|
raise ValueError(f"Unknown resource: {uri}")
|
||||||
|
```
|
||||||
|
|
||||||
|
### RAG Integration
|
||||||
|
|
||||||
|
```python
|
||||||
|
# user-mcp/src/rag.py
|
||||||
|
|
||||||
|
import chromadb
|
||||||
|
from sentence_transformers import SentenceTransformer
|
||||||
|
|
||||||
|
class ConversationRAG:
|
||||||
|
def __init__(self, db_path: str):
|
||||||
|
self.chroma = chromadb.PersistentClient(path=db_path)
|
||||||
|
self.collection = self.chroma.get_or_create_collection("conversations")
|
||||||
|
self.embedder = SentenceTransformer('all-MiniLM-L6-v2')
|
||||||
|
|
||||||
|
async def search_conversation_history(
|
||||||
|
self,
|
||||||
|
query: str,
|
||||||
|
limit: int = 5,
|
||||||
|
min_score: float = 0.7
|
||||||
|
) -> list[dict]:
|
||||||
|
"""Semantic search over conversation history"""
|
||||||
|
# Embed query
|
||||||
|
query_embedding = self.embedder.encode(query).tolist()
|
||||||
|
|
||||||
|
# Search
|
||||||
|
results = self.collection.query(
|
||||||
|
query_embeddings=[query_embedding],
|
||||||
|
n_results=limit,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Filter by similarity and format.
# Note: Chroma query() returns *distances* (lower = more similar), so
# convert each distance to a similarity score before applying the threshold;
# filtering raw distances with `>= min_score` would keep the least similar hits.
relevant = []
for i, distance in enumerate(results['distances'][0]):
    score = 1.0 - distance
    if score >= min_score:
        relevant.append({
            'content': results['documents'][0][i],
            'metadata': results['metadatas'][0][i],
            'score': score,
        })
|
||||||
|
|
||||||
|
return relevant
|
||||||
|
|
||||||
|
async def add_message(self, message_id: str, role: str, content: str, metadata: dict):
|
||||||
|
"""Add message to RAG index"""
|
||||||
|
embedding = self.embedder.encode(content).tolist()
|
||||||
|
|
||||||
|
self.collection.add(
|
||||||
|
ids=[message_id],
|
||||||
|
embeddings=[embedding],
|
||||||
|
documents=[content],
|
||||||
|
metadatas=[{
|
||||||
|
'role': role,
|
||||||
|
'timestamp': metadata.get('timestamp'),
|
||||||
|
**metadata
|
||||||
|
}]
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Conversation Summary Builder
|
||||||
|
|
||||||
|
```python
|
||||||
|
# user-mcp/src/context.py
|
||||||
|
|
||||||
|
async def build_conversation_summary(user_id: str) -> str:
|
||||||
|
"""Build conversation summary with RAG"""
|
||||||
|
# 1. Get recent messages
|
||||||
|
recent_messages = await db.get_messages(
|
||||||
|
user_id=user_id,
|
||||||
|
limit=50,
|
||||||
|
order='desc'
|
||||||
|
)
|
||||||
|
|
||||||
|
# 2. Get current focus (last user message)
|
||||||
|
last_user_msg = next(
|
||||||
|
(m for m in recent_messages if m.role == 'user'),
|
||||||
|
None
|
||||||
|
)
|
||||||
|
|
||||||
|
if not last_user_msg:
|
||||||
|
return "No recent conversation history."
|
||||||
|
|
||||||
|
# 3. RAG search for relevant context
|
||||||
|
rag = ConversationRAG(f"/data/users/{user_id}/rag")
|
||||||
|
relevant_context = await rag.search_conversation_history(
|
||||||
|
query=last_user_msg.content,
|
||||||
|
limit=5,
|
||||||
|
min_score=0.7
|
||||||
|
)
|
||||||
|
|
||||||
|
# 4. Build summary
|
||||||
|
summary = f"Recent Conversation Summary:\n\n"
|
||||||
|
|
||||||
|
# Recent messages — recent_messages is fetched newest-first (order='desc'
# above), so take the first 10 and reverse them to present oldest-first.
summary += "Last 10 messages:\n"
for msg in reversed(recent_messages[:10]):
    summary += f"- {msg.role}: {msg.content[:100]}...\n"
|
||||||
|
|
||||||
|
# Relevant past context
|
||||||
|
if relevant_context:
|
||||||
|
summary += "\nRelevant past discussions (RAG):\n"
|
||||||
|
for ctx in relevant_context:
|
||||||
|
timestamp = ctx['metadata'].get('timestamp', 'unknown')
|
||||||
|
summary += f"- [{timestamp}] {ctx['content'][:150]}...\n"
|
||||||
|
|
||||||
|
# Inferred focus
|
||||||
|
summary += f"\nCurrent focus: {infer_topic(last_user_msg.content)}\n"
|
||||||
|
|
||||||
|
return summary
|
||||||
|
|
||||||
|
def infer_topic(message: str) -> str:
|
||||||
|
"""Simple topic extraction"""
|
||||||
|
keywords = {
|
||||||
|
'strategy': ['strategy', 'backtest', 'trading system'],
|
||||||
|
'indicator': ['indicator', 'rsi', 'macd', 'moving average'],
|
||||||
|
'analysis': ['analyze', 'chart', 'price action'],
|
||||||
|
'risk': ['risk', 'position size', 'stop loss'],
|
||||||
|
}
|
||||||
|
|
||||||
|
message_lower = message.lower()
|
||||||
|
for topic, words in keywords.items():
|
||||||
|
if any(word in message_lower for word in words):
|
||||||
|
return topic
|
||||||
|
|
||||||
|
return 'general trading discussion'
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Benefits of This Architecture
|
||||||
|
|
||||||
|
1. **Privacy**: Conversation history never leaves user's container
|
||||||
|
2. **Customization**: Each user controls their RAG, embeddings, prompt engineering
|
||||||
|
3. **Scalability**: Platform harness is stateless - horizontally scalable
|
||||||
|
4. **Cost Control**: Platform pays for Claude, users pay for their compute/storage
|
||||||
|
5. **Portability**: Users can export/migrate their entire context
|
||||||
|
6. **Development**: Users can test prompts/context locally without platform involvement
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Future Enhancements
|
||||||
|
|
||||||
|
### Dynamic Resource URIs
|
||||||
|
|
||||||
|
Support parameterized resources:
|
||||||
|
```
|
||||||
|
context://conversation/{session_id}
|
||||||
|
context://strategy/{strategy_name}
|
||||||
|
context://backtest/{backtest_id}/results
|
||||||
|
```
|
||||||
|
|
||||||
|
### Resource Templates
|
||||||
|
|
||||||
|
MCP supports resource templates for dynamic discovery:
|
||||||
|
```python
|
||||||
|
@server.list_resource_templates()
|
||||||
|
async def list_templates() -> list[ResourceTemplate]:
|
||||||
|
return [
|
||||||
|
ResourceTemplate(
|
||||||
|
uriTemplate="context://strategy/{name}",
|
||||||
|
name="Strategy Context",
|
||||||
|
description="Context for specific strategy",
|
||||||
|
)
|
||||||
|
]
|
||||||
|
```
|
||||||
|
|
||||||
|
### Streaming Resources
|
||||||
|
|
||||||
|
For large context (e.g., full backtest results), support streaming:
|
||||||
|
```python
|
||||||
|
@server.read_resource()
|
||||||
|
async def read_resource(uri: str) -> AsyncIterator[str]:
|
||||||
|
if uri.startswith("context://backtest/"):
|
||||||
|
async for chunk in stream_backtest_results(uri):
|
||||||
|
yield chunk
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Migration Path
|
||||||
|
|
||||||
|
For users with existing conversation history in platform DB:
|
||||||
|
|
||||||
|
1. **Export script**: Migrate platform history → user container DB
|
||||||
|
2. **RAG indexing**: Embed all historical messages into ChromaDB
|
||||||
|
3. **Preference migration**: Copy user preferences to container
|
||||||
|
4. **Cutover**: Switch to resource-based context fetching
|
||||||
|
|
||||||
|
Platform can keep read-only archive for compliance, but active context lives in user container.
|
||||||
9
gateway/.dockerignore
Normal file
9
gateway/.dockerignore
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
node_modules
|
||||||
|
dist
|
||||||
|
.env
|
||||||
|
.env.*
|
||||||
|
!.env.example
|
||||||
|
*.log
|
||||||
|
.git
|
||||||
|
.gitignore
|
||||||
|
README.md
|
||||||
39
gateway/.env.example
Normal file
39
gateway/.env.example
Normal file
@@ -0,0 +1,39 @@
|
|||||||
|
# Server configuration
|
||||||
|
PORT=3000
|
||||||
|
HOST=0.0.0.0
|
||||||
|
LOG_LEVEL=info
|
||||||
|
CORS_ORIGIN=*
|
||||||
|
|
||||||
|
# Database
|
||||||
|
DATABASE_URL=postgresql://postgres:postgres@localhost:5432/dexorder
|
||||||
|
|
||||||
|
# LLM Provider API Keys (configure at least one)
|
||||||
|
# Anthropic Claude
|
||||||
|
ANTHROPIC_API_KEY=sk-ant-xxxxx
|
||||||
|
|
||||||
|
# OpenAI GPT
|
||||||
|
OPENAI_API_KEY=sk-xxxxx
|
||||||
|
|
||||||
|
# Google Gemini
|
||||||
|
GOOGLE_API_KEY=xxxxx
|
||||||
|
|
||||||
|
# OpenRouter (access to 300+ models with one key)
|
||||||
|
OPENROUTER_API_KEY=sk-or-xxxxx
|
||||||
|
|
||||||
|
# Default model (if user has no preference)
|
||||||
|
DEFAULT_MODEL_PROVIDER=anthropic
|
||||||
|
DEFAULT_MODEL=claude-3-5-sonnet-20241022
|
||||||
|
|
||||||
|
# Telegram (optional)
|
||||||
|
TELEGRAM_BOT_TOKEN=
|
||||||
|
|
||||||
|
# Kubernetes configuration
|
||||||
|
KUBERNETES_NAMESPACE=dexorder-agents
|
||||||
|
KUBERNETES_IN_CLUSTER=false
|
||||||
|
KUBERNETES_CONTEXT=minikube
|
||||||
|
AGENT_IMAGE=ghcr.io/dexorder/agent:latest
|
||||||
|
SIDECAR_IMAGE=ghcr.io/dexorder/lifecycle-sidecar:latest
|
||||||
|
AGENT_STORAGE_CLASS=standard
|
||||||
|
|
||||||
|
# Redis (for session management - future)
|
||||||
|
# REDIS_URL=redis://localhost:6379
|
||||||
6
gateway/.gitignore
vendored
Normal file
6
gateway/.gitignore
vendored
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
node_modules
|
||||||
|
dist
|
||||||
|
.env
|
||||||
|
.env.local
|
||||||
|
*.log
|
||||||
|
.DS_Store
|
||||||
313
gateway/ARCHITECTURE.md
Normal file
313
gateway/ARCHITECTURE.md
Normal file
@@ -0,0 +1,313 @@
|
|||||||
|
# Gateway Architecture: LangChain.js + LangGraph
|
||||||
|
|
||||||
|
## Why LangChain.js (Not Vercel AI SDK or Direct Anthropic SDK)?
|
||||||
|
|
||||||
|
### The Decision
|
||||||
|
|
||||||
|
After evaluating Vercel AI SDK and LangChain.js, we chose **LangChain.js + LangGraph** for these reasons:
|
||||||
|
|
||||||
|
1. **Multi-model support**: 300+ models via OpenRouter, plus direct integrations
|
||||||
|
2. **Complex workflows**: LangGraph for stateful trading analysis pipelines
|
||||||
|
3. **No vendor lock-in**: Switch between Anthropic, OpenAI, Google with one line
|
||||||
|
4. **Streaming**: Same as Vercel AI SDK (`.stream()` method)
|
||||||
|
5. **Tool calling**: Unified across all providers
|
||||||
|
6. **Trading-specific**: State management, conditional branching, human-in-the-loop
|
||||||
|
|
||||||
|
**We don't need Vercel AI SDK because:**
|
||||||
|
- ❌ We use Vue (not React) - don't need React hooks
|
||||||
|
- ❌ We have Node.js servers (not edge) - don't need edge runtime
|
||||||
|
- ✅ **DO need** complex workflows (strategy analysis, backtesting, approvals)
|
||||||
|
- ✅ **DO need** stateful execution (resume from failures)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Architecture Layers
|
||||||
|
|
||||||
|
### Layer 1: Model Abstraction (`src/llm/`)
|
||||||
|
|
||||||
|
**Provider Factory** (`provider.ts`)
|
||||||
|
```typescript
|
||||||
|
const factory = new LLMProviderFactory(config, logger);
|
||||||
|
|
||||||
|
// Create any model
|
||||||
|
const claude = factory.createModel({
|
||||||
|
provider: 'anthropic',
|
||||||
|
model: 'claude-3-5-sonnet-20241022',
|
||||||
|
});
|
||||||
|
|
||||||
|
const gpt4 = factory.createModel({
|
||||||
|
provider: 'openai',
|
||||||
|
model: 'gpt-4o',
|
||||||
|
});
|
||||||
|
```
|
||||||
|
|
||||||
|
**Model Router** (`router.ts`)
|
||||||
|
```typescript
|
||||||
|
const router = new ModelRouter(factory, logger);
|
||||||
|
|
||||||
|
// Intelligently route based on:
|
||||||
|
// - User license (free → Gemini Flash, pro → GPT-4, enterprise → Claude)
|
||||||
|
// - Query complexity (simple → cheap, complex → smart)
|
||||||
|
// - User preference (if set in license.preferredModel)
|
||||||
|
// - Cost optimization (always use cheapest)
|
||||||
|
|
||||||
|
const model = await router.route(
|
||||||
|
message.content,
|
||||||
|
userLicense,
|
||||||
|
RoutingStrategy.COMPLEXITY
|
||||||
|
);
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Layer 2: Agent Harness (`src/harness/`)
|
||||||
|
|
||||||
|
**Stateless Orchestrator**
|
||||||
|
|
||||||
|
The harness has **ZERO conversation state**. Everything lives in user's MCP container.
|
||||||
|
|
||||||
|
**Flow:**
|
||||||
|
```typescript
|
||||||
|
async handleMessage(message: InboundMessage) {
|
||||||
|
// 1. Fetch context from user's MCP (resources, not tools)
|
||||||
|
const resources = await mcpClient.listResources();
|
||||||
|
const context = await Promise.all([
|
||||||
|
mcpClient.readResource('context://user-profile'), // Trading style
|
||||||
|
mcpClient.readResource('context://conversation-summary'), // RAG summary
|
||||||
|
mcpClient.readResource('context://workspace-state'), // Current chart
|
||||||
|
mcpClient.readResource('context://system-prompt'), // Custom instructions
|
||||||
|
]);
|
||||||
|
|
||||||
|
// 2. Route to appropriate model
|
||||||
|
const model = await modelRouter.route(message, license);
|
||||||
|
|
||||||
|
// 3. Build messages with embedded context
|
||||||
|
const messages = buildLangChainMessages(systemPrompt, context);
|
||||||
|
|
||||||
|
// 4. Call LLM
|
||||||
|
const response = await model.invoke(messages);
|
||||||
|
|
||||||
|
// 5. Save to user's MCP (tool call)
|
||||||
|
await mcpClient.callTool('save_message', { role: 'user', content: message });
|
||||||
|
await mcpClient.callTool('save_message', { role: 'assistant', content: response });
|
||||||
|
|
||||||
|
return response;
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Streaming variant:**
|
||||||
|
```typescript
|
||||||
|
async *streamMessage(message: InboundMessage) {
|
||||||
|
const model = await modelRouter.route(message, license);
|
||||||
|
const messages = buildMessages(context, message);
|
||||||
|
|
||||||
|
const stream = await model.stream(messages);
|
||||||
|
|
||||||
|
let fullResponse = '';
|
||||||
|
for await (const chunk of stream) {
|
||||||
|
fullResponse += chunk.content;
|
||||||
|
yield chunk.content; // Stream to WebSocket/Telegram
|
||||||
|
}
|
||||||
|
|
||||||
|
// Save after streaming completes
|
||||||
|
await mcpClient.callTool('save_message', { /* ... */ });
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Layer 3: Workflows (`src/workflows/`)
|
||||||
|
|
||||||
|
**LangGraph for Complex Trading Analysis**
|
||||||
|
|
||||||
|
```typescript
|
||||||
|
// Example: Strategy Analysis Pipeline
|
||||||
|
const workflow = new StateGraph(StrategyAnalysisState)
|
||||||
|
.addNode('code_review', async (state) => {
|
||||||
|
const model = new ChatAnthropic({ model: 'claude-3-opus' });
|
||||||
|
const review = await model.invoke(`Review: ${state.strategyCode}`);
|
||||||
|
return { codeReview: review.content };
|
||||||
|
})
|
||||||
|
.addNode('backtest', async (state) => {
|
||||||
|
// Call user's MCP backtest tool
|
||||||
|
const results = await mcpClient.callTool('run_backtest', {
|
||||||
|
strategy: state.strategyCode,
|
||||||
|
ticker: state.ticker,
|
||||||
|
});
|
||||||
|
return { backtestResults: results };
|
||||||
|
})
|
||||||
|
.addNode('risk_assessment', async (state) => {
|
||||||
|
const model = new ChatAnthropic({ model: 'claude-3-5-sonnet' });
|
||||||
|
const assessment = await model.invoke(
|
||||||
|
`Analyze risk: ${JSON.stringify(state.backtestResults)}`
|
||||||
|
);
|
||||||
|
return { riskAssessment: assessment.content };
|
||||||
|
})
|
||||||
|
.addNode('human_approval', async (state) => {
|
||||||
|
// Pause for user review (human-in-the-loop)
|
||||||
|
return { humanApproved: await waitForUserApproval(state) };
|
||||||
|
})
|
||||||
|
.addConditionalEdges('human_approval', (state) => {
|
||||||
|
return state.humanApproved ? 'deploy' : 'reject';
|
||||||
|
})
|
||||||
|
.compile();
|
||||||
|
|
||||||
|
// Execute
|
||||||
|
const result = await workflow.invoke({
|
||||||
|
strategyCode: userCode,
|
||||||
|
ticker: 'BTC/USDT',
|
||||||
|
timeframe: '1h',
|
||||||
|
});
|
||||||
|
```
|
||||||
|
|
||||||
|
**Benefits:**
|
||||||
|
- **Stateful**: Resume if server crashes mid-analysis
|
||||||
|
- **Conditional**: Route based on results (if Sharpe > 2 → deploy, else → reject)
|
||||||
|
- **Human-in-the-loop**: Pause for user approval
|
||||||
|
- **Multi-step**: Each node can use different models
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## User Context Architecture
|
||||||
|
|
||||||
|
### MCP Resources (Not Tools)
|
||||||
|
|
||||||
|
**User's MCP server exposes resources** (read-only context):
|
||||||
|
|
||||||
|
```
|
||||||
|
context://user-profile → Trading style, preferences
|
||||||
|
context://conversation-summary → RAG-generated summary
|
||||||
|
context://workspace-state → Current chart, positions
|
||||||
|
context://system-prompt → User's custom AI instructions
|
||||||
|
```
|
||||||
|
|
||||||
|
**Gateway fetches and embeds in LLM call:**
|
||||||
|
```typescript
|
||||||
|
const userProfile = await mcpClient.readResource('context://user-profile');
|
||||||
|
const conversationSummary = await mcpClient.readResource('context://conversation-summary');
|
||||||
|
|
||||||
|
// User's MCP server runs RAG search and returns summary
|
||||||
|
// Gateway embeds this in Claude/GPT prompt
|
||||||
|
```
|
||||||
|
|
||||||
|
**Why resources, not tools?**
|
||||||
|
- Resources = context injection (read-only)
|
||||||
|
- Tools = actions (write operations)
|
||||||
|
- Context should be fetched **before** LLM call, not during
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Model Routing Strategies
|
||||||
|
|
||||||
|
### 1. User Preference
|
||||||
|
```typescript
|
||||||
|
// User's license has preferred model
|
||||||
|
{
|
||||||
|
"preferredModel": {
|
||||||
|
"provider": "anthropic",
|
||||||
|
"model": "claude-3-5-sonnet-20241022"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Router uses this if set
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2. Complexity-Based
|
||||||
|
```typescript
|
||||||
|
const isComplex = message.includes('backtest') || message.length > 200;
|
||||||
|
|
||||||
|
if (isComplex) {
|
||||||
|
return { provider: 'anthropic', model: 'claude-3-opus' }; // Smart
|
||||||
|
} else {
|
||||||
|
return { provider: 'openai', model: 'gpt-4o-mini' }; // Fast
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 3. License Tier
|
||||||
|
```typescript
|
||||||
|
switch (license.licenseType) {
|
||||||
|
case 'free':
|
||||||
|
return { provider: 'google', model: 'gemini-2.0-flash-exp' }; // Cheap
|
||||||
|
case 'pro':
|
||||||
|
return { provider: 'openai', model: 'gpt-4o' }; // Balanced
|
||||||
|
case 'enterprise':
|
||||||
|
return { provider: 'anthropic', model: 'claude-3-5-sonnet' }; // Premium
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 4. Cost-Optimized
|
||||||
|
```typescript
|
||||||
|
return { provider: 'google', model: 'gemini-2.0-flash-exp' }; // Always cheapest
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## When to Use What
|
||||||
|
|
||||||
|
### Simple Chat → Agent Harness
|
||||||
|
```typescript
|
||||||
|
// User: "What's the RSI on BTC?"
|
||||||
|
// → Fast streaming response via harness.streamMessage()
|
||||||
|
```
|
||||||
|
|
||||||
|
### Complex Analysis → LangGraph Workflow
|
||||||
|
```typescript
|
||||||
|
// User: "Analyze this strategy and backtest it"
|
||||||
|
// → Multi-step workflow: code review → backtest → risk → approval
|
||||||
|
```
|
||||||
|
|
||||||
|
### Direct Tool Call → MCP Client
|
||||||
|
```typescript
|
||||||
|
// User: "Get my watchlist"
|
||||||
|
// → Direct MCP tool call, no LLM needed
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Data Flow
|
||||||
|
|
||||||
|
```
|
||||||
|
User Message ("Analyze my strategy")
|
||||||
|
↓
|
||||||
|
Gateway → Route to workflow (not harness)
|
||||||
|
↓
|
||||||
|
LangGraph Workflow:
|
||||||
|
├─ Node 1: Code Review (Claude Opus)
|
||||||
|
│ └─ Analyzes strategy code
|
||||||
|
├─ Node 2: Backtest (MCP tool call)
|
||||||
|
│ └─ User's container runs backtest
|
||||||
|
├─ Node 3: Risk Assessment (Claude Sonnet)
|
||||||
|
│ └─ Evaluates results
|
||||||
|
├─ Node 4: Human Approval (pause)
|
||||||
|
│ └─ User reviews in UI
|
||||||
|
└─ Node 5: Recommendation (GPT-4o-mini)
|
||||||
|
└─ Final decision
|
||||||
|
|
||||||
|
Result → Return to user
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Benefits Summary
|
||||||
|
|
||||||
|
| Feature | LangChain.js | Vercel AI SDK | Direct Anthropic SDK |
|
||||||
|
|---------|--------------|---------------|----------------------|
|
||||||
|
| Multi-model | ✅ 300+ models | ✅ 100+ models | ❌ Anthropic only |
|
||||||
|
| Streaming | ✅ `.stream()` | ✅ `streamText()` | ✅ `.stream()` |
|
||||||
|
| Tool calling | ✅ Unified | ✅ Unified | ✅ Anthropic format |
|
||||||
|
| Complex workflows | ✅ LangGraph | ❌ Limited | ❌ DIY |
|
||||||
|
| Stateful agents | ✅ LangGraph | ❌ No | ❌ No |
|
||||||
|
| Human-in-the-loop | ✅ LangGraph | ❌ No | ❌ No |
|
||||||
|
| React hooks | ❌ N/A | ✅ `useChat()` | ❌ N/A |
|
||||||
|
| Bundle size | Large (101kb) | Small (30kb) | Medium (60kb) |
|
||||||
|
| **Dexorder needs** | **✅ Perfect fit** | **❌ Missing workflows** | **❌ Vendor lock-in** |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Next Steps
|
||||||
|
|
||||||
|
1. **Implement tool calling** in agent harness (bind MCP tools to LangChain)
|
||||||
|
2. **Add state persistence** for LangGraph (PostgreSQL checkpointer)
|
||||||
|
3. **Build more workflows**: market scanner, portfolio optimizer
|
||||||
|
4. **Add monitoring**: Track model usage, costs, latency
|
||||||
|
5. **User container**: Implement Python MCP server with resources
|
||||||
40
gateway/Dockerfile
Normal file
40
gateway/Dockerfile
Normal file
@@ -0,0 +1,40 @@
|
|||||||
|
FROM node:22-alpine AS builder
|
||||||
|
|
||||||
|
WORKDIR /app
|
||||||
|
|
||||||
|
# Copy package files
|
||||||
|
COPY package*.json ./
|
||||||
|
COPY tsconfig.json ./
|
||||||
|
|
||||||
|
# Install dependencies
|
||||||
|
RUN npm ci
|
||||||
|
|
||||||
|
# Copy source
|
||||||
|
COPY src ./src
|
||||||
|
|
||||||
|
# Build
|
||||||
|
RUN npm run build
|
||||||
|
|
||||||
|
# Production image
|
||||||
|
FROM node:22-alpine
|
||||||
|
|
||||||
|
WORKDIR /app
|
||||||
|
|
||||||
|
# Copy package files
|
||||||
|
COPY package*.json ./
|
||||||
|
|
||||||
|
# Install production dependencies only
|
||||||
|
RUN npm ci --omit=dev
|
||||||
|
|
||||||
|
# Copy built application
|
||||||
|
COPY --from=builder /app/dist ./dist
|
||||||
|
|
||||||
|
# Create non-root user
|
||||||
|
RUN addgroup -g 1001 -S nodejs && \
|
||||||
|
adduser -S nodejs -u 1001
|
||||||
|
|
||||||
|
USER nodejs
|
||||||
|
|
||||||
|
EXPOSE 3000
|
||||||
|
|
||||||
|
CMD ["node", "dist/main.js"]
|
||||||
212
gateway/README.md
Normal file
212
gateway/README.md
Normal file
@@ -0,0 +1,212 @@
|
|||||||
|
# Dexorder Gateway
|
||||||
|
|
||||||
|
Multi-channel gateway with agent harness for the Dexorder AI platform.
|
||||||
|
|
||||||
|
## Architecture
|
||||||
|
|
||||||
|
```
|
||||||
|
┌─────────────────────────────────────────────────────────┐
|
||||||
|
│ Platform Gateway │
|
||||||
|
│ (Node.js/Fastify) │
|
||||||
|
│ │
|
||||||
|
│ ┌────────────────────────────────────────────────┐ │
|
||||||
|
│ │ Channels │ │
|
||||||
|
│ │ - WebSocket (/ws/chat) │ │
|
||||||
|
│ │ - Telegram Webhook (/webhook/telegram) │ │
|
||||||
|
│ └────────────────────────────────────────────────┘ │
|
||||||
|
│ ↕ │
|
||||||
|
│ ┌────────────────────────────────────────────────┐ │
|
||||||
|
│ │ Authenticator │ │
|
||||||
|
│ │ - JWT verification (WebSocket) │ │
|
||||||
|
│ │ - Channel linking (Telegram) │ │
|
||||||
|
│ │ - User license lookup (PostgreSQL) │ │
|
||||||
|
│ └────────────────────────────────────────────────┘ │
|
||||||
|
│ ↕ │
|
||||||
|
│ ┌────────────────────────────────────────────────┐ │
|
||||||
|
│ │ Agent Harness (per-session) │ │
|
||||||
|
│ │ - Claude API integration │ │
|
||||||
|
│ │ - MCP client connector │ │
|
||||||
|
│ │ - Conversation state │ │
|
||||||
|
│ └────────────────────────────────────────────────┘ │
|
||||||
|
│ ↕ │
|
||||||
|
│ ┌────────────────────────────────────────────────┐ │
|
||||||
|
│ │ MCP Client │ │
|
||||||
|
│ │ - User container connection │ │
|
||||||
|
│ │ - Tool routing │ │
|
||||||
|
│ └────────────────────────────────────────────────┘ │
|
||||||
|
└─────────────────────────────────────────────────────────┘
|
||||||
|
↕
|
||||||
|
┌───────────────────────────────┐
|
||||||
|
│ User MCP Server (Python) │
|
||||||
|
│ - Strategies, indicators │
|
||||||
|
│ - Memory, preferences │
|
||||||
|
│ - Backtest sandbox │
|
||||||
|
└───────────────────────────────────┘
|
||||||
|
```
|
||||||
|
|
||||||
|
## Features
|
||||||
|
|
||||||
|
- **Automatic container provisioning**: Creates user agent containers on-demand via Kubernetes
|
||||||
|
- **Multi-channel support**: WebSocket and Telegram webhooks
|
||||||
|
- **Per-channel authentication**: JWT for web, channel linking for chat apps
|
||||||
|
- **User license management**: Feature flags and resource limits from PostgreSQL
|
||||||
|
- **Container lifecycle management**: Auto-shutdown on idle (handled by container sidecar)
|
||||||
|
- **License-based resources**: Different memory/CPU/storage limits per tier
|
||||||
|
- **Multi-model LLM support**: Anthropic Claude, OpenAI GPT, Google Gemini, OpenRouter (300+ models)
|
||||||
|
- **Zero vendor lock-in**: Switch models with one line, powered by LangChain.js
|
||||||
|
- **Intelligent routing**: Auto-select models based on complexity, license tier, or user preference
|
||||||
|
- **Streaming responses**: Real-time chat with WebSocket and Telegram
|
||||||
|
- **Complex workflows**: LangGraph for stateful trading analysis (backtest → risk → approval)
|
||||||
|
- **Agent harness**: Stateless orchestrator (all context lives in user's MCP container)
|
||||||
|
- **MCP resource integration**: User's RAG, conversation history, and preferences
|
||||||
|
|
||||||
|
## Container Management
|
||||||
|
|
||||||
|
When a user authenticates, the gateway:
|
||||||
|
|
||||||
|
1. **Checks for existing container**: Queries Kubernetes for deployment
|
||||||
|
2. **Creates if missing**: Renders YAML template based on license tier
|
||||||
|
3. **Waits for ready**: Polls deployment status until healthy
|
||||||
|
4. **Returns MCP endpoint**: Computed from service name
|
||||||
|
5. **Connects to MCP server**: Proceeds with normal authentication flow
|
||||||
|
|
||||||
|
Container templates by license tier:
|
||||||
|
|
||||||
|
| Tier | Memory | CPU | Storage | Idle Timeout |
|
||||||
|
|------|--------|-----|---------|--------------|
|
||||||
|
| Free | 512Mi | 500m | 1Gi | 15min |
|
||||||
|
| Pro | 2Gi | 2000m | 10Gi | 60min |
|
||||||
|
| Enterprise | 4Gi | 4000m | 50Gi | Never |
|
||||||
|
|
||||||
|
Containers self-manage their lifecycle using the lifecycle sidecar (see `../lifecycle-sidecar/`)
|
||||||
|
|
||||||
|
## Setup
|
||||||
|
|
||||||
|
### Prerequisites
|
||||||
|
|
||||||
|
- Node.js >= 22.0.0
|
||||||
|
- PostgreSQL database
|
||||||
|
- At least one LLM provider API key:
|
||||||
|
- Anthropic Claude
|
||||||
|
- OpenAI GPT
|
||||||
|
- Google Gemini
|
||||||
|
- OpenRouter (one key for 300+ models)
|
||||||
|
|
||||||
|
### Development
|
||||||
|
|
||||||
|
1. Install dependencies:
|
||||||
|
```bash
|
||||||
|
npm install
|
||||||
|
```
|
||||||
|
|
||||||
|
2. Copy environment template:
|
||||||
|
```bash
|
||||||
|
cp .env.example .env
|
||||||
|
```
|
||||||
|
|
||||||
|
3. Configure `.env` (see `.env.example`):
|
||||||
|
```bash
|
||||||
|
DATABASE_URL=postgresql://postgres:postgres@localhost:5432/dexorder
|
||||||
|
|
||||||
|
# Configure at least one provider
|
||||||
|
ANTHROPIC_API_KEY=sk-ant-xxxxx
|
||||||
|
# OPENAI_API_KEY=sk-xxxxx
|
||||||
|
# GOOGLE_API_KEY=xxxxx
|
||||||
|
# OPENROUTER_API_KEY=sk-or-xxxxx
|
||||||
|
|
||||||
|
# Optional: Set default model
|
||||||
|
DEFAULT_MODEL_PROVIDER=anthropic
|
||||||
|
DEFAULT_MODEL=claude-3-5-sonnet-20241022
|
||||||
|
```
|
||||||
|
|
||||||
|
4. Run development server:
|
||||||
|
```bash
|
||||||
|
npm run dev
|
||||||
|
```
|
||||||
|
|
||||||
|
### Production Build
|
||||||
|
|
||||||
|
```bash
|
||||||
|
npm run build
|
||||||
|
npm start
|
||||||
|
```
|
||||||
|
|
||||||
|
### Docker
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker build -t dexorder/gateway:latest .
|
||||||
|
docker run -p 3000:3000 --env-file .env dexorder/gateway:latest
|
||||||
|
```
|
||||||
|
|
||||||
|
## Database Schema
|
||||||
|
|
||||||
|
Required PostgreSQL tables (will be documented separately):
|
||||||
|
|
||||||
|
### `user_licenses`
|
||||||
|
- `user_id` (text, primary key)
|
||||||
|
- `email` (text)
|
||||||
|
- `license_type` (text: 'free', 'pro', 'enterprise')
|
||||||
|
- `features` (jsonb)
|
||||||
|
- `resource_limits` (jsonb)
|
||||||
|
- `mcp_server_url` (text)
|
||||||
|
- `expires_at` (timestamp, nullable)
|
||||||
|
- `created_at` (timestamp)
|
||||||
|
- `updated_at` (timestamp)
|
||||||
|
|
||||||
|
### `user_channel_links`
|
||||||
|
- `id` (serial, primary key)
|
||||||
|
- `user_id` (text, foreign key)
|
||||||
|
- `channel_type` (text: 'telegram', 'slack', 'discord')
|
||||||
|
- `channel_user_id` (text)
|
||||||
|
- `created_at` (timestamp)
|
||||||
|
|
||||||
|
## API Endpoints
|
||||||
|
|
||||||
|
### WebSocket
|
||||||
|
|
||||||
|
**`GET /ws/chat`**
|
||||||
|
- WebSocket connection for web client
|
||||||
|
- Auth: Bearer token in headers
|
||||||
|
- Protocol: JSON messages
|
||||||
|
|
||||||
|
Example:
|
||||||
|
```javascript
|
||||||
|
const ws = new WebSocket('ws://localhost:3000/ws/chat', {
|
||||||
|
headers: {
|
||||||
|
'Authorization': 'Bearer your-jwt-token'
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
ws.on('message', (data) => {
|
||||||
|
const msg = JSON.parse(data);
|
||||||
|
console.log(msg);
|
||||||
|
});
|
||||||
|
|
||||||
|
ws.send(JSON.stringify({
|
||||||
|
type: 'message',
|
||||||
|
content: 'Hello, AI!'
|
||||||
|
}));
|
||||||
|
```
|
||||||
|
|
||||||
|
### Telegram Webhook
|
||||||
|
|
||||||
|
**`POST /webhook/telegram`**
|
||||||
|
- Telegram bot webhook endpoint
|
||||||
|
- Auth: Telegram user linked to platform user
|
||||||
|
- Automatically processes incoming messages
|
||||||
|
|
||||||
|
### Health Check
|
||||||
|
|
||||||
|
**`GET /health`**
|
||||||
|
- Returns server health status
|
||||||
|
|
||||||
|
## TODO
|
||||||
|
|
||||||
|
- [ ] Implement JWT verification with JWKS
|
||||||
|
- [ ] Implement MCP HTTP/SSE transport
|
||||||
|
- [ ] Add Redis for session persistence
|
||||||
|
- [ ] Add rate limiting per user license
|
||||||
|
- [ ] Add message usage tracking
|
||||||
|
- [ ] Add streaming responses for WebSocket
|
||||||
|
- [ ] Add Slack and Discord channel handlers
|
||||||
|
- [ ] Add session cleanup/timeout logic
|
||||||
42
gateway/package.json
Normal file
42
gateway/package.json
Normal file
@@ -0,0 +1,42 @@
|
|||||||
|
{
|
||||||
|
"name": "@dexorder/gateway",
|
||||||
|
"version": "0.1.0",
|
||||||
|
"type": "module",
|
||||||
|
"private": true,
|
||||||
|
"description": "Multi-channel gateway with agent harness for Dexorder AI platform",
|
||||||
|
"scripts": {
|
||||||
|
"dev": "tsx watch src/main.ts",
|
||||||
|
"build": "tsc",
|
||||||
|
"start": "node dist/main.js",
|
||||||
|
"typecheck": "tsc --noEmit"
|
||||||
|
},
|
||||||
|
"dependencies": {
|
||||||
|
"@fastify/cors": "^10.0.1",
|
||||||
|
"@fastify/websocket": "^11.0.1",
|
||||||
|
"@kubernetes/client-node": "^0.21.0",
|
||||||
|
"@langchain/anthropic": "^0.3.8",
|
||||||
|
"@langchain/core": "^0.3.24",
|
||||||
|
"@langchain/google-genai": "^0.1.6",
|
||||||
|
"@langchain/langgraph": "^0.2.26",
|
||||||
|
"@langchain/openai": "^0.3.21",
|
||||||
|
"@langchain/openrouter": "^0.1.2",
|
||||||
|
"@modelcontextprotocol/sdk": "^1.0.4",
|
||||||
|
"fastify": "^5.2.0",
|
||||||
|
"ioredis": "^5.4.2",
|
||||||
|
"js-yaml": "^4.1.0",
|
||||||
|
"pg": "^8.13.1",
|
||||||
|
"pino": "^9.6.0",
|
||||||
|
"pino-pretty": "^13.0.0",
|
||||||
|
"zod": "^3.24.1"
|
||||||
|
},
|
||||||
|
"devDependencies": {
|
||||||
|
"@types/js-yaml": "^4.0.9",
|
||||||
|
"@types/node": "^22.10.2",
|
||||||
|
"@types/pg": "^8.11.10",
|
||||||
|
"tsx": "^4.19.2",
|
||||||
|
"typescript": "^5.7.2"
|
||||||
|
},
|
||||||
|
"engines": {
|
||||||
|
"node": ">=22.0.0"
|
||||||
|
}
|
||||||
|
}
|
||||||
79
gateway/schema.sql
Normal file
79
gateway/schema.sql
Normal file
@@ -0,0 +1,79 @@
|
|||||||
|
-- User license and authorization schema
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS user_licenses (
|
||||||
|
user_id TEXT PRIMARY KEY,
|
||||||
|
email TEXT,
|
||||||
|
license_type TEXT NOT NULL CHECK (license_type IN ('free', 'pro', 'enterprise')),
|
||||||
|
features JSONB NOT NULL DEFAULT '{
|
||||||
|
"maxIndicators": 5,
|
||||||
|
"maxStrategies": 3,
|
||||||
|
"maxBacktestDays": 30,
|
||||||
|
"realtimeData": false,
|
||||||
|
"customExecutors": false,
|
||||||
|
"apiAccess": false
|
||||||
|
}',
|
||||||
|
resource_limits JSONB NOT NULL DEFAULT '{
|
||||||
|
"maxConcurrentSessions": 1,
|
||||||
|
"maxMessagesPerDay": 100,
|
||||||
|
"maxTokensPerMessage": 4096,
|
||||||
|
"rateLimitPerMinute": 10
|
||||||
|
}',
|
||||||
|
mcp_server_url TEXT NOT NULL,
|
||||||
|
preferred_model JSONB DEFAULT NULL,
|
||||||
|
expires_at TIMESTAMP WITH TIME ZONE,
|
||||||
|
created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW(),
|
||||||
|
updated_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW()
|
||||||
|
);
|
||||||
|
|
||||||
|
COMMENT ON COLUMN user_licenses.preferred_model IS 'Optional model preference: {"provider": "anthropic", "model": "claude-3-5-sonnet-20241022", "temperature": 0.7}';
|
||||||
|
|
||||||
|
CREATE INDEX idx_user_licenses_expires_at ON user_licenses(expires_at)
|
||||||
|
WHERE expires_at IS NOT NULL;
|
||||||
|
|
||||||
|
-- Channel linking for multi-channel support
|
||||||
|
CREATE TABLE IF NOT EXISTS user_channel_links (
|
||||||
|
id SERIAL PRIMARY KEY,
|
||||||
|
user_id TEXT NOT NULL REFERENCES user_licenses(user_id) ON DELETE CASCADE,
|
||||||
|
channel_type TEXT NOT NULL CHECK (channel_type IN ('telegram', 'slack', 'discord', 'websocket')),
|
||||||
|
channel_user_id TEXT NOT NULL,
|
||||||
|
metadata JSONB,
|
||||||
|
created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW(),
|
||||||
|
UNIQUE(channel_type, channel_user_id)
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE INDEX idx_user_channel_links_user_id ON user_channel_links(user_id);
|
||||||
|
CREATE INDEX idx_user_channel_links_channel ON user_channel_links(channel_type, channel_user_id);
|
||||||
|
|
||||||
|
-- Example data for development
|
||||||
|
INSERT INTO user_licenses (user_id, email, license_type, mcp_server_url, features, resource_limits, preferred_model)
|
||||||
|
VALUES (
|
||||||
|
'dev-user-001',
|
||||||
|
'dev@example.com',
|
||||||
|
'pro',
|
||||||
|
'http://localhost:8080/mcp',
|
||||||
|
'{
|
||||||
|
"maxIndicators": 50,
|
||||||
|
"maxStrategies": 20,
|
||||||
|
"maxBacktestDays": 365,
|
||||||
|
"realtimeData": true,
|
||||||
|
"customExecutors": true,
|
||||||
|
"apiAccess": true
|
||||||
|
}',
|
||||||
|
'{
|
||||||
|
"maxConcurrentSessions": 5,
|
||||||
|
"maxMessagesPerDay": 1000,
|
||||||
|
"maxTokensPerMessage": 8192,
|
||||||
|
"rateLimitPerMinute": 60
|
||||||
|
}',
|
||||||
|
'{
|
||||||
|
"provider": "anthropic",
|
||||||
|
"model": "claude-3-5-sonnet-20241022",
|
||||||
|
"temperature": 0.7
|
||||||
|
}'
|
||||||
|
)
|
||||||
|
ON CONFLICT (user_id) DO NOTHING;
|
||||||
|
|
||||||
|
-- Example Telegram link
|
||||||
|
INSERT INTO user_channel_links (user_id, channel_type, channel_user_id)
|
||||||
|
VALUES ('dev-user-001', 'telegram', '123456789')
|
||||||
|
ON CONFLICT (channel_type, channel_user_id) DO NOTHING;
|
||||||
146
gateway/src/auth/authenticator.ts
Normal file
146
gateway/src/auth/authenticator.ts
Normal file
@@ -0,0 +1,146 @@
|
|||||||
|
import type { FastifyRequest, FastifyBaseLogger } from 'fastify';
|
||||||
|
import { UserService } from '../db/user-service.js';
|
||||||
|
import { ChannelType, type AuthContext } from '../types/user.js';
|
||||||
|
import type { ContainerManager } from '../k8s/container-manager.js';
|
||||||
|
|
||||||
|
export interface AuthenticatorConfig {
|
||||||
|
userService: UserService;
|
||||||
|
containerManager: ContainerManager;
|
||||||
|
logger: FastifyBaseLogger;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Multi-channel authenticator
|
||||||
|
* Handles authentication for WebSocket, Telegram, and other channels
|
||||||
|
*/
|
||||||
|
export class Authenticator {
|
||||||
|
private config: AuthenticatorConfig;
|
||||||
|
|
||||||
|
constructor(config: AuthenticatorConfig) {
|
||||||
|
this.config = config;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Authenticate WebSocket connection via JWT token
|
||||||
|
* Also ensures the user's container is running
|
||||||
|
*/
|
||||||
|
async authenticateWebSocket(
|
||||||
|
request: FastifyRequest
|
||||||
|
): Promise<AuthContext | null> {
|
||||||
|
try {
|
||||||
|
const token = this.extractBearerToken(request);
|
||||||
|
if (!token) {
|
||||||
|
this.config.logger.warn('No bearer token in WebSocket connection');
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
const userId = await this.config.userService.verifyWebToken(token);
|
||||||
|
if (!userId) {
|
||||||
|
this.config.logger.warn('Invalid JWT token');
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
const license = await this.config.userService.getUserLicense(userId);
|
||||||
|
if (!license) {
|
||||||
|
this.config.logger.warn({ userId }, 'User license not found');
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Ensure container is running (may take time if creating new container)
|
||||||
|
this.config.logger.info({ userId }, 'Ensuring user container is running');
|
||||||
|
const { mcpEndpoint, wasCreated } = await this.config.containerManager.ensureContainerRunning(
|
||||||
|
userId,
|
||||||
|
license
|
||||||
|
);
|
||||||
|
|
||||||
|
this.config.logger.info(
|
||||||
|
{ userId, mcpEndpoint, wasCreated },
|
||||||
|
'Container is ready'
|
||||||
|
);
|
||||||
|
|
||||||
|
// Update license with actual MCP endpoint
|
||||||
|
license.mcpServerUrl = mcpEndpoint;
|
||||||
|
|
||||||
|
const sessionId = `ws_${userId}_${Date.now()}`;
|
||||||
|
|
||||||
|
return {
|
||||||
|
userId,
|
||||||
|
channelType: ChannelType.WEBSOCKET,
|
||||||
|
channelUserId: userId, // For WebSocket, same as userId
|
||||||
|
sessionId,
|
||||||
|
license,
|
||||||
|
authenticatedAt: new Date(),
|
||||||
|
};
|
||||||
|
} catch (error) {
|
||||||
|
this.config.logger.error({ error }, 'WebSocket authentication error');
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Authenticate Telegram webhook
|
||||||
|
* Also ensures the user's container is running
|
||||||
|
*/
|
||||||
|
async authenticateTelegram(telegramUserId: string): Promise<AuthContext | null> {
|
||||||
|
try {
|
||||||
|
const userId = await this.config.userService.getUserIdFromChannel(
|
||||||
|
'telegram',
|
||||||
|
telegramUserId
|
||||||
|
);
|
||||||
|
|
||||||
|
if (!userId) {
|
||||||
|
this.config.logger.warn(
|
||||||
|
{ telegramUserId },
|
||||||
|
'Telegram user not linked to platform user'
|
||||||
|
);
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
const license = await this.config.userService.getUserLicense(userId);
|
||||||
|
if (!license) {
|
||||||
|
this.config.logger.warn({ userId }, 'User license not found');
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Ensure container is running
|
||||||
|
this.config.logger.info({ userId }, 'Ensuring user container is running');
|
||||||
|
const { mcpEndpoint, wasCreated } = await this.config.containerManager.ensureContainerRunning(
|
||||||
|
userId,
|
||||||
|
license
|
||||||
|
);
|
||||||
|
|
||||||
|
this.config.logger.info(
|
||||||
|
{ userId, mcpEndpoint, wasCreated },
|
||||||
|
'Container is ready'
|
||||||
|
);
|
||||||
|
|
||||||
|
// Update license with actual MCP endpoint
|
||||||
|
license.mcpServerUrl = mcpEndpoint;
|
||||||
|
|
||||||
|
const sessionId = `tg_${telegramUserId}_${Date.now()}`;
|
||||||
|
|
||||||
|
return {
|
||||||
|
userId,
|
||||||
|
channelType: ChannelType.TELEGRAM,
|
||||||
|
channelUserId: telegramUserId,
|
||||||
|
sessionId,
|
||||||
|
license,
|
||||||
|
authenticatedAt: new Date(),
|
||||||
|
};
|
||||||
|
} catch (error) {
|
||||||
|
this.config.logger.error({ error }, 'Telegram authentication error');
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Extract bearer token from request headers
|
||||||
|
*/
|
||||||
|
private extractBearerToken(request: FastifyRequest): string | null {
|
||||||
|
const auth = request.headers.authorization;
|
||||||
|
if (!auth || !auth.startsWith('Bearer ')) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
return auth.substring(7);
|
||||||
|
}
|
||||||
|
}
|
||||||
163
gateway/src/channels/telegram-handler.ts
Normal file
163
gateway/src/channels/telegram-handler.ts
Normal file
@@ -0,0 +1,163 @@
|
|||||||
|
import type { FastifyInstance, FastifyRequest, FastifyReply } from 'fastify';
|
||||||
|
import type { Authenticator } from '../auth/authenticator.js';
|
||||||
|
import { AgentHarness } from '../harness/agent-harness.js';
|
||||||
|
import type { InboundMessage } from '../types/messages.js';
|
||||||
|
import { randomUUID } from 'crypto';
|
||||||
|
|
||||||
|
import type { ProviderConfig } from '../llm/provider.js';
|
||||||
|
|
||||||
|
/** Dependencies required to construct a {@link TelegramHandler}. */
export interface TelegramHandlerConfig {
  /** Resolves Telegram users to platform users and licenses. */
  authenticator: Authenticator;
  /** LLM provider configuration, passed through to each AgentHarness. */
  providerConfig: ProviderConfig;
  /** Bot API token used to send replies via api.telegram.org. */
  telegramBotToken: string;
}
|
||||||
|
|
||||||
|
/**
 * Subset of the Telegram Bot API "Update" payload that this handler
 * reads. Fields not listed here are ignored.
 */
interface TelegramUpdate {
  update_id: number;
  message?: {
    message_id: number;
    from: {
      id: number; // Telegram numeric user id
      first_name: string;
      username?: string;
    };
    chat: {
      id: number; // chat to reply into
      type: string; // e.g. 'private' or 'group' — not inspected here
    };
    text?: string; // only text messages are processed
    photo?: Array<{
      file_id: string;
      file_size: number;
    }>;
  };
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Telegram webhook handler
|
||||||
|
*/
|
||||||
|
export class TelegramHandler {
|
||||||
|
private config: TelegramHandlerConfig;
|
||||||
|
private sessions = new Map<string, AgentHarness>();
|
||||||
|
|
||||||
|
constructor(config: TelegramHandlerConfig) {
|
||||||
|
this.config = config;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Register Telegram webhook routes
|
||||||
|
*/
|
||||||
|
register(app: FastifyInstance): void {
|
||||||
|
app.post('/webhook/telegram', async (request: FastifyRequest, reply: FastifyReply) => {
|
||||||
|
await this.handleWebhook(request, reply, app);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Handle Telegram webhook
|
||||||
|
*/
|
||||||
|
private async handleWebhook(
|
||||||
|
request: FastifyRequest,
|
||||||
|
reply: FastifyReply,
|
||||||
|
app: FastifyInstance
|
||||||
|
): Promise<void> {
|
||||||
|
const logger = app.log;
|
||||||
|
|
||||||
|
try {
|
||||||
|
const update = request.body as TelegramUpdate;
|
||||||
|
|
||||||
|
if (!update.message?.text) {
|
||||||
|
// Ignore non-text messages for now
|
||||||
|
reply.code(200).send({ ok: true });
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
const telegramUserId = update.message.from.id.toString();
|
||||||
|
const chatId = update.message.chat.id;
|
||||||
|
const text = update.message.text;
|
||||||
|
|
||||||
|
logger.info({ telegramUserId, chatId, text }, 'Received Telegram message');
|
||||||
|
|
||||||
|
// Authenticate
|
||||||
|
const authContext = await this.config.authenticator.authenticateTelegram(telegramUserId);
|
||||||
|
if (!authContext) {
|
||||||
|
logger.warn({ telegramUserId }, 'Telegram user not authenticated');
|
||||||
|
await this.sendTelegramMessage(
|
||||||
|
chatId,
|
||||||
|
'Please link your Telegram account to Dexorder first.'
|
||||||
|
);
|
||||||
|
reply.code(200).send({ ok: true });
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Get or create harness
|
||||||
|
let harness = this.sessions.get(authContext.sessionId);
|
||||||
|
if (!harness) {
|
||||||
|
harness = new AgentHarness({
|
||||||
|
userId: authContext.userId,
|
||||||
|
sessionId: authContext.sessionId,
|
||||||
|
license: authContext.license,
|
||||||
|
providerConfig: this.config.providerConfig,
|
||||||
|
logger,
|
||||||
|
});
|
||||||
|
await harness.initialize();
|
||||||
|
this.sessions.set(authContext.sessionId, harness);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Process message
|
||||||
|
const inboundMessage: InboundMessage = {
|
||||||
|
messageId: randomUUID(),
|
||||||
|
userId: authContext.userId,
|
||||||
|
sessionId: authContext.sessionId,
|
||||||
|
content: text,
|
||||||
|
timestamp: new Date(),
|
||||||
|
};
|
||||||
|
|
||||||
|
const response = await harness.handleMessage(inboundMessage);
|
||||||
|
|
||||||
|
// Send response back to Telegram
|
||||||
|
await this.sendTelegramMessage(chatId, response.content);
|
||||||
|
|
||||||
|
reply.code(200).send({ ok: true });
|
||||||
|
} catch (error) {
|
||||||
|
logger.error({ error }, 'Error handling Telegram webhook');
|
||||||
|
reply.code(500).send({ ok: false, error: 'Internal server error' });
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Send message to Telegram chat
|
||||||
|
*/
|
||||||
|
private async sendTelegramMessage(chatId: number, text: string): Promise<void> {
|
||||||
|
const url = `https://api.telegram.org/bot${this.config.telegramBotToken}/sendMessage`;
|
||||||
|
|
||||||
|
try {
|
||||||
|
const response = await fetch(url, {
|
||||||
|
method: 'POST',
|
||||||
|
headers: {
|
||||||
|
'Content-Type': 'application/json',
|
||||||
|
},
|
||||||
|
body: JSON.stringify({
|
||||||
|
chat_id: chatId,
|
||||||
|
text,
|
||||||
|
parse_mode: 'Markdown',
|
||||||
|
}),
|
||||||
|
});
|
||||||
|
|
||||||
|
if (!response.ok) {
|
||||||
|
throw new Error(`Telegram API error: ${response.statusText}`);
|
||||||
|
}
|
||||||
|
} catch (error) {
|
||||||
|
console.error('Failed to send Telegram message:', error);
|
||||||
|
throw error;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Cleanup old sessions (call periodically)
|
||||||
|
*/
|
||||||
|
async cleanupSessions(maxAgeMs = 30 * 60 * 1000): Promise<void> {
|
||||||
|
// TODO: Track session last activity and cleanup
|
||||||
|
// For now, sessions persist until server restart
|
||||||
|
}
|
||||||
|
}
|
||||||
161
gateway/src/channels/websocket-handler.ts
Normal file
161
gateway/src/channels/websocket-handler.ts
Normal file
@@ -0,0 +1,161 @@
|
|||||||
|
import type { FastifyInstance, FastifyRequest } from 'fastify';
|
||||||
|
import type { WebSocket } from '@fastify/websocket';
|
||||||
|
import type { Authenticator } from '../auth/authenticator.js';
|
||||||
|
import { AgentHarness } from '../harness/agent-harness.js';
|
||||||
|
import type { InboundMessage } from '../types/messages.js';
|
||||||
|
import { randomUUID } from 'crypto';
|
||||||
|
|
||||||
|
import type { ProviderConfig } from '../llm/provider.js';
|
||||||
|
|
||||||
|
/** Dependencies required to construct a {@link WebSocketHandler}. */
export interface WebSocketHandlerConfig {
  /** Verifies the JWT on the upgrade request and prepares the container. */
  authenticator: Authenticator;
  /** LLM provider configuration, passed through to each AgentHarness. */
  providerConfig: ProviderConfig;
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* WebSocket channel handler
|
||||||
|
*/
|
||||||
|
/**
 * WebSocket channel handler.
 *
 * Owns one AgentHarness per authenticated connection, keyed by sessionId.
 * The connection lifecycle is: status('authenticating') -> authenticate
 * -> status('initializing') -> harness init -> 'connected' -> message
 * loop -> cleanup on close.
 */
export class WebSocketHandler {
  private config: WebSocketHandlerConfig;
  // sessionId -> harness; entries are removed in the 'close' handler.
  private sessions = new Map<string, AgentHarness>();

  constructor(config: WebSocketHandlerConfig) {
    this.config = config;
  }

  /**
   * Register WebSocket routes.
   */
  register(app: FastifyInstance): void {
    app.get(
      '/ws/chat',
      { websocket: true },
      async (socket: WebSocket, request: FastifyRequest) => {
        await this.handleConnection(socket, request, app);
      }
    );
  }

  /**
   * Handle a WebSocket connection end-to-end: authenticate, initialize
   * the per-user AgentHarness, then wire message/close/error handlers.
   * Sends JSON status frames so the client can show progress while the
   * (potentially slow) container startup completes.
   */
  private async handleConnection(
    socket: WebSocket,
    request: FastifyRequest,
    app: FastifyInstance
  ): Promise<void> {
    const logger = app.log;

    // Send initial connecting message
    socket.send(
      JSON.stringify({
        type: 'status',
        status: 'authenticating',
        message: 'Authenticating...',
      })
    );

    // Authenticate (this may take time if creating container)
    const authContext = await this.config.authenticator.authenticateWebSocket(request);
    if (!authContext) {
      logger.warn('WebSocket authentication failed');
      socket.send(
        JSON.stringify({
          type: 'error',
          message: 'Authentication failed',
        })
      );
      // 1008 = policy violation close code
      socket.close(1008, 'Authentication failed');
      return;
    }

    logger.info(
      { userId: authContext.userId, sessionId: authContext.sessionId },
      'WebSocket connection authenticated'
    );

    // Send workspace starting message
    socket.send(
      JSON.stringify({
        type: 'status',
        status: 'initializing',
        message: 'Starting your workspace...',
      })
    );

    // Create agent harness
    const harness = new AgentHarness({
      userId: authContext.userId,
      sessionId: authContext.sessionId,
      license: authContext.license,
      providerConfig: this.config.providerConfig,
      logger,
    });

    try {
      await harness.initialize();
      this.sessions.set(authContext.sessionId, harness);

      // Send connected message
      socket.send(
        JSON.stringify({
          type: 'connected',
          sessionId: authContext.sessionId,
          userId: authContext.userId,
          licenseType: authContext.license.licenseType,
          message: 'Connected to Dexorder AI',
        })
      );

      // Handle messages. Only frames with type 'message' are processed;
      // other frame types are silently ignored.
      socket.on('message', async (data: Buffer) => {
        try {
          const payload = JSON.parse(data.toString());

          if (payload.type === 'message') {
            const inboundMessage: InboundMessage = {
              messageId: randomUUID(),
              userId: authContext.userId,
              sessionId: authContext.sessionId,
              content: payload.content,
              attachments: payload.attachments,
              timestamp: new Date(),
            };

            const response = await harness.handleMessage(inboundMessage);

            socket.send(
              JSON.stringify({
                type: 'message',
                ...response,
              })
            );
          }
        } catch (error) {
          // Covers both malformed JSON and harness failures; the
          // connection stays open for subsequent messages.
          logger.error({ error }, 'Error handling WebSocket message');
          socket.send(
            JSON.stringify({
              type: 'error',
              message: 'Failed to process message',
            })
          );
        }
      });

      // Handle disconnection: release the harness and its MCP connection.
      socket.on('close', async () => {
        logger.info({ sessionId: authContext.sessionId }, 'WebSocket disconnected');
        await harness.cleanup();
        this.sessions.delete(authContext.sessionId);
      });

      socket.on('error', (error) => {
        logger.error({ error, sessionId: authContext.sessionId }, 'WebSocket error');
      });
    } catch (error) {
      // Harness initialization failed; 1011 = internal server error.
      logger.error({ error }, 'Failed to initialize agent harness');
      socket.close(1011, 'Internal server error');
      await harness.cleanup();
    }
  }
}
|
||||||
107
gateway/src/db/user-service.ts
Normal file
107
gateway/src/db/user-service.ts
Normal file
@@ -0,0 +1,107 @@
|
|||||||
|
import { Pool, PoolClient } from 'pg';
|
||||||
|
import type { UserLicense } from '../types/user.js';
|
||||||
|
import { UserLicenseSchema } from '../types/user.js';
|
||||||
|
|
||||||
|
export class UserService {
|
||||||
|
private pool: Pool;
|
||||||
|
|
||||||
|
constructor(connectionString: string) {
|
||||||
|
this.pool = new Pool({
|
||||||
|
connectionString,
|
||||||
|
max: 20,
|
||||||
|
idleTimeoutMillis: 30000,
|
||||||
|
connectionTimeoutMillis: 2000,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get user license by user ID
|
||||||
|
*/
|
||||||
|
async getUserLicense(userId: string): Promise<UserLicense | null> {
|
||||||
|
const client = await this.pool.connect();
|
||||||
|
try {
|
||||||
|
const result = await client.query(
|
||||||
|
`SELECT
|
||||||
|
user_id as "userId",
|
||||||
|
email,
|
||||||
|
license_type as "licenseType",
|
||||||
|
features,
|
||||||
|
resource_limits as "resourceLimits",
|
||||||
|
mcp_server_url as "mcpServerUrl",
|
||||||
|
preferred_model as "preferredModel",
|
||||||
|
expires_at as "expiresAt",
|
||||||
|
created_at as "createdAt",
|
||||||
|
updated_at as "updatedAt"
|
||||||
|
FROM user_licenses
|
||||||
|
WHERE user_id = $1
|
||||||
|
AND (expires_at IS NULL OR expires_at > NOW())`,
|
||||||
|
[userId]
|
||||||
|
);
|
||||||
|
|
||||||
|
if (result.rows.length === 0) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
const row = result.rows[0];
|
||||||
|
|
||||||
|
// Parse and validate
|
||||||
|
return UserLicenseSchema.parse({
|
||||||
|
userId: row.userId,
|
||||||
|
email: row.email,
|
||||||
|
licenseType: row.licenseType,
|
||||||
|
features: row.features,
|
||||||
|
resourceLimits: row.resourceLimits,
|
||||||
|
mcpServerUrl: row.mcpServerUrl,
|
||||||
|
preferredModel: row.preferredModel,
|
||||||
|
expiresAt: row.expiresAt,
|
||||||
|
createdAt: row.createdAt,
|
||||||
|
updatedAt: row.updatedAt,
|
||||||
|
});
|
||||||
|
} finally {
|
||||||
|
client.release();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get user ID from channel-specific identifier
|
||||||
|
*/
|
||||||
|
async getUserIdFromChannel(channelType: string, channelUserId: string): Promise<string | null> {
|
||||||
|
const client = await this.pool.connect();
|
||||||
|
try {
|
||||||
|
const result = await client.query(
|
||||||
|
`SELECT user_id
|
||||||
|
FROM user_channel_links
|
||||||
|
WHERE channel_type = $1 AND channel_user_id = $2`,
|
||||||
|
[channelType, channelUserId]
|
||||||
|
);
|
||||||
|
|
||||||
|
return result.rows.length > 0 ? result.rows[0].user_id : null;
|
||||||
|
} finally {
|
||||||
|
client.release();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Verify JWT token from web client
|
||||||
|
* TODO: Implement JWT verification with JWKS
|
||||||
|
*/
|
||||||
|
async verifyWebToken(token: string): Promise<string | null> {
|
||||||
|
// Placeholder - implement JWT verification
|
||||||
|
// For now, decode without verification (INSECURE - FOR DEV ONLY)
|
||||||
|
try {
|
||||||
|
const payload = JSON.parse(
|
||||||
|
Buffer.from(token.split('.')[1], 'base64').toString()
|
||||||
|
);
|
||||||
|
return payload.sub || null;
|
||||||
|
} catch {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Close database pool
|
||||||
|
*/
|
||||||
|
async close(): Promise<void> {
|
||||||
|
await this.pool.end();
|
||||||
|
}
|
||||||
|
}
|
||||||
306
gateway/src/harness/agent-harness.ts
Normal file
306
gateway/src/harness/agent-harness.ts
Normal file
@@ -0,0 +1,306 @@
|
|||||||
|
import type { BaseChatModel } from '@langchain/core/language_models/chat_models';
|
||||||
|
import type { BaseMessage } from '@langchain/core/messages';
|
||||||
|
import { HumanMessage, AIMessage, SystemMessage } from '@langchain/core/messages';
|
||||||
|
import type { FastifyBaseLogger } from 'fastify';
|
||||||
|
import type { UserLicense } from '../types/user.js';
|
||||||
|
import type { InboundMessage, OutboundMessage } from '../types/messages.js';
|
||||||
|
import { MCPClientConnector } from './mcp-client.js';
|
||||||
|
import { CONTEXT_URIS, type ResourceContent } from '../types/resources.js';
|
||||||
|
import { LLMProviderFactory, type ProviderConfig } from '../llm/provider.js';
|
||||||
|
import { ModelRouter, RoutingStrategy } from '../llm/router.js';
|
||||||
|
|
||||||
|
/** Per-session configuration for an {@link AgentHarness}. */
export interface AgentHarnessConfig {
  /** Platform user id this harness serves. */
  userId: string;
  /** Session identifier, minted by the channel's authenticator. */
  sessionId: string;
  /** User license; its mcpServerUrl points at the user's container. */
  license: UserLicense;
  /** LLM provider configuration for the model factory/router. */
  providerConfig: ProviderConfig;
  logger: FastifyBaseLogger;
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Agent harness orchestrates between LLM and user's MCP server.
|
||||||
|
*
|
||||||
|
* This is a STATELESS orchestrator - all conversation history, RAG, and context
|
||||||
|
* lives in the user's MCP server container. The harness only:
|
||||||
|
* 1. Fetches context from user's MCP resources
|
||||||
|
* 2. Routes to appropriate LLM model
|
||||||
|
* 3. Calls LLM with embedded context
|
||||||
|
* 4. Routes tool calls to user's MCP or platform tools
|
||||||
|
* 5. Saves messages back to user's MCP
|
||||||
|
*/
|
||||||
|
export class AgentHarness {
|
||||||
|
private config: AgentHarnessConfig;
|
||||||
|
private modelFactory: LLMProviderFactory;
|
||||||
|
private modelRouter: ModelRouter;
|
||||||
|
private mcpClient: MCPClientConnector;
|
||||||
|
|
||||||
|
constructor(config: AgentHarnessConfig) {
|
||||||
|
this.config = config;
|
||||||
|
|
||||||
|
this.modelFactory = new LLMProviderFactory(config.providerConfig, config.logger);
|
||||||
|
this.modelRouter = new ModelRouter(this.modelFactory, config.logger);
|
||||||
|
|
||||||
|
this.mcpClient = new MCPClientConnector({
|
||||||
|
userId: config.userId,
|
||||||
|
mcpServerUrl: config.license.mcpServerUrl,
|
||||||
|
logger: config.logger,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Initialize harness and connect to user's MCP server
|
||||||
|
*/
|
||||||
|
async initialize(): Promise<void> {
|
||||||
|
this.config.logger.info(
|
||||||
|
{ userId: this.config.userId, sessionId: this.config.sessionId },
|
||||||
|
'Initializing agent harness'
|
||||||
|
);
|
||||||
|
|
||||||
|
try {
|
||||||
|
await this.mcpClient.connect();
|
||||||
|
this.config.logger.info('Agent harness initialized');
|
||||||
|
} catch (error) {
|
||||||
|
this.config.logger.error({ error }, 'Failed to initialize agent harness');
|
||||||
|
throw error;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Handle incoming message from user
|
||||||
|
*/
|
||||||
|
async handleMessage(message: InboundMessage): Promise<OutboundMessage> {
|
||||||
|
this.config.logger.info(
|
||||||
|
{ messageId: message.messageId, userId: message.userId },
|
||||||
|
'Processing user message'
|
||||||
|
);
|
||||||
|
|
||||||
|
try {
|
||||||
|
// 1. Fetch context resources from user's MCP server
|
||||||
|
this.config.logger.debug('Fetching context resources from MCP');
|
||||||
|
const contextResources = await this.fetchContextResources();
|
||||||
|
|
||||||
|
// 2. Build system prompt from resources
|
||||||
|
const systemPrompt = this.buildSystemPrompt(contextResources);
|
||||||
|
|
||||||
|
// 3. Build messages with conversation context from MCP
|
||||||
|
const messages = this.buildMessages(message, contextResources);
|
||||||
|
|
||||||
|
// 4. Route to appropriate model
|
||||||
|
const model = await this.modelRouter.route(
|
||||||
|
message.content,
|
||||||
|
this.config.license,
|
||||||
|
RoutingStrategy.COMPLEXITY
|
||||||
|
);
|
||||||
|
|
||||||
|
// 5. Build LangChain messages
|
||||||
|
const langchainMessages = this.buildLangChainMessages(systemPrompt, messages);
|
||||||
|
|
||||||
|
// 6. Call LLM with streaming
|
||||||
|
this.config.logger.debug('Invoking LLM');
|
||||||
|
const response = await model.invoke(langchainMessages);
|
||||||
|
|
||||||
|
// 7. Extract text response (tool handling TODO)
|
||||||
|
const assistantMessage = response.content as string;
|
||||||
|
|
||||||
|
// 8. Save messages to user's MCP server
|
||||||
|
this.config.logger.debug('Saving messages to MCP');
|
||||||
|
await this.mcpClient.callTool('save_message', {
|
||||||
|
role: 'user',
|
||||||
|
content: message.content,
|
||||||
|
timestamp: message.timestamp.toISOString(),
|
||||||
|
});
|
||||||
|
await this.mcpClient.callTool('save_message', {
|
||||||
|
role: 'assistant',
|
||||||
|
content: assistantMessage,
|
||||||
|
timestamp: new Date().toISOString(),
|
||||||
|
});
|
||||||
|
|
||||||
|
return {
|
||||||
|
messageId: `msg_${Date.now()}`,
|
||||||
|
sessionId: message.sessionId,
|
||||||
|
content: assistantMessage,
|
||||||
|
timestamp: new Date(),
|
||||||
|
};
|
||||||
|
} catch (error) {
|
||||||
|
this.config.logger.error({ error }, 'Error processing message');
|
||||||
|
throw error;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Stream response from LLM
|
||||||
|
*/
|
||||||
|
async *streamMessage(message: InboundMessage): AsyncGenerator<string> {
|
||||||
|
try {
|
||||||
|
// Fetch context
|
||||||
|
const contextResources = await this.fetchContextResources();
|
||||||
|
const systemPrompt = this.buildSystemPrompt(contextResources);
|
||||||
|
const messages = this.buildMessages(message, contextResources);
|
||||||
|
|
||||||
|
// Route to model
|
||||||
|
const model = await this.modelRouter.route(
|
||||||
|
message.content,
|
||||||
|
this.config.license,
|
||||||
|
RoutingStrategy.COMPLEXITY
|
||||||
|
);
|
||||||
|
|
||||||
|
// Build messages
|
||||||
|
const langchainMessages = this.buildLangChainMessages(systemPrompt, messages);
|
||||||
|
|
||||||
|
// Stream response
|
||||||
|
const stream = await model.stream(langchainMessages);
|
||||||
|
|
||||||
|
let fullResponse = '';
|
||||||
|
for await (const chunk of stream) {
|
||||||
|
const content = chunk.content as string;
|
||||||
|
fullResponse += content;
|
||||||
|
yield content;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Save after streaming completes
|
||||||
|
await this.mcpClient.callTool('save_message', {
|
||||||
|
role: 'user',
|
||||||
|
content: message.content,
|
||||||
|
timestamp: message.timestamp.toISOString(),
|
||||||
|
});
|
||||||
|
await this.mcpClient.callTool('save_message', {
|
||||||
|
role: 'assistant',
|
||||||
|
content: fullResponse,
|
||||||
|
timestamp: new Date().toISOString(),
|
||||||
|
});
|
||||||
|
} catch (error) {
|
||||||
|
this.config.logger.error({ error }, 'Error streaming message');
|
||||||
|
throw error;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Fetch context resources from user's MCP server
|
||||||
|
*/
|
||||||
|
private async fetchContextResources(): Promise<ResourceContent[]> {
|
||||||
|
const contextUris = [
|
||||||
|
CONTEXT_URIS.USER_PROFILE,
|
||||||
|
CONTEXT_URIS.CONVERSATION_SUMMARY,
|
||||||
|
CONTEXT_URIS.WORKSPACE_STATE,
|
||||||
|
CONTEXT_URIS.SYSTEM_PROMPT,
|
||||||
|
];
|
||||||
|
|
||||||
|
const resources = await Promise.all(
|
||||||
|
contextUris.map(async (uri) => {
|
||||||
|
try {
|
||||||
|
return await this.mcpClient.readResource(uri);
|
||||||
|
} catch (error) {
|
||||||
|
this.config.logger.warn({ error, uri }, 'Failed to fetch resource, using empty');
|
||||||
|
return { uri, text: '' };
|
||||||
|
}
|
||||||
|
})
|
||||||
|
);
|
||||||
|
|
||||||
|
return resources;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Build messages array with context from resources
|
||||||
|
*/
|
||||||
|
private buildMessages(
|
||||||
|
currentMessage: InboundMessage,
|
||||||
|
contextResources: ResourceContent[]
|
||||||
|
): Array<{ role: string; content: string }> {
|
||||||
|
const conversationSummary = contextResources.find(
|
||||||
|
(r) => r.uri === CONTEXT_URIS.CONVERSATION_SUMMARY
|
||||||
|
);
|
||||||
|
|
||||||
|
const messages: Array<{ role: string; content: string }> = [];
|
||||||
|
|
||||||
|
// Add conversation context as a system-like user message
|
||||||
|
if (conversationSummary?.text) {
|
||||||
|
messages.push({
|
||||||
|
role: 'user',
|
||||||
|
content: `[Previous Conversation Context]\n${conversationSummary.text}`,
|
||||||
|
});
|
||||||
|
messages.push({
|
||||||
|
role: 'assistant',
|
||||||
|
content: 'I understand the context from our previous conversations.',
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
// Add current user message
|
||||||
|
messages.push({
|
||||||
|
role: 'user',
|
||||||
|
content: currentMessage.content,
|
||||||
|
});
|
||||||
|
|
||||||
|
return messages;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Convert to LangChain message format
|
||||||
|
*/
|
||||||
|
private buildLangChainMessages(
|
||||||
|
systemPrompt: string,
|
||||||
|
messages: Array<{ role: string; content: string }>
|
||||||
|
): BaseMessage[] {
|
||||||
|
const langchainMessages: BaseMessage[] = [new SystemMessage(systemPrompt)];
|
||||||
|
|
||||||
|
for (const msg of messages) {
|
||||||
|
if (msg.role === 'user') {
|
||||||
|
langchainMessages.push(new HumanMessage(msg.content));
|
||||||
|
} else if (msg.role === 'assistant') {
|
||||||
|
langchainMessages.push(new AIMessage(msg.content));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return langchainMessages;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Build system prompt from platform base + user resources
|
||||||
|
*/
|
||||||
|
private buildSystemPrompt(contextResources: ResourceContent[]): string {
|
||||||
|
const userProfile = contextResources.find((r) => r.uri === CONTEXT_URIS.USER_PROFILE);
|
||||||
|
const customPrompt = contextResources.find((r) => r.uri === CONTEXT_URIS.SYSTEM_PROMPT);
|
||||||
|
const workspaceState = contextResources.find((r) => r.uri === CONTEXT_URIS.WORKSPACE_STATE);
|
||||||
|
|
||||||
|
// Base platform prompt
|
||||||
|
let prompt = `You are a helpful AI assistant for Dexorder, an AI-first trading platform.
|
||||||
|
You help users research markets, develop indicators and strategies, and analyze trading data.
|
||||||
|
|
||||||
|
User license: ${this.config.license.licenseType}
|
||||||
|
Available features: ${JSON.stringify(this.config.license.features, null, 2)}`;
|
||||||
|
|
||||||
|
// Add user profile context
|
||||||
|
if (userProfile?.text) {
|
||||||
|
prompt += `\n\n# User Profile\n${userProfile.text}`;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Add workspace context
|
||||||
|
if (workspaceState?.text) {
|
||||||
|
prompt += `\n\n# Current Workspace\n${workspaceState.text}`;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Add user's custom instructions (highest priority)
|
||||||
|
if (customPrompt?.text) {
|
||||||
|
prompt += `\n\n# User Instructions\n${customPrompt.text}`;
|
||||||
|
}
|
||||||
|
|
||||||
|
return prompt;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get platform tools (non-user-specific tools)
|
||||||
|
*/
|
||||||
|
private getPlatformTools(): Array<{ name: string; description?: string }> {
|
||||||
|
// Platform tools that don't need user's MCP
|
||||||
|
return [
|
||||||
|
// TODO: Add platform tools like market data queries, chart rendering, etc.
|
||||||
|
];
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Cleanup resources
|
||||||
|
*/
|
||||||
|
async cleanup(): Promise<void> {
|
||||||
|
this.config.logger.info('Cleaning up agent harness');
|
||||||
|
await this.mcpClient.disconnect();
|
||||||
|
}
|
||||||
|
}
|
||||||
259
gateway/src/harness/mcp-client.ts
Normal file
259
gateway/src/harness/mcp-client.ts
Normal file
@@ -0,0 +1,259 @@
|
|||||||
|
import { Client } from '@modelcontextprotocol/sdk/client/index.js';
|
||||||
|
import { StdioClientTransport } from '@modelcontextprotocol/sdk/client/stdio.js';
|
||||||
|
import type { FastifyBaseLogger } from 'fastify';
|
||||||
|
|
||||||
|
/** Configuration for a per-user {@link MCPClientConnector}. */
export interface MCPClientConfig {
  /** Platform user whose container this client connects to. */
  userId: string;
  /** Endpoint of the user's MCP server container. */
  mcpServerUrl: string;
  /** Optional platform-issued JWT for the container's auth header. */
  platformJWT?: string;
  logger: FastifyBaseLogger;
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* MCP client connector for user's container
|
||||||
|
* Manages connection to user-specific MCP server
|
||||||
|
*/
|
||||||
|
export class MCPClientConnector {
|
||||||
|
private client: Client | null = null;
|
||||||
|
private connected = false;
|
||||||
|
private config: MCPClientConfig;
|
||||||
|
|
||||||
|
constructor(config: MCPClientConfig) {
|
||||||
|
this.config = config;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Connect to user's MCP server
|
||||||
|
* TODO: Implement HTTP/SSE transport instead of stdio for container communication
|
||||||
|
*/
|
||||||
|
async connect(): Promise<void> {
|
||||||
|
if (this.connected) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
this.config.logger.info(
|
||||||
|
{ userId: this.config.userId, url: this.config.mcpServerUrl },
|
||||||
|
'Connecting to user MCP server'
|
||||||
|
);
|
||||||
|
|
||||||
|
this.client = new Client(
|
||||||
|
{
|
||||||
|
name: 'dexorder-gateway',
|
||||||
|
version: '0.1.0',
|
||||||
|
},
|
||||||
|
{
|
||||||
|
capabilities: {
|
||||||
|
tools: {},
|
||||||
|
resources: {},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
);
|
||||||
|
|
||||||
|
// TODO: Replace with HTTP transport when user containers are ready
|
||||||
|
// For now, this is a placeholder structure
|
||||||
|
// const transport = new HTTPTransport(this.config.mcpServerUrl, {
|
||||||
|
// headers: {
|
||||||
|
// 'Authorization': `Bearer ${this.config.platformJWT}`
|
||||||
|
// }
|
||||||
|
// });
|
||||||
|
|
||||||
|
// Placeholder: will be replaced with actual container transport
|
||||||
|
this.config.logger.warn(
|
||||||
|
'MCP transport not yet implemented - using placeholder'
|
||||||
|
);
|
||||||
|
|
||||||
|
this.connected = true;
|
||||||
|
this.config.logger.info('Connected to user MCP server');
|
||||||
|
} catch (error) {
|
||||||
|
this.config.logger.error(
|
||||||
|
{ error, userId: this.config.userId },
|
||||||
|
'Failed to connect to user MCP server'
|
||||||
|
);
|
||||||
|
throw error;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Call a tool on the user's MCP server
|
||||||
|
*/
|
||||||
|
async callTool(name: string, args: Record<string, unknown>): Promise<unknown> {
|
||||||
|
if (!this.client || !this.connected) {
|
||||||
|
throw new Error('MCP client not connected');
|
||||||
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
this.config.logger.debug({ tool: name, args }, 'Calling MCP tool');
|
||||||
|
|
||||||
|
// TODO: Implement when MCP client is connected
|
||||||
|
// const result = await this.client.callTool({ name, arguments: args });
|
||||||
|
// return result;
|
||||||
|
|
||||||
|
// Placeholder response
|
||||||
|
return { success: true, message: 'MCP tool call placeholder' };
|
||||||
|
} catch (error) {
|
||||||
|
this.config.logger.error({ error, tool: name }, 'MCP tool call failed');
|
||||||
|
throw error;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* List available tools from user's MCP server
|
||||||
|
*/
|
||||||
|
async listTools(): Promise<Array<{ name: string; description?: string }>> {
|
||||||
|
if (!this.client || !this.connected) {
|
||||||
|
throw new Error('MCP client not connected');
|
||||||
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
// TODO: Implement when MCP client is connected
|
||||||
|
// const tools = await this.client.listTools();
|
||||||
|
// return tools;
|
||||||
|
|
||||||
|
// Placeholder tools (actions only, not context)
|
||||||
|
return [
|
||||||
|
{ name: 'save_message', description: 'Save message to conversation history' },
|
||||||
|
{ name: 'list_strategies', description: 'List user strategies' },
|
||||||
|
{ name: 'read_strategy', description: 'Read strategy code' },
|
||||||
|
{ name: 'write_strategy', description: 'Write strategy code' },
|
||||||
|
{ name: 'run_backtest', description: 'Run backtest on strategy' },
|
||||||
|
{ name: 'get_watchlist', description: 'Get user watchlist' },
|
||||||
|
{ name: 'execute_trade', description: 'Execute trade' },
|
||||||
|
];
|
||||||
|
} catch (error) {
|
||||||
|
this.config.logger.error({ error }, 'Failed to list MCP tools');
|
||||||
|
throw error;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* List available resources from user's MCP server
|
||||||
|
*/
|
||||||
|
async listResources(): Promise<Array<{ uri: string; name: string; description?: string; mimeType?: string }>> {
|
||||||
|
if (!this.client || !this.connected) {
|
||||||
|
throw new Error('MCP client not connected');
|
||||||
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
// TODO: Implement when MCP client is connected
|
||||||
|
// const resources = await this.client.listResources();
|
||||||
|
// return resources;
|
||||||
|
|
||||||
|
// Placeholder resources for user context
|
||||||
|
return [
|
||||||
|
{
|
||||||
|
uri: 'context://user-profile',
|
||||||
|
name: 'User Profile',
|
||||||
|
description: 'User trading style, preferences, and background',
|
||||||
|
mimeType: 'text/plain',
|
||||||
|
},
|
||||||
|
{
|
||||||
|
uri: 'context://conversation-summary',
|
||||||
|
name: 'Conversation Summary',
|
||||||
|
description: 'Semantic summary of recent conversation history with RAG',
|
||||||
|
mimeType: 'text/plain',
|
||||||
|
},
|
||||||
|
{
|
||||||
|
uri: 'context://workspace-state',
|
||||||
|
name: 'Workspace State',
|
||||||
|
description: 'Current chart, watchlist, and open positions',
|
||||||
|
mimeType: 'application/json',
|
||||||
|
},
|
||||||
|
{
|
||||||
|
uri: 'context://system-prompt',
|
||||||
|
name: 'Custom System Prompt',
|
||||||
|
description: 'User custom instructions for the assistant',
|
||||||
|
mimeType: 'text/plain',
|
||||||
|
},
|
||||||
|
];
|
||||||
|
} catch (error) {
|
||||||
|
this.config.logger.error({ error }, 'Failed to list MCP resources');
|
||||||
|
throw error;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Read a resource from user's MCP server
|
||||||
|
*/
|
||||||
|
async readResource(uri: string): Promise<{ uri: string; mimeType?: string; text?: string; blob?: string }> {
|
||||||
|
if (!this.client || !this.connected) {
|
||||||
|
throw new Error('MCP client not connected');
|
||||||
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
this.config.logger.debug({ uri }, 'Reading MCP resource');
|
||||||
|
|
||||||
|
// TODO: Implement when MCP client is connected
|
||||||
|
// const resource = await this.client.readResource({ uri });
|
||||||
|
// return resource;
|
||||||
|
|
||||||
|
// Placeholder resource content
|
||||||
|
if (uri === 'context://user-profile') {
|
||||||
|
return {
|
||||||
|
uri,
|
||||||
|
mimeType: 'text/plain',
|
||||||
|
text: `User Profile:
|
||||||
|
- Trading experience: Intermediate
|
||||||
|
- Preferred timeframes: 1h, 4h, 1d
|
||||||
|
- Risk tolerance: Medium
|
||||||
|
- Focus: Swing trading with technical indicators`,
|
||||||
|
};
|
||||||
|
} else if (uri === 'context://conversation-summary') {
|
||||||
|
return {
|
||||||
|
uri,
|
||||||
|
mimeType: 'text/plain',
|
||||||
|
text: `Recent Conversation Summary:
|
||||||
|
[RAG-generated summary would go here]
|
||||||
|
|
||||||
|
User recently discussed:
|
||||||
|
- Moving average crossover strategies
|
||||||
|
- Backtesting on BTC/USDT
|
||||||
|
- Risk management techniques`,
|
||||||
|
};
|
||||||
|
} else if (uri === 'context://workspace-state') {
|
||||||
|
return {
|
||||||
|
uri,
|
||||||
|
mimeType: 'application/json',
|
||||||
|
text: JSON.stringify({
|
||||||
|
currentChart: { ticker: 'BINANCE:BTC/USDT', timeframe: '1h' },
|
||||||
|
watchlist: ['BTC/USDT', 'ETH/USDT', 'SOL/USDT'],
|
||||||
|
openPositions: [],
|
||||||
|
}, null, 2),
|
||||||
|
};
|
||||||
|
} else if (uri === 'context://system-prompt') {
|
||||||
|
return {
|
||||||
|
uri,
|
||||||
|
mimeType: 'text/plain',
|
||||||
|
text: `Custom Instructions:
|
||||||
|
- Be concise and data-driven
|
||||||
|
- Always show risk/reward ratios
|
||||||
|
- Prefer simple strategies over complex ones`,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
return { uri, text: '' };
|
||||||
|
} catch (error) {
|
||||||
|
this.config.logger.error({ error, uri }, 'MCP resource read failed');
|
||||||
|
throw error;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Disconnect from MCP server
|
||||||
|
*/
|
||||||
|
async disconnect(): Promise<void> {
|
||||||
|
if (this.client && this.connected) {
|
||||||
|
try {
|
||||||
|
await this.client.close();
|
||||||
|
this.connected = false;
|
||||||
|
this.config.logger.info('Disconnected from user MCP server');
|
||||||
|
} catch (error) {
|
||||||
|
this.config.logger.error({ error }, 'Error disconnecting from MCP server');
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
isConnected(): boolean {
|
||||||
|
return this.connected;
|
||||||
|
}
|
||||||
|
}
|
||||||
gateway/src/k8s/client.ts — new file (327 lines)
@@ -0,0 +1,327 @@
|
|||||||
|
import * as k8s from '@kubernetes/client-node';
|
||||||
|
import type { FastifyBaseLogger } from 'fastify';
|
||||||
|
import * as yaml from 'js-yaml';
|
||||||
|
import * as fs from 'fs/promises';
|
||||||
|
import * as path from 'path';
|
||||||
|
import { fileURLToPath } from 'url';
|
||||||
|
|
||||||
|
// ESM modules have no __dirname/__filename; derive them from import.meta.url.
// __dirname is used below to locate the bundled deployment templates.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
|
||||||
|
|
||||||
|
// Settings for constructing a KubernetesClient.
export interface K8sClientConfig {
  // Namespace in which all agent resources are created, read, and deleted.
  namespace: string;
  // True when running inside the cluster (loads the in-cluster config);
  // false loads kubeconfig from the default location.
  inCluster: boolean;
  context?: string; // For local dev
  // Logger used for all cluster-operation diagnostics.
  logger: FastifyBaseLogger;
}
|
||||||
|
|
||||||
|
// Inputs for rendering an agent deployment from a license-tier template.
export interface DeploymentSpec {
  // Raw platform user ID; substituted into {{userId}} in the template.
  userId: string;
  // Selects which template file is rendered: `${licenseType}-tier.yaml`.
  licenseType: 'free' | 'pro' | 'enterprise';
  // Container image for the agent itself ({{agentImage}}).
  agentImage: string;
  // Container image for the lifecycle sidecar ({{sidecarImage}}).
  sidecarImage: string;
  // StorageClass for the agent's PersistentVolumeClaim ({{storageClass}}).
  storageClass: string;
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Kubernetes client wrapper for managing agent deployments
|
||||||
|
*/
|
||||||
|
export class KubernetesClient {
|
||||||
|
private config: K8sClientConfig;
|
||||||
|
private k8sConfig: k8s.KubeConfig;
|
||||||
|
private appsApi: k8s.AppsV1Api;
|
||||||
|
private coreApi: k8s.CoreV1Api;
|
||||||
|
|
||||||
|
constructor(config: K8sClientConfig) {
|
||||||
|
this.config = config;
|
||||||
|
this.k8sConfig = new k8s.KubeConfig();
|
||||||
|
|
||||||
|
if (config.inCluster) {
|
||||||
|
this.k8sConfig.loadFromCluster();
|
||||||
|
this.config.logger.info('Loaded in-cluster Kubernetes config');
|
||||||
|
} else {
|
||||||
|
this.k8sConfig.loadFromDefault();
|
||||||
|
if (config.context) {
|
||||||
|
this.k8sConfig.setCurrentContext(config.context);
|
||||||
|
this.config.logger.info({ context: config.context }, 'Set Kubernetes context');
|
||||||
|
}
|
||||||
|
this.config.logger.info('Loaded Kubernetes config from default location');
|
||||||
|
}
|
||||||
|
|
||||||
|
this.appsApi = this.k8sConfig.makeApiClient(k8s.AppsV1Api);
|
||||||
|
this.coreApi = this.k8sConfig.makeApiClient(k8s.CoreV1Api);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Generate deployment name from user ID
|
||||||
|
*/
|
||||||
|
static getDeploymentName(userId: string): string {
|
||||||
|
// Sanitize userId to be k8s-compliant (lowercase alphanumeric + hyphens)
|
||||||
|
const sanitized = userId.toLowerCase().replace(/[^a-z0-9-]/g, '-');
|
||||||
|
return `agent-${sanitized}`;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Generate service name (same as deployment)
|
||||||
|
*/
|
||||||
|
static getServiceName(userId: string): string {
|
||||||
|
return this.getDeploymentName(userId);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Generate PVC name
|
||||||
|
*/
|
||||||
|
static getPvcName(userId: string): string {
|
||||||
|
return `${this.getDeploymentName(userId)}-data`;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Compute MCP endpoint URL from service name
|
||||||
|
*/
|
||||||
|
static getMcpEndpoint(userId: string, namespace: string): string {
|
||||||
|
const serviceName = this.getServiceName(userId);
|
||||||
|
return `http://${serviceName}.${namespace}.svc.cluster.local:3000`;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Check if deployment exists
|
||||||
|
*/
|
||||||
|
async deploymentExists(deploymentName: string): Promise<boolean> {
|
||||||
|
try {
|
||||||
|
await this.appsApi.readNamespacedDeployment(deploymentName, this.config.namespace);
|
||||||
|
return true;
|
||||||
|
} catch (error: any) {
|
||||||
|
if (error.response?.statusCode === 404) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
throw error;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create agent deployment from template
|
||||||
|
*/
|
||||||
|
async createAgentDeployment(spec: DeploymentSpec): Promise<void> {
|
||||||
|
const deploymentName = KubernetesClient.getDeploymentName(spec.userId);
|
||||||
|
const serviceName = KubernetesClient.getServiceName(spec.userId);
|
||||||
|
const pvcName = KubernetesClient.getPvcName(spec.userId);
|
||||||
|
|
||||||
|
this.config.logger.info(
|
||||||
|
{ userId: spec.userId, licenseType: spec.licenseType, deploymentName },
|
||||||
|
'Creating agent deployment'
|
||||||
|
);
|
||||||
|
|
||||||
|
// Load template based on license type
|
||||||
|
const templatePath = path.join(
|
||||||
|
__dirname,
|
||||||
|
'templates',
|
||||||
|
`${spec.licenseType}-tier.yaml`
|
||||||
|
);
|
||||||
|
|
||||||
|
const templateContent = await fs.readFile(templatePath, 'utf-8');
|
||||||
|
|
||||||
|
// Substitute variables
|
||||||
|
const rendered = templateContent
|
||||||
|
.replace(/\{\{userId\}\}/g, spec.userId)
|
||||||
|
.replace(/\{\{deploymentName\}\}/g, deploymentName)
|
||||||
|
.replace(/\{\{serviceName\}\}/g, serviceName)
|
||||||
|
.replace(/\{\{pvcName\}\}/g, pvcName)
|
||||||
|
.replace(/\{\{agentImage\}\}/g, spec.agentImage)
|
||||||
|
.replace(/\{\{sidecarImage\}\}/g, spec.sidecarImage)
|
||||||
|
.replace(/\{\{storageClass\}\}/g, spec.storageClass);
|
||||||
|
|
||||||
|
// Parse YAML documents (deployment, pvc, service)
|
||||||
|
const documents = yaml.loadAll(rendered) as any[];
|
||||||
|
|
||||||
|
// Apply each resource
|
||||||
|
for (const doc of documents) {
|
||||||
|
if (!doc || !doc.kind) continue;
|
||||||
|
|
||||||
|
try {
|
||||||
|
switch (doc.kind) {
|
||||||
|
case 'Deployment':
|
||||||
|
await this.appsApi.createNamespacedDeployment(this.config.namespace, doc);
|
||||||
|
this.config.logger.info({ deploymentName }, 'Created deployment');
|
||||||
|
break;
|
||||||
|
|
||||||
|
case 'PersistentVolumeClaim':
|
||||||
|
await this.coreApi.createNamespacedPersistentVolumeClaim(
|
||||||
|
this.config.namespace,
|
||||||
|
doc
|
||||||
|
);
|
||||||
|
this.config.logger.info({ pvcName }, 'Created PVC');
|
||||||
|
break;
|
||||||
|
|
||||||
|
case 'Service':
|
||||||
|
await this.coreApi.createNamespacedService(this.config.namespace, doc);
|
||||||
|
this.config.logger.info({ serviceName }, 'Created service');
|
||||||
|
break;
|
||||||
|
|
||||||
|
default:
|
||||||
|
this.config.logger.warn({ kind: doc.kind }, 'Unknown resource kind in template');
|
||||||
|
}
|
||||||
|
} catch (error: any) {
|
||||||
|
// If resource already exists, log warning but continue
|
||||||
|
if (error.response?.statusCode === 409) {
|
||||||
|
this.config.logger.warn(
|
||||||
|
{ kind: doc.kind, name: doc.metadata?.name },
|
||||||
|
'Resource already exists, skipping'
|
||||||
|
);
|
||||||
|
} else {
|
||||||
|
throw error;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
this.config.logger.info({ deploymentName }, 'Agent deployment created successfully');
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Wait for deployment to be ready
|
||||||
|
*/
|
||||||
|
async waitForDeploymentReady(
|
||||||
|
deploymentName: string,
|
||||||
|
timeoutMs: number = 120000
|
||||||
|
): Promise<boolean> {
|
||||||
|
const startTime = Date.now();
|
||||||
|
const pollInterval = 2000; // 2 seconds
|
||||||
|
|
||||||
|
this.config.logger.info(
|
||||||
|
{ deploymentName, timeoutMs },
|
||||||
|
'Waiting for deployment to be ready'
|
||||||
|
);
|
||||||
|
|
||||||
|
while (Date.now() - startTime < timeoutMs) {
|
||||||
|
try {
|
||||||
|
const response = await this.appsApi.readNamespacedDeployment(
|
||||||
|
deploymentName,
|
||||||
|
this.config.namespace
|
||||||
|
);
|
||||||
|
|
||||||
|
const deployment = response.body;
|
||||||
|
const status = deployment.status;
|
||||||
|
|
||||||
|
// Check if deployment is ready
|
||||||
|
if (
|
||||||
|
status?.availableReplicas &&
|
||||||
|
status.availableReplicas > 0 &&
|
||||||
|
status.readyReplicas &&
|
||||||
|
status.readyReplicas > 0
|
||||||
|
) {
|
||||||
|
this.config.logger.info({ deploymentName }, 'Deployment is ready');
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check for failure conditions
|
||||||
|
if (status?.conditions) {
|
||||||
|
const failedCondition = status.conditions.find(
|
||||||
|
(c) => c.type === 'Progressing' && c.status === 'False'
|
||||||
|
);
|
||||||
|
if (failedCondition) {
|
||||||
|
this.config.logger.error(
|
||||||
|
{ deploymentName, reason: failedCondition.reason, message: failedCondition.message },
|
||||||
|
'Deployment failed to progress'
|
||||||
|
);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
this.config.logger.debug(
|
||||||
|
{
|
||||||
|
deploymentName,
|
||||||
|
replicas: status?.replicas,
|
||||||
|
ready: status?.readyReplicas,
|
||||||
|
available: status?.availableReplicas,
|
||||||
|
},
|
||||||
|
'Deployment not ready yet, waiting...'
|
||||||
|
);
|
||||||
|
|
||||||
|
await new Promise((resolve) => setTimeout(resolve, pollInterval));
|
||||||
|
} catch (error: any) {
|
||||||
|
if (error.response?.statusCode === 404) {
|
||||||
|
this.config.logger.warn({ deploymentName }, 'Deployment not found');
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
throw error;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
this.config.logger.warn({ deploymentName, timeoutMs }, 'Deployment readiness timeout');
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get service endpoint URL
|
||||||
|
*/
|
||||||
|
async getServiceEndpoint(serviceName: string): Promise<string | null> {
|
||||||
|
try {
|
||||||
|
const response = await this.coreApi.readNamespacedService(
|
||||||
|
serviceName,
|
||||||
|
this.config.namespace
|
||||||
|
);
|
||||||
|
|
||||||
|
const service = response.body;
|
||||||
|
|
||||||
|
// For ClusterIP services, return internal DNS name
|
||||||
|
if (service.spec?.type === 'ClusterIP') {
|
||||||
|
const port = service.spec.ports?.find((p) => p.name === 'mcp')?.port || 3000;
|
||||||
|
return `http://${serviceName}.${this.config.namespace}.svc.cluster.local:${port}`;
|
||||||
|
}
|
||||||
|
|
||||||
|
// For other service types (NodePort, LoadBalancer), would need different logic
|
||||||
|
this.config.logger.warn(
|
||||||
|
{ serviceName, type: service.spec?.type },
|
||||||
|
'Unexpected service type'
|
||||||
|
);
|
||||||
|
return null;
|
||||||
|
} catch (error: any) {
|
||||||
|
if (error.response?.statusCode === 404) {
|
||||||
|
this.config.logger.warn({ serviceName }, 'Service not found');
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
throw error;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Delete deployment and associated resources
|
||||||
|
* (Used for cleanup/testing - normally handled by lifecycle sidecar)
|
||||||
|
*/
|
||||||
|
async deleteAgentDeployment(userId: string): Promise<void> {
|
||||||
|
const deploymentName = KubernetesClient.getDeploymentName(userId);
|
||||||
|
const serviceName = KubernetesClient.getServiceName(userId);
|
||||||
|
const pvcName = KubernetesClient.getPvcName(userId);
|
||||||
|
|
||||||
|
this.config.logger.info({ userId, deploymentName }, 'Deleting agent deployment');
|
||||||
|
|
||||||
|
// Delete deployment
|
||||||
|
try {
|
||||||
|
await this.appsApi.deleteNamespacedDeployment(deploymentName, this.config.namespace);
|
||||||
|
this.config.logger.info({ deploymentName }, 'Deleted deployment');
|
||||||
|
} catch (error: any) {
|
||||||
|
if (error.response?.statusCode !== 404) {
|
||||||
|
this.config.logger.warn({ deploymentName, error }, 'Failed to delete deployment');
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Delete service
|
||||||
|
try {
|
||||||
|
await this.coreApi.deleteNamespacedService(serviceName, this.config.namespace);
|
||||||
|
this.config.logger.info({ serviceName }, 'Deleted service');
|
||||||
|
} catch (error: any) {
|
||||||
|
if (error.response?.statusCode !== 404) {
|
||||||
|
this.config.logger.warn({ serviceName, error }, 'Failed to delete service');
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Delete PVC
|
||||||
|
try {
|
||||||
|
await this.coreApi.deleteNamespacedPersistentVolumeClaim(pvcName, this.config.namespace);
|
||||||
|
this.config.logger.info({ pvcName }, 'Deleted PVC');
|
||||||
|
} catch (error: any) {
|
||||||
|
if (error.response?.statusCode !== 404) {
|
||||||
|
this.config.logger.warn({ pvcName, error }, 'Failed to delete PVC');
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
gateway/src/k8s/container-manager.ts — new file (118 lines)
@@ -0,0 +1,118 @@
|
|||||||
|
import type { FastifyBaseLogger } from 'fastify';
|
||||||
|
import { KubernetesClient, type DeploymentSpec } from './client.js';
|
||||||
|
import type { UserLicense } from '../types/user.js';
|
||||||
|
|
||||||
|
// Dependencies and settings for ContainerManager.
export interface ContainerManagerConfig {
  // Kubernetes wrapper used for all cluster operations.
  k8sClient: KubernetesClient;
  // Agent image passed through to the deployment template.
  agentImage: string;
  // Lifecycle sidecar image passed through to the deployment template.
  sidecarImage: string;
  // StorageClass for the agent's PVC.
  storageClass: string;
  // Namespace used to compute the MCP endpoint DNS name.
  namespace: string;
  // Logger for lifecycle diagnostics.
  logger: FastifyBaseLogger;
}
|
||||||
|
|
||||||
|
// Snapshot of a user's agent container state.
export interface ContainerStatus {
  // Whether the Deployment exists at all.
  exists: boolean;
  // Whether at least one replica is ready (always false when exists is false).
  ready: boolean;
  // MCP endpoint URL, computed from the naming convention; reported even
  // when the deployment does not exist yet.
  mcpEndpoint: string;
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Container manager orchestrates agent container lifecycle
|
||||||
|
*/
|
||||||
|
export class ContainerManager {
|
||||||
|
private config: ContainerManagerConfig;
|
||||||
|
|
||||||
|
constructor(config: ContainerManagerConfig) {
|
||||||
|
this.config = config;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Ensure user's container is running and ready
|
||||||
|
* Returns the MCP endpoint URL
|
||||||
|
*/
|
||||||
|
async ensureContainerRunning(
|
||||||
|
userId: string,
|
||||||
|
license: UserLicense
|
||||||
|
): Promise<{ mcpEndpoint: string; wasCreated: boolean }> {
|
||||||
|
const deploymentName = KubernetesClient.getDeploymentName(userId);
|
||||||
|
const mcpEndpoint = KubernetesClient.getMcpEndpoint(userId, this.config.namespace);
|
||||||
|
|
||||||
|
this.config.logger.info(
|
||||||
|
{ userId, licenseType: license.licenseType, deploymentName },
|
||||||
|
'Ensuring container is running'
|
||||||
|
);
|
||||||
|
|
||||||
|
// Check if deployment already exists
|
||||||
|
const exists = await this.config.k8sClient.deploymentExists(deploymentName);
|
||||||
|
|
||||||
|
if (exists) {
|
||||||
|
this.config.logger.info({ userId, deploymentName }, 'Container deployment already exists');
|
||||||
|
|
||||||
|
// Wait for it to be ready (in case it's starting up)
|
||||||
|
const ready = await this.config.k8sClient.waitForDeploymentReady(deploymentName, 30000);
|
||||||
|
|
||||||
|
if (!ready) {
|
||||||
|
this.config.logger.warn(
|
||||||
|
{ userId, deploymentName },
|
||||||
|
'Existing deployment not ready within timeout'
|
||||||
|
);
|
||||||
|
// Continue anyway - might be an image pull or other transient issue
|
||||||
|
}
|
||||||
|
|
||||||
|
return { mcpEndpoint, wasCreated: false };
|
||||||
|
}
|
||||||
|
|
||||||
|
// Create new deployment
|
||||||
|
this.config.logger.info({ userId, licenseType: license.licenseType }, 'Creating new container');
|
||||||
|
|
||||||
|
const spec: DeploymentSpec = {
|
||||||
|
userId,
|
||||||
|
licenseType: license.licenseType,
|
||||||
|
agentImage: this.config.agentImage,
|
||||||
|
sidecarImage: this.config.sidecarImage,
|
||||||
|
storageClass: this.config.storageClass,
|
||||||
|
};
|
||||||
|
|
||||||
|
await this.config.k8sClient.createAgentDeployment(spec);
|
||||||
|
|
||||||
|
// Wait for deployment to be ready
|
||||||
|
const ready = await this.config.k8sClient.waitForDeploymentReady(deploymentName, 120000);
|
||||||
|
|
||||||
|
if (!ready) {
|
||||||
|
throw new Error(
|
||||||
|
`Container deployment failed to become ready within timeout: ${deploymentName}`
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
this.config.logger.info({ userId, mcpEndpoint }, 'Container is ready');
|
||||||
|
|
||||||
|
return { mcpEndpoint, wasCreated: true };
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Check container status without creating it
|
||||||
|
*/
|
||||||
|
async getContainerStatus(userId: string): Promise<ContainerStatus> {
|
||||||
|
const deploymentName = KubernetesClient.getDeploymentName(userId);
|
||||||
|
const mcpEndpoint = KubernetesClient.getMcpEndpoint(userId, this.config.namespace);
|
||||||
|
|
||||||
|
const exists = await this.config.k8sClient.deploymentExists(deploymentName);
|
||||||
|
|
||||||
|
if (!exists) {
|
||||||
|
return { exists: false, ready: false, mcpEndpoint };
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check if ready (with short timeout)
|
||||||
|
const ready = await this.config.k8sClient.waitForDeploymentReady(deploymentName, 5000);
|
||||||
|
|
||||||
|
return { exists: true, ready, mcpEndpoint };
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Delete container (for cleanup/testing)
|
||||||
|
*/
|
||||||
|
async deleteContainer(userId: string): Promise<void> {
|
||||||
|
await this.config.k8sClient.deleteAgentDeployment(userId);
|
||||||
|
}
|
||||||
|
}
|
||||||
gateway/src/k8s/templates/enterprise-tier.yaml — new file (199 lines)
@@ -0,0 +1,199 @@
|
|||||||
|
# Enterprise tier agent deployment template
# Variables: {{userId}}, {{deploymentName}}, {{pvcName}}, {{serviceName}}
# Enterprise: No idle shutdown, larger resources
#
# Template variables are rendered by plain string substitution, so every
# {{...}} used as a YAML *value* is quoted: an unquoted numeric- or
# boolean-looking userId (e.g. 12345) would be parsed as a non-string and
# rejected by the API server, which requires strings for label values and
# env values.
#
# NOTE(review): namespace is hard-coded to dexorder-agents here, while the
# client applies resources into its configured namespace — confirm the two
# always agree, or drop the namespace fields from this template.
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: "{{deploymentName}}"
  namespace: dexorder-agents
  labels:
    app.kubernetes.io/name: agent
    app.kubernetes.io/component: user-agent
    dexorder.io/component: agent
    dexorder.io/user-id: "{{userId}}"
    dexorder.io/deployment: "{{deploymentName}}"
    dexorder.io/license-tier: enterprise
spec:
  replicas: 1
  # Recreate avoids a rollout deadlock: the PVC below is ReadWriteOnce, so a
  # rolling update's replacement pod could never attach the volume while the
  # old pod still holds it.
  strategy:
    type: Recreate
  selector:
    matchLabels:
      dexorder.io/user-id: "{{userId}}"
  template:
    metadata:
      labels:
        dexorder.io/component: agent
        dexorder.io/user-id: "{{userId}}"
        dexorder.io/deployment: "{{deploymentName}}"
        dexorder.io/license-tier: enterprise
    spec:
      serviceAccountName: agent-lifecycle
      # Shared PID namespace lets the sidecar observe the agent process.
      shareProcessNamespace: true

      securityContext:
        runAsNonRoot: true
        runAsUser: 1000
        fsGroup: 1000
        seccompProfile:
          type: RuntimeDefault

      containers:
        - name: agent
          image: "{{agentImage}}"
          imagePullPolicy: Always

          securityContext:
            allowPrivilegeEscalation: false
            runAsNonRoot: true
            runAsUser: 1000
            readOnlyRootFilesystem: true
            capabilities:
              drop:
                - ALL

          resources:
            requests:
              memory: "1Gi"
              cpu: "500m"
            limits:
              memory: "4Gi"
              cpu: "4000m"

          env:
            - name: USER_ID
              value: "{{userId}}"
            - name: IDLE_TIMEOUT_MINUTES
              value: "0"
            - name: IDLE_CHECK_INTERVAL_SECONDS
              value: "60"
            # Enterprise tier never idles out.
            - name: ENABLE_IDLE_SHUTDOWN
              value: "false"
            - name: MCP_SERVER_PORT
              value: "3000"
            - name: ZMQ_CONTROL_PORT
              value: "5555"

          ports:
            - name: mcp
              containerPort: 3000
              protocol: TCP
            - name: zmq-control
              containerPort: 5555
              protocol: TCP

          volumeMounts:
            - name: agent-data
              mountPath: /app/data
            - name: tmp
              mountPath: /tmp
            - name: shared-run
              mountPath: /var/run/agent

          livenessProbe:
            httpGet:
              path: /health
              port: mcp
            initialDelaySeconds: 10
            periodSeconds: 30
            timeoutSeconds: 5

          readinessProbe:
            httpGet:
              path: /ready
              port: mcp
            initialDelaySeconds: 5
            periodSeconds: 10

        - name: lifecycle-sidecar
          image: "{{sidecarImage}}"
          imagePullPolicy: Always

          securityContext:
            allowPrivilegeEscalation: false
            runAsNonRoot: true
            runAsUser: 1000
            readOnlyRootFilesystem: true
            capabilities:
              drop:
                - ALL

          resources:
            requests:
              memory: "32Mi"
              cpu: "10m"
            limits:
              memory: "64Mi"
              cpu: "50m"

          env:
            - name: NAMESPACE
              valueFrom:
                fieldRef:
                  fieldPath: metadata.namespace
            - name: DEPLOYMENT_NAME
              valueFrom:
                fieldRef:
                  fieldPath: metadata.labels['dexorder.io/deployment']
            - name: USER_TYPE
              value: "enterprise"
            - name: MAIN_CONTAINER_PID
              value: "1"

          volumeMounts:
            - name: shared-run
              mountPath: /var/run/agent
              readOnly: true

      volumes:
        - name: agent-data
          persistentVolumeClaim:
            claimName: "{{pvcName}}"
        - name: tmp
          emptyDir:
            medium: Memory
            sizeLimit: 512Mi
        - name: shared-run
          emptyDir:
            medium: Memory
            sizeLimit: 1Mi

      restartPolicy: Always
      terminationGracePeriodSeconds: 30
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: "{{pvcName}}"
  namespace: dexorder-agents
  labels:
    dexorder.io/user-id: "{{userId}}"
    dexorder.io/license-tier: enterprise
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 50Gi
  storageClassName: "{{storageClass}}"
---
apiVersion: v1
kind: Service
metadata:
  name: "{{serviceName}}"
  namespace: dexorder-agents
  labels:
    dexorder.io/user-id: "{{userId}}"
    dexorder.io/license-tier: enterprise
spec:
  type: ClusterIP
  selector:
    dexorder.io/user-id: "{{userId}}"
  ports:
    - name: mcp
      port: 3000
      targetPort: mcp
      protocol: TCP
    - name: zmq-control
      port: 5555
      targetPort: zmq-control
      protocol: TCP
|
||||||
gateway/src/k8s/templates/free-tier.yaml — new file (198 lines)
@@ -0,0 +1,198 @@
|
|||||||
|
# Free tier agent deployment template
|
||||||
|
# Variables: {{userId}}, {{deploymentName}}, {{pvcName}}, {{serviceName}}
|
||||||
|
---
|
||||||
|
apiVersion: apps/v1
|
||||||
|
kind: Deployment
|
||||||
|
metadata:
|
||||||
|
name: {{deploymentName}}
|
||||||
|
namespace: dexorder-agents
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/name: agent
|
||||||
|
app.kubernetes.io/component: user-agent
|
||||||
|
dexorder.io/component: agent
|
||||||
|
dexorder.io/user-id: {{userId}}
|
||||||
|
dexorder.io/deployment: {{deploymentName}}
|
||||||
|
dexorder.io/license-tier: free
|
||||||
|
spec:
|
||||||
|
replicas: 1
|
||||||
|
selector:
|
||||||
|
matchLabels:
|
||||||
|
dexorder.io/user-id: {{userId}}
|
||||||
|
template:
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
dexorder.io/component: agent
|
||||||
|
dexorder.io/user-id: {{userId}}
|
||||||
|
dexorder.io/deployment: {{deploymentName}}
|
||||||
|
dexorder.io/license-tier: free
|
||||||
|
spec:
|
||||||
|
serviceAccountName: agent-lifecycle
|
||||||
|
shareProcessNamespace: true
|
||||||
|
|
||||||
|
securityContext:
|
||||||
|
runAsNonRoot: true
|
||||||
|
runAsUser: 1000
|
||||||
|
fsGroup: 1000
|
||||||
|
seccompProfile:
|
||||||
|
type: RuntimeDefault
|
||||||
|
|
||||||
|
containers:
|
||||||
|
- name: agent
|
||||||
|
image: {{agentImage}}
|
||||||
|
imagePullPolicy: Always
|
||||||
|
|
||||||
|
securityContext:
|
||||||
|
allowPrivilegeEscalation: false
|
||||||
|
runAsNonRoot: true
|
||||||
|
runAsUser: 1000
|
||||||
|
readOnlyRootFilesystem: true
|
||||||
|
capabilities:
|
||||||
|
drop:
|
||||||
|
- ALL
|
||||||
|
|
||||||
|
resources:
|
||||||
|
requests:
|
||||||
|
memory: "256Mi"
|
||||||
|
cpu: "100m"
|
||||||
|
limits:
|
||||||
|
memory: "512Mi"
|
||||||
|
cpu: "500m"
|
||||||
|
|
||||||
|
env:
|
||||||
|
- name: USER_ID
|
||||||
|
value: {{userId}}
|
||||||
|
- name: IDLE_TIMEOUT_MINUTES
|
||||||
|
value: "15"
|
||||||
|
- name: IDLE_CHECK_INTERVAL_SECONDS
|
||||||
|
value: "60"
|
||||||
|
- name: ENABLE_IDLE_SHUTDOWN
|
||||||
|
value: "true"
|
||||||
|
- name: MCP_SERVER_PORT
|
||||||
|
value: "3000"
|
||||||
|
- name: ZMQ_CONTROL_PORT
|
||||||
|
value: "5555"
|
||||||
|
|
||||||
|
ports:
|
||||||
|
- name: mcp
|
||||||
|
containerPort: 3000
|
||||||
|
protocol: TCP
|
||||||
|
- name: zmq-control
|
||||||
|
containerPort: 5555
|
||||||
|
protocol: TCP
|
||||||
|
|
||||||
|
volumeMounts:
|
||||||
|
- name: agent-data
|
||||||
|
mountPath: /app/data
|
||||||
|
- name: tmp
|
||||||
|
mountPath: /tmp
|
||||||
|
- name: shared-run
|
||||||
|
mountPath: /var/run/agent
|
||||||
|
|
||||||
|
livenessProbe:
|
||||||
|
httpGet:
|
||||||
|
path: /health
|
||||||
|
port: mcp
|
||||||
|
initialDelaySeconds: 10
|
||||||
|
periodSeconds: 30
|
||||||
|
timeoutSeconds: 5
|
||||||
|
|
||||||
|
readinessProbe:
|
||||||
|
httpGet:
|
||||||
|
path: /ready
|
||||||
|
port: mcp
|
||||||
|
initialDelaySeconds: 5
|
||||||
|
periodSeconds: 10
|
||||||
|
|
||||||
|
- name: lifecycle-sidecar
|
||||||
|
image: {{sidecarImage}}
|
||||||
|
imagePullPolicy: Always
|
||||||
|
|
||||||
|
securityContext:
|
||||||
|
allowPrivilegeEscalation: false
|
||||||
|
runAsNonRoot: true
|
||||||
|
runAsUser: 1000
|
||||||
|
readOnlyRootFilesystem: true
|
||||||
|
capabilities:
|
||||||
|
drop:
|
||||||
|
- ALL
|
||||||
|
|
||||||
|
resources:
|
||||||
|
requests:
|
||||||
|
memory: "32Mi"
|
||||||
|
cpu: "10m"
|
||||||
|
limits:
|
||||||
|
memory: "64Mi"
|
||||||
|
cpu: "50m"
|
||||||
|
|
||||||
|
env:
|
||||||
|
- name: NAMESPACE
|
||||||
|
valueFrom:
|
||||||
|
fieldRef:
|
||||||
|
fieldPath: metadata.namespace
|
||||||
|
- name: DEPLOYMENT_NAME
|
||||||
|
valueFrom:
|
||||||
|
fieldRef:
|
||||||
|
fieldPath: metadata.labels['dexorder.io/deployment']
|
||||||
|
- name: USER_TYPE
|
||||||
|
value: "free"
|
||||||
|
- name: MAIN_CONTAINER_PID
|
||||||
|
value: "1"
|
||||||
|
|
||||||
|
volumeMounts:
|
||||||
|
- name: shared-run
|
||||||
|
mountPath: /var/run/agent
|
||||||
|
readOnly: true
|
||||||
|
|
||||||
|
volumes:
|
||||||
|
- name: agent-data
|
||||||
|
persistentVolumeClaim:
|
||||||
|
claimName: {{pvcName}}
|
||||||
|
- name: tmp
|
||||||
|
emptyDir:
|
||||||
|
medium: Memory
|
||||||
|
sizeLimit: 128Mi
|
||||||
|
- name: shared-run
|
||||||
|
emptyDir:
|
||||||
|
medium: Memory
|
||||||
|
sizeLimit: 1Mi
|
||||||
|
|
||||||
|
restartPolicy: Always
|
||||||
|
terminationGracePeriodSeconds: 30
|
||||||
|
---
|
||||||
|
apiVersion: v1
|
||||||
|
kind: PersistentVolumeClaim
|
||||||
|
metadata:
|
||||||
|
name: {{pvcName}}
|
||||||
|
namespace: dexorder-agents
|
||||||
|
labels:
|
||||||
|
dexorder.io/user-id: {{userId}}
|
||||||
|
dexorder.io/license-tier: free
|
||||||
|
spec:
|
||||||
|
accessModes:
|
||||||
|
- ReadWriteOnce
|
||||||
|
resources:
|
||||||
|
requests:
|
||||||
|
storage: 1Gi
|
||||||
|
storageClassName: {{storageClass}}
|
||||||
|
---
|
||||||
|
apiVersion: v1
|
||||||
|
kind: Service
|
||||||
|
metadata:
|
||||||
|
name: {{serviceName}}
|
||||||
|
namespace: dexorder-agents
|
||||||
|
labels:
|
||||||
|
dexorder.io/user-id: {{userId}}
|
||||||
|
dexorder.io/license-tier: free
|
||||||
|
spec:
|
||||||
|
type: ClusterIP
|
||||||
|
selector:
|
||||||
|
dexorder.io/user-id: {{userId}}
|
||||||
|
ports:
|
||||||
|
- name: mcp
|
||||||
|
port: 3000
|
||||||
|
targetPort: mcp
|
||||||
|
protocol: TCP
|
||||||
|
- name: zmq-control
|
||||||
|
port: 5555
|
||||||
|
targetPort: zmq-control
|
||||||
|
protocol: TCP
|
||||||
198
gateway/src/k8s/templates/pro-tier.yaml
Normal file
198
gateway/src/k8s/templates/pro-tier.yaml
Normal file
@@ -0,0 +1,198 @@
|
|||||||
|
# Pro tier agent deployment template
|
||||||
|
# Variables: {{userId}}, {{deploymentName}}, {{pvcName}}, {{serviceName}}
|
||||||
|
---
|
||||||
|
apiVersion: apps/v1
|
||||||
|
kind: Deployment
|
||||||
|
metadata:
|
||||||
|
name: {{deploymentName}}
|
||||||
|
namespace: dexorder-agents
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/name: agent
|
||||||
|
app.kubernetes.io/component: user-agent
|
||||||
|
dexorder.io/component: agent
|
||||||
|
dexorder.io/user-id: {{userId}}
|
||||||
|
dexorder.io/deployment: {{deploymentName}}
|
||||||
|
dexorder.io/license-tier: pro
|
||||||
|
spec:
|
||||||
|
replicas: 1
|
||||||
|
selector:
|
||||||
|
matchLabels:
|
||||||
|
dexorder.io/user-id: {{userId}}
|
||||||
|
template:
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
dexorder.io/component: agent
|
||||||
|
dexorder.io/user-id: {{userId}}
|
||||||
|
dexorder.io/deployment: {{deploymentName}}
|
||||||
|
dexorder.io/license-tier: pro
|
||||||
|
spec:
|
||||||
|
serviceAccountName: agent-lifecycle
|
||||||
|
shareProcessNamespace: true
|
||||||
|
|
||||||
|
securityContext:
|
||||||
|
runAsNonRoot: true
|
||||||
|
runAsUser: 1000
|
||||||
|
fsGroup: 1000
|
||||||
|
seccompProfile:
|
||||||
|
type: RuntimeDefault
|
||||||
|
|
||||||
|
containers:
|
||||||
|
- name: agent
|
||||||
|
image: {{agentImage}}
|
||||||
|
imagePullPolicy: Always
|
||||||
|
|
||||||
|
securityContext:
|
||||||
|
allowPrivilegeEscalation: false
|
||||||
|
runAsNonRoot: true
|
||||||
|
runAsUser: 1000
|
||||||
|
readOnlyRootFilesystem: true
|
||||||
|
capabilities:
|
||||||
|
drop:
|
||||||
|
- ALL
|
||||||
|
|
||||||
|
resources:
|
||||||
|
requests:
|
||||||
|
memory: "512Mi"
|
||||||
|
cpu: "250m"
|
||||||
|
limits:
|
||||||
|
memory: "2Gi"
|
||||||
|
cpu: "2000m"
|
||||||
|
|
||||||
|
env:
|
||||||
|
- name: USER_ID
|
||||||
|
value: {{userId}}
|
||||||
|
- name: IDLE_TIMEOUT_MINUTES
|
||||||
|
value: "60"
|
||||||
|
- name: IDLE_CHECK_INTERVAL_SECONDS
|
||||||
|
value: "60"
|
||||||
|
- name: ENABLE_IDLE_SHUTDOWN
|
||||||
|
value: "true"
|
||||||
|
- name: MCP_SERVER_PORT
|
||||||
|
value: "3000"
|
||||||
|
- name: ZMQ_CONTROL_PORT
|
||||||
|
value: "5555"
|
||||||
|
|
||||||
|
ports:
|
||||||
|
- name: mcp
|
||||||
|
containerPort: 3000
|
||||||
|
protocol: TCP
|
||||||
|
- name: zmq-control
|
||||||
|
containerPort: 5555
|
||||||
|
protocol: TCP
|
||||||
|
|
||||||
|
volumeMounts:
|
||||||
|
- name: agent-data
|
||||||
|
mountPath: /app/data
|
||||||
|
- name: tmp
|
||||||
|
mountPath: /tmp
|
||||||
|
- name: shared-run
|
||||||
|
mountPath: /var/run/agent
|
||||||
|
|
||||||
|
livenessProbe:
|
||||||
|
httpGet:
|
||||||
|
path: /health
|
||||||
|
port: mcp
|
||||||
|
initialDelaySeconds: 10
|
||||||
|
periodSeconds: 30
|
||||||
|
timeoutSeconds: 5
|
||||||
|
|
||||||
|
readinessProbe:
|
||||||
|
httpGet:
|
||||||
|
path: /ready
|
||||||
|
port: mcp
|
||||||
|
initialDelaySeconds: 5
|
||||||
|
periodSeconds: 10
|
||||||
|
|
||||||
|
- name: lifecycle-sidecar
|
||||||
|
image: {{sidecarImage}}
|
||||||
|
imagePullPolicy: Always
|
||||||
|
|
||||||
|
securityContext:
|
||||||
|
allowPrivilegeEscalation: false
|
||||||
|
runAsNonRoot: true
|
||||||
|
runAsUser: 1000
|
||||||
|
readOnlyRootFilesystem: true
|
||||||
|
capabilities:
|
||||||
|
drop:
|
||||||
|
- ALL
|
||||||
|
|
||||||
|
resources:
|
||||||
|
requests:
|
||||||
|
memory: "32Mi"
|
||||||
|
cpu: "10m"
|
||||||
|
limits:
|
||||||
|
memory: "64Mi"
|
||||||
|
cpu: "50m"
|
||||||
|
|
||||||
|
env:
|
||||||
|
- name: NAMESPACE
|
||||||
|
valueFrom:
|
||||||
|
fieldRef:
|
||||||
|
fieldPath: metadata.namespace
|
||||||
|
- name: DEPLOYMENT_NAME
|
||||||
|
valueFrom:
|
||||||
|
fieldRef:
|
||||||
|
fieldPath: metadata.labels['dexorder.io/deployment']
|
||||||
|
- name: USER_TYPE
|
||||||
|
value: "pro"
|
||||||
|
- name: MAIN_CONTAINER_PID
|
||||||
|
value: "1"
|
||||||
|
|
||||||
|
volumeMounts:
|
||||||
|
- name: shared-run
|
||||||
|
mountPath: /var/run/agent
|
||||||
|
readOnly: true
|
||||||
|
|
||||||
|
volumes:
|
||||||
|
- name: agent-data
|
||||||
|
persistentVolumeClaim:
|
||||||
|
claimName: {{pvcName}}
|
||||||
|
- name: tmp
|
||||||
|
emptyDir:
|
||||||
|
medium: Memory
|
||||||
|
sizeLimit: 256Mi
|
||||||
|
- name: shared-run
|
||||||
|
emptyDir:
|
||||||
|
medium: Memory
|
||||||
|
sizeLimit: 1Mi
|
||||||
|
|
||||||
|
restartPolicy: Always
|
||||||
|
terminationGracePeriodSeconds: 30
|
||||||
|
---
|
||||||
|
apiVersion: v1
|
||||||
|
kind: PersistentVolumeClaim
|
||||||
|
metadata:
|
||||||
|
name: {{pvcName}}
|
||||||
|
namespace: dexorder-agents
|
||||||
|
labels:
|
||||||
|
dexorder.io/user-id: {{userId}}
|
||||||
|
dexorder.io/license-tier: pro
|
||||||
|
spec:
|
||||||
|
accessModes:
|
||||||
|
- ReadWriteOnce
|
||||||
|
resources:
|
||||||
|
requests:
|
||||||
|
storage: 10Gi
|
||||||
|
storageClassName: {{storageClass}}
|
||||||
|
---
|
||||||
|
apiVersion: v1
|
||||||
|
kind: Service
|
||||||
|
metadata:
|
||||||
|
name: {{serviceName}}
|
||||||
|
namespace: dexorder-agents
|
||||||
|
labels:
|
||||||
|
dexorder.io/user-id: {{userId}}
|
||||||
|
dexorder.io/license-tier: pro
|
||||||
|
spec:
|
||||||
|
type: ClusterIP
|
||||||
|
selector:
|
||||||
|
dexorder.io/user-id: {{userId}}
|
||||||
|
ports:
|
||||||
|
- name: mcp
|
||||||
|
port: 3000
|
||||||
|
targetPort: mcp
|
||||||
|
protocol: TCP
|
||||||
|
- name: zmq-control
|
||||||
|
port: 5555
|
||||||
|
targetPort: zmq-control
|
||||||
|
protocol: TCP
|
||||||
216
gateway/src/llm/provider.ts
Normal file
216
gateway/src/llm/provider.ts
Normal file
@@ -0,0 +1,216 @@
|
|||||||
|
import type { BaseChatModel } from '@langchain/core/language_models/chat_models';
|
||||||
|
import { ChatAnthropic } from '@langchain/anthropic';
|
||||||
|
import { ChatOpenAI } from '@langchain/openai';
|
||||||
|
import { ChatGoogleGenerativeAI } from '@langchain/google-genai';
|
||||||
|
import { ChatOpenRouter } from '@langchain/openrouter';
|
||||||
|
import type { FastifyBaseLogger } from 'fastify';
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Supported LLM providers
|
||||||
|
*/
|
||||||
|
export enum LLMProvider {
|
||||||
|
ANTHROPIC = 'anthropic',
|
||||||
|
OPENAI = 'openai',
|
||||||
|
GOOGLE = 'google',
|
||||||
|
OPENROUTER = 'openrouter',
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Model configuration
|
||||||
|
*/
|
||||||
|
export interface ModelConfig {
|
||||||
|
provider: LLMProvider;
|
||||||
|
model: string;
|
||||||
|
temperature?: number;
|
||||||
|
maxTokens?: number;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Provider configuration with API keys
|
||||||
|
*/
|
||||||
|
export interface ProviderConfig {
|
||||||
|
anthropicApiKey?: string;
|
||||||
|
openaiApiKey?: string;
|
||||||
|
googleApiKey?: string;
|
||||||
|
openrouterApiKey?: string;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* LLM Provider factory
|
||||||
|
* Creates model instances with unified interface across providers
|
||||||
|
*/
|
||||||
|
export class LLMProviderFactory {
|
||||||
|
private config: ProviderConfig;
|
||||||
|
private logger: FastifyBaseLogger;
|
||||||
|
|
||||||
|
constructor(config: ProviderConfig, logger: FastifyBaseLogger) {
|
||||||
|
this.config = config;
|
||||||
|
this.logger = logger;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a chat model instance
|
||||||
|
*/
|
||||||
|
createModel(modelConfig: ModelConfig): BaseChatModel {
|
||||||
|
this.logger.debug(
|
||||||
|
{ provider: modelConfig.provider, model: modelConfig.model },
|
||||||
|
'Creating LLM model'
|
||||||
|
);
|
||||||
|
|
||||||
|
switch (modelConfig.provider) {
|
||||||
|
case LLMProvider.ANTHROPIC:
|
||||||
|
return this.createAnthropicModel(modelConfig);
|
||||||
|
|
||||||
|
case LLMProvider.OPENAI:
|
||||||
|
return this.createOpenAIModel(modelConfig);
|
||||||
|
|
||||||
|
case LLMProvider.GOOGLE:
|
||||||
|
return this.createGoogleModel(modelConfig);
|
||||||
|
|
||||||
|
case LLMProvider.OPENROUTER:
|
||||||
|
return this.createOpenRouterModel(modelConfig);
|
||||||
|
|
||||||
|
default:
|
||||||
|
throw new Error(`Unsupported provider: ${modelConfig.provider}`);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create Anthropic Claude model
|
||||||
|
*/
|
||||||
|
private createAnthropicModel(config: ModelConfig): ChatAnthropic {
|
||||||
|
if (!this.config.anthropicApiKey) {
|
||||||
|
throw new Error('Anthropic API key not configured');
|
||||||
|
}
|
||||||
|
|
||||||
|
return new ChatAnthropic({
|
||||||
|
model: config.model,
|
||||||
|
temperature: config.temperature ?? 0.7,
|
||||||
|
maxTokens: config.maxTokens ?? 4096,
|
||||||
|
anthropicApiKey: this.config.anthropicApiKey,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create OpenAI GPT model
|
||||||
|
*/
|
||||||
|
private createOpenAIModel(config: ModelConfig): ChatOpenAI {
|
||||||
|
if (!this.config.openaiApiKey) {
|
||||||
|
throw new Error('OpenAI API key not configured');
|
||||||
|
}
|
||||||
|
|
||||||
|
return new ChatOpenAI({
|
||||||
|
model: config.model,
|
||||||
|
temperature: config.temperature ?? 0.7,
|
||||||
|
maxTokens: config.maxTokens ?? 4096,
|
||||||
|
openAIApiKey: this.config.openaiApiKey,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create Google Gemini model
|
||||||
|
*/
|
||||||
|
private createGoogleModel(config: ModelConfig): ChatGoogleGenerativeAI {
|
||||||
|
if (!this.config.googleApiKey) {
|
||||||
|
throw new Error('Google API key not configured');
|
||||||
|
}
|
||||||
|
|
||||||
|
return new ChatGoogleGenerativeAI({
|
||||||
|
model: config.model,
|
||||||
|
temperature: config.temperature ?? 0.7,
|
||||||
|
maxOutputTokens: config.maxTokens ?? 4096,
|
||||||
|
apiKey: this.config.googleApiKey,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create OpenRouter model (access to 300+ models)
|
||||||
|
*/
|
||||||
|
private createOpenRouterModel(config: ModelConfig): ChatOpenRouter {
|
||||||
|
if (!this.config.openrouterApiKey) {
|
||||||
|
throw new Error('OpenRouter API key not configured');
|
||||||
|
}
|
||||||
|
|
||||||
|
return new ChatOpenRouter({
|
||||||
|
model: config.model,
|
||||||
|
temperature: config.temperature ?? 0.7,
|
||||||
|
maxTokens: config.maxTokens ?? 4096,
|
||||||
|
apiKey: this.config.openrouterApiKey,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get default model based on environment
|
||||||
|
*/
|
||||||
|
getDefaultModel(): ModelConfig {
|
||||||
|
// Check which API keys are available
|
||||||
|
if (this.config.anthropicApiKey) {
|
||||||
|
return {
|
||||||
|
provider: LLMProvider.ANTHROPIC,
|
||||||
|
model: 'claude-3-5-sonnet-20241022',
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
if (this.config.openaiApiKey) {
|
||||||
|
return {
|
||||||
|
provider: LLMProvider.OPENAI,
|
||||||
|
model: 'gpt-4o',
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
if (this.config.googleApiKey) {
|
||||||
|
return {
|
||||||
|
provider: LLMProvider.GOOGLE,
|
||||||
|
model: 'gemini-2.0-flash-exp',
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
if (this.config.openrouterApiKey) {
|
||||||
|
return {
|
||||||
|
provider: LLMProvider.OPENROUTER,
|
||||||
|
model: 'anthropic/claude-3.5-sonnet',
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
throw new Error('No LLM API keys configured');
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Predefined model configurations
|
||||||
|
*/
|
||||||
|
export const MODELS = {
|
||||||
|
// Anthropic
|
||||||
|
CLAUDE_SONNET: {
|
||||||
|
provider: LLMProvider.ANTHROPIC,
|
||||||
|
model: 'claude-3-5-sonnet-20241022',
|
||||||
|
},
|
||||||
|
CLAUDE_HAIKU: {
|
||||||
|
provider: LLMProvider.ANTHROPIC,
|
||||||
|
model: 'claude-3-5-haiku-20241022',
|
||||||
|
},
|
||||||
|
CLAUDE_OPUS: {
|
||||||
|
provider: LLMProvider.ANTHROPIC,
|
||||||
|
model: 'claude-3-opus-20240229',
|
||||||
|
},
|
||||||
|
|
||||||
|
// OpenAI
|
||||||
|
GPT4O: {
|
||||||
|
provider: LLMProvider.OPENAI,
|
||||||
|
model: 'gpt-4o',
|
||||||
|
},
|
||||||
|
GPT4O_MINI: {
|
||||||
|
provider: LLMProvider.OPENAI,
|
||||||
|
model: 'gpt-4o-mini',
|
||||||
|
},
|
||||||
|
|
||||||
|
// Google
|
||||||
|
GEMINI_2_FLASH: {
|
||||||
|
provider: LLMProvider.GOOGLE,
|
||||||
|
model: 'gemini-2.0-flash-exp',
|
||||||
|
},
|
||||||
|
GEMINI_PRO: {
|
||||||
|
provider: LLMProvider.GOOGLE,
|
||||||
|
model: 'gemini-1.5-pro',
|
||||||
|
},
|
||||||
|
} as const satisfies Record<string, ModelConfig>;
|
||||||
202
gateway/src/llm/router.ts
Normal file
202
gateway/src/llm/router.ts
Normal file
@@ -0,0 +1,202 @@
|
|||||||
|
import type { BaseChatModel } from '@langchain/core/language_models/chat_models';
|
||||||
|
import type { FastifyBaseLogger } from 'fastify';
|
||||||
|
import { LLMProviderFactory, type ModelConfig, LLMProvider } from './provider.js';
|
||||||
|
import type { UserLicense } from '../types/user.js';
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Model routing strategies
|
||||||
|
*/
|
||||||
|
export enum RoutingStrategy {
|
||||||
|
/** Use user's preferred model from license */
|
||||||
|
USER_PREFERENCE = 'user_preference',
|
||||||
|
/** Route based on query complexity */
|
||||||
|
COMPLEXITY = 'complexity',
|
||||||
|
/** Route based on license tier */
|
||||||
|
LICENSE_TIER = 'license_tier',
|
||||||
|
/** Use cheapest available model */
|
||||||
|
COST_OPTIMIZED = 'cost_optimized',
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Model router
|
||||||
|
* Intelligently selects which model to use based on various factors
|
||||||
|
*/
|
||||||
|
export class ModelRouter {
|
||||||
|
private factory: LLMProviderFactory;
|
||||||
|
private logger: FastifyBaseLogger;
|
||||||
|
private defaultModel: ModelConfig;
|
||||||
|
|
||||||
|
constructor(factory: LLMProviderFactory, logger: FastifyBaseLogger) {
|
||||||
|
this.factory = factory;
|
||||||
|
this.logger = logger;
|
||||||
|
this.defaultModel = factory.getDefaultModel();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Route to appropriate model based on context
|
||||||
|
*/
|
||||||
|
async route(
|
||||||
|
message: string,
|
||||||
|
license: UserLicense,
|
||||||
|
strategy: RoutingStrategy = RoutingStrategy.USER_PREFERENCE
|
||||||
|
): Promise<BaseChatModel> {
|
||||||
|
let modelConfig: ModelConfig;
|
||||||
|
|
||||||
|
switch (strategy) {
|
||||||
|
case RoutingStrategy.USER_PREFERENCE:
|
||||||
|
modelConfig = this.routeByUserPreference(license);
|
||||||
|
break;
|
||||||
|
|
||||||
|
case RoutingStrategy.COMPLEXITY:
|
||||||
|
modelConfig = this.routeByComplexity(message, license);
|
||||||
|
break;
|
||||||
|
|
||||||
|
case RoutingStrategy.LICENSE_TIER:
|
||||||
|
modelConfig = this.routeByLicenseTier(license);
|
||||||
|
break;
|
||||||
|
|
||||||
|
case RoutingStrategy.COST_OPTIMIZED:
|
||||||
|
modelConfig = this.routeByCost(license);
|
||||||
|
break;
|
||||||
|
|
||||||
|
default:
|
||||||
|
modelConfig = this.defaultModel;
|
||||||
|
}
|
||||||
|
|
||||||
|
this.logger.info(
|
||||||
|
{
|
||||||
|
userId: license.userId,
|
||||||
|
strategy,
|
||||||
|
provider: modelConfig.provider,
|
||||||
|
model: modelConfig.model,
|
||||||
|
},
|
||||||
|
'Routing to model'
|
||||||
|
);
|
||||||
|
|
||||||
|
return this.factory.createModel(modelConfig);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Route based on user's preferred model (if set in license)
|
||||||
|
*/
|
||||||
|
private routeByUserPreference(license: UserLicense): ModelConfig {
|
||||||
|
// Check if user has custom model preference
|
||||||
|
const preferredModel = (license as any).preferredModel as ModelConfig | undefined;
|
||||||
|
|
||||||
|
if (preferredModel && this.isModelAllowed(preferredModel, license)) {
|
||||||
|
return preferredModel;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fall back to license tier default
|
||||||
|
return this.routeByLicenseTier(license);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Route based on query complexity
|
||||||
|
*/
|
||||||
|
private routeByComplexity(message: string, license: UserLicense): ModelConfig {
|
||||||
|
const isComplex = this.isComplexQuery(message);
|
||||||
|
|
||||||
|
if (license.licenseType === 'enterprise') {
|
||||||
|
// Enterprise users get best models for complex queries
|
||||||
|
return isComplex
|
||||||
|
? { provider: LLMProvider.ANTHROPIC, model: 'claude-3-opus-20240229' }
|
||||||
|
: { provider: LLMProvider.ANTHROPIC, model: 'claude-3-5-sonnet-20241022' };
|
||||||
|
}
|
||||||
|
|
||||||
|
if (license.licenseType === 'pro') {
|
||||||
|
// Pro users get good models
|
||||||
|
return isComplex
|
||||||
|
? { provider: LLMProvider.ANTHROPIC, model: 'claude-3-5-sonnet-20241022' }
|
||||||
|
: { provider: LLMProvider.OPENAI, model: 'gpt-4o-mini' };
|
||||||
|
}
|
||||||
|
|
||||||
|
// Free users get efficient models
|
||||||
|
return { provider: LLMProvider.GOOGLE, model: 'gemini-2.0-flash-exp' };
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Route based on license tier
|
||||||
|
*/
|
||||||
|
private routeByLicenseTier(license: UserLicense): ModelConfig {
|
||||||
|
switch (license.licenseType) {
|
||||||
|
case 'enterprise':
|
||||||
|
return { provider: LLMProvider.ANTHROPIC, model: 'claude-3-5-sonnet-20241022' };
|
||||||
|
|
||||||
|
case 'pro':
|
||||||
|
return { provider: LLMProvider.OPENAI, model: 'gpt-4o' };
|
||||||
|
|
||||||
|
case 'free':
|
||||||
|
return { provider: LLMProvider.GOOGLE, model: 'gemini-2.0-flash-exp' };
|
||||||
|
|
||||||
|
default:
|
||||||
|
return this.defaultModel;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Route to cheapest available model
|
||||||
|
*/
|
||||||
|
private routeByCost(license: UserLicense): ModelConfig {
|
||||||
|
// Free tier: use cheapest
|
||||||
|
if (license.licenseType === 'free') {
|
||||||
|
return { provider: LLMProvider.GOOGLE, model: 'gemini-2.0-flash-exp' };
|
||||||
|
}
|
||||||
|
|
||||||
|
// Paid tiers: use GPT-4o-mini for cost efficiency
|
||||||
|
return { provider: LLMProvider.OPENAI, model: 'gpt-4o-mini' };
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Check if model is allowed for user's license
|
||||||
|
*/
|
||||||
|
private isModelAllowed(model: ModelConfig, license: UserLicense): boolean {
|
||||||
|
// Free tier: only cheap models
|
||||||
|
if (license.licenseType === 'free') {
|
||||||
|
const allowedModels = ['gemini-2.0-flash-exp', 'gpt-4o-mini', 'claude-3-5-haiku-20241022'];
|
||||||
|
return allowedModels.includes(model.model);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Pro: all except Opus
|
||||||
|
if (license.licenseType === 'pro') {
|
||||||
|
const blockedModels = ['claude-3-opus-20240229'];
|
||||||
|
return !blockedModels.includes(model.model);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Enterprise: all models allowed
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Determine if query is complex
|
||||||
|
*/
|
||||||
|
private isComplexQuery(message: string): boolean {
|
||||||
|
const complexityIndicators = [
|
||||||
|
// Multi-step analysis
|
||||||
|
'backtest',
|
||||||
|
'analyze',
|
||||||
|
'compare',
|
||||||
|
'optimize',
|
||||||
|
|
||||||
|
// Code generation
|
||||||
|
'write',
|
||||||
|
'create',
|
||||||
|
'implement',
|
||||||
|
'build',
|
||||||
|
|
||||||
|
// Deep reasoning
|
||||||
|
'explain why',
|
||||||
|
'what if',
|
||||||
|
'how would',
|
||||||
|
|
||||||
|
// Long messages (> 200 chars likely complex)
|
||||||
|
message.length > 200,
|
||||||
|
];
|
||||||
|
|
||||||
|
const messageLower = message.toLowerCase();
|
||||||
|
|
||||||
|
return complexityIndicators.some((indicator) =>
|
||||||
|
typeof indicator === 'string' ? messageLower.includes(indicator) : indicator
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
154
gateway/src/main.ts
Normal file
154
gateway/src/main.ts
Normal file
@@ -0,0 +1,154 @@
|
|||||||
|
import Fastify from 'fastify';
|
||||||
|
import websocket from '@fastify/websocket';
|
||||||
|
import cors from '@fastify/cors';
|
||||||
|
import { UserService } from './db/user-service.js';
|
||||||
|
import { Authenticator } from './auth/authenticator.js';
|
||||||
|
import { WebSocketHandler } from './channels/websocket-handler.js';
|
||||||
|
import { TelegramHandler } from './channels/telegram-handler.js';
|
||||||
|
import { KubernetesClient } from './k8s/client.js';
|
||||||
|
import { ContainerManager } from './k8s/container-manager.js';
|
||||||
|
|
||||||
|
const app = Fastify({
|
||||||
|
logger: {
|
||||||
|
level: process.env.LOG_LEVEL || 'info',
|
||||||
|
transport: {
|
||||||
|
target: 'pino-pretty',
|
||||||
|
options: {
|
||||||
|
colorize: true,
|
||||||
|
translateTime: 'HH:MM:ss Z',
|
||||||
|
ignore: 'pid,hostname',
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
});
|
||||||
|
|
||||||
|
// Configuration from environment
|
||||||
|
const config = {
|
||||||
|
port: parseInt(process.env.PORT || '3000'),
|
||||||
|
host: process.env.HOST || '0.0.0.0',
|
||||||
|
databaseUrl: process.env.DATABASE_URL || 'postgresql://localhost/dexorder',
|
||||||
|
|
||||||
|
// LLM provider API keys
|
||||||
|
providerConfig: {
|
||||||
|
anthropicApiKey: process.env.ANTHROPIC_API_KEY,
|
||||||
|
openaiApiKey: process.env.OPENAI_API_KEY,
|
||||||
|
googleApiKey: process.env.GOOGLE_API_KEY,
|
||||||
|
openrouterApiKey: process.env.OPENROUTER_API_KEY,
|
||||||
|
},
|
||||||
|
|
||||||
|
telegramBotToken: process.env.TELEGRAM_BOT_TOKEN || '',
|
||||||
|
|
||||||
|
// Kubernetes configuration
|
||||||
|
kubernetes: {
|
||||||
|
namespace: process.env.KUBERNETES_NAMESPACE || 'dexorder-agents',
|
||||||
|
inCluster: process.env.KUBERNETES_IN_CLUSTER === 'true',
|
||||||
|
context: process.env.KUBERNETES_CONTEXT,
|
||||||
|
agentImage: process.env.AGENT_IMAGE || 'ghcr.io/dexorder/agent:latest',
|
||||||
|
sidecarImage: process.env.SIDECAR_IMAGE || 'ghcr.io/dexorder/lifecycle-sidecar:latest',
|
||||||
|
storageClass: process.env.AGENT_STORAGE_CLASS || 'standard',
|
||||||
|
},
|
||||||
|
};
|
||||||
|
|
||||||
|
// Validate at least one LLM provider is configured
|
||||||
|
const hasAnyProvider = Object.values(config.providerConfig).some(key => !!key);
|
||||||
|
if (!hasAnyProvider) {
|
||||||
|
app.log.error('At least one LLM provider API key is required (ANTHROPIC_API_KEY, OPENAI_API_KEY, GOOGLE_API_KEY, or OPENROUTER_API_KEY)');
|
||||||
|
process.exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Register plugins
|
||||||
|
await app.register(cors, {
|
||||||
|
origin: process.env.CORS_ORIGIN || '*',
|
||||||
|
});
|
||||||
|
|
||||||
|
await app.register(websocket, {
|
||||||
|
options: {
|
||||||
|
maxPayload: 1024 * 1024, // 1MB
|
||||||
|
},
|
||||||
|
});
|
||||||
|
|
||||||
|
// Initialize services
|
||||||
|
const userService = new UserService(config.databaseUrl);
|
||||||
|
|
||||||
|
// Initialize Kubernetes client and container manager
|
||||||
|
const k8sClient = new KubernetesClient({
|
||||||
|
namespace: config.kubernetes.namespace,
|
||||||
|
inCluster: config.kubernetes.inCluster,
|
||||||
|
context: config.kubernetes.context,
|
||||||
|
logger: app.log,
|
||||||
|
});
|
||||||
|
|
||||||
|
const containerManager = new ContainerManager({
|
||||||
|
k8sClient,
|
||||||
|
agentImage: config.kubernetes.agentImage,
|
||||||
|
sidecarImage: config.kubernetes.sidecarImage,
|
||||||
|
storageClass: config.kubernetes.storageClass,
|
||||||
|
namespace: config.kubernetes.namespace,
|
||||||
|
logger: app.log,
|
||||||
|
});
|
||||||
|
|
||||||
|
const authenticator = new Authenticator({
|
||||||
|
userService,
|
||||||
|
containerManager,
|
||||||
|
logger: app.log,
|
||||||
|
});
|
||||||
|
|
||||||
|
// Initialize channel handlers
|
||||||
|
const websocketHandler = new WebSocketHandler({
|
||||||
|
authenticator,
|
||||||
|
providerConfig: config.providerConfig,
|
||||||
|
});
|
||||||
|
|
||||||
|
const telegramHandler = new TelegramHandler({
|
||||||
|
authenticator,
|
||||||
|
providerConfig: config.providerConfig,
|
||||||
|
telegramBotToken: config.telegramBotToken,
|
||||||
|
});
|
||||||
|
|
||||||
|
// Register routes
|
||||||
|
websocketHandler.register(app);
|
||||||
|
telegramHandler.register(app);
|
||||||
|
|
||||||
|
// Health check
|
||||||
|
app.get('/health', async () => {
|
||||||
|
return {
|
||||||
|
status: 'ok',
|
||||||
|
timestamp: new Date().toISOString(),
|
||||||
|
};
|
||||||
|
});
|
||||||
|
|
||||||
|
// Graceful shutdown
|
||||||
|
const shutdown = async () => {
|
||||||
|
app.log.info('Shutting down gracefully...');
|
||||||
|
try {
|
||||||
|
await userService.close();
|
||||||
|
await app.close();
|
||||||
|
app.log.info('Shutdown complete');
|
||||||
|
process.exit(0);
|
||||||
|
} catch (error) {
|
||||||
|
app.log.error({ error }, 'Error during shutdown');
|
||||||
|
process.exit(1);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
process.on('SIGTERM', shutdown);
|
||||||
|
process.on('SIGINT', shutdown);
|
||||||
|
|
||||||
|
// Start server
|
||||||
|
try {
|
||||||
|
await app.listen({
|
||||||
|
port: config.port,
|
||||||
|
host: config.host,
|
||||||
|
});
|
||||||
|
|
||||||
|
app.log.info(
|
||||||
|
{
|
||||||
|
port: config.port,
|
||||||
|
host: config.host,
|
||||||
|
},
|
||||||
|
'Gateway server started'
|
||||||
|
);
|
||||||
|
} catch (error) {
|
||||||
|
app.log.error({ error }, 'Failed to start server');
|
||||||
|
process.exit(1);
|
||||||
|
}
|
||||||
37
gateway/src/types/messages.ts
Normal file
37
gateway/src/types/messages.ts
Normal file
@@ -0,0 +1,37 @@
|
|||||||
|
import { z } from 'zod';
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Inbound user message from any channel
|
||||||
|
*/
|
||||||
|
export const InboundMessageSchema = z.object({
|
||||||
|
messageId: z.string(),
|
||||||
|
userId: z.string(),
|
||||||
|
sessionId: z.string(),
|
||||||
|
content: z.string(),
|
||||||
|
attachments: z.array(z.object({
|
||||||
|
type: z.enum(['image', 'file', 'url']),
|
||||||
|
url: z.string(),
|
||||||
|
mimeType: z.string().optional(),
|
||||||
|
})).optional(),
|
||||||
|
timestamp: z.date(),
|
||||||
|
});
|
||||||
|
|
||||||
|
export type InboundMessage = z.infer<typeof InboundMessageSchema>;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Outbound response to channel
|
||||||
|
*/
|
||||||
|
export const OutboundMessageSchema = z.object({
|
||||||
|
messageId: z.string(),
|
||||||
|
sessionId: z.string(),
|
||||||
|
content: z.string(),
|
||||||
|
attachments: z.array(z.object({
|
||||||
|
type: z.enum(['image', 'chart', 'file']),
|
||||||
|
url: z.string(),
|
||||||
|
caption: z.string().optional(),
|
||||||
|
})).optional(),
|
||||||
|
metadata: z.record(z.unknown()).optional(),
|
||||||
|
timestamp: z.date(),
|
||||||
|
});
|
||||||
|
|
||||||
|
export type OutboundMessage = z.infer<typeof OutboundMessageSchema>;
|
||||||
101
gateway/src/types/resources.ts
Normal file
101
gateway/src/types/resources.ts
Normal file
@@ -0,0 +1,101 @@
|
|||||||
|
import { z } from 'zod';
|
||||||
|
|
||||||
|
/**
|
||||||
|
* MCP Resource types for user context
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Base resource structure from MCP server
|
||||||
|
*/
|
||||||
|
export const MCPResourceSchema = z.object({
|
||||||
|
uri: z.string(),
|
||||||
|
mimeType: z.string().optional(),
|
||||||
|
text: z.string().optional(),
|
||||||
|
blob: z.string().optional(), // base64 encoded
|
||||||
|
});
|
||||||
|
|
||||||
|
export type MCPResource = z.infer<typeof MCPResourceSchema>;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* User profile context
|
||||||
|
*/
|
||||||
|
export const UserProfileContextSchema = z.object({
|
||||||
|
tradingExperience: z.enum(['beginner', 'intermediate', 'advanced', 'professional']),
|
||||||
|
preferredTimeframes: z.array(z.string()),
|
||||||
|
riskTolerance: z.enum(['low', 'medium', 'high']),
|
||||||
|
tradingStyle: z.string(),
|
||||||
|
favoriteIndicators: z.array(z.string()).optional(),
|
||||||
|
activeTradingPairs: z.array(z.string()).optional(),
|
||||||
|
notes: z.string().optional(),
|
||||||
|
});
|
||||||
|
|
||||||
|
export type UserProfileContext = z.infer<typeof UserProfileContextSchema>;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Workspace state (current chart, positions, etc.)
|
||||||
|
*/
|
||||||
|
export const WorkspaceStateSchema = z.object({
|
||||||
|
currentChart: z.object({
|
||||||
|
ticker: z.string(),
|
||||||
|
timeframe: z.string(),
|
||||||
|
indicators: z.array(z.string()).optional(),
|
||||||
|
}).optional(),
|
||||||
|
watchlist: z.array(z.string()),
|
||||||
|
openPositions: z.array(z.object({
|
||||||
|
ticker: z.string(),
|
||||||
|
side: z.enum(['long', 'short']),
|
||||||
|
size: z.number(),
|
||||||
|
entryPrice: z.number(),
|
||||||
|
currentPrice: z.number().optional(),
|
||||||
|
unrealizedPnL: z.number().optional(),
|
||||||
|
})),
|
||||||
|
recentAlerts: z.array(z.object({
|
||||||
|
type: z.string(),
|
||||||
|
message: z.string(),
|
||||||
|
timestamp: z.string(),
|
||||||
|
})).optional(),
|
||||||
|
});
|
||||||
|
|
||||||
|
export type WorkspaceState = z.infer<typeof WorkspaceStateSchema>;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Standard context resource URIs
|
||||||
|
*/
|
||||||
|
export const CONTEXT_URIS = {
|
||||||
|
USER_PROFILE: 'context://user-profile',
|
||||||
|
CONVERSATION_SUMMARY: 'context://conversation-summary',
|
||||||
|
WORKSPACE_STATE: 'context://workspace-state',
|
||||||
|
SYSTEM_PROMPT: 'context://system-prompt',
|
||||||
|
} as const;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Resource content interface
|
||||||
|
*/
|
||||||
|
export interface ResourceContent {
|
||||||
|
uri: string;
|
||||||
|
mimeType?: string;
|
||||||
|
text?: string;
|
||||||
|
blob?: string;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Helper to parse resource content
|
||||||
|
*/
|
||||||
|
export function parseResource<T>(resource: ResourceContent, schema: z.ZodSchema<T>): T | null {
|
||||||
|
if (!resource.text) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
// Try JSON parsing if mime type is JSON
|
||||||
|
if (resource.mimeType?.includes('json')) {
|
||||||
|
const data = JSON.parse(resource.text);
|
||||||
|
return schema.parse(data);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Otherwise return as-is for text resources
|
||||||
|
return resource.text as T;
|
||||||
|
} catch {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
||||||
66
gateway/src/types/user.ts
Normal file
66
gateway/src/types/user.ts
Normal file
@@ -0,0 +1,66 @@
|
|||||||
|
import { z } from 'zod';
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Model preference configuration
|
||||||
|
*/
|
||||||
|
export const ModelPreferenceSchema = z.object({
|
||||||
|
provider: z.enum(['anthropic', 'openai', 'google', 'openrouter']),
|
||||||
|
model: z.string(),
|
||||||
|
temperature: z.number().optional(),
|
||||||
|
});
|
||||||
|
|
||||||
|
export type ModelPreference = z.infer<typeof ModelPreferenceSchema>;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* User license and feature authorization
|
||||||
|
*/
|
||||||
|
export const UserLicenseSchema = z.object({
|
||||||
|
userId: z.string(),
|
||||||
|
email: z.string().email().optional(),
|
||||||
|
licenseType: z.enum(['free', 'pro', 'enterprise']),
|
||||||
|
features: z.object({
|
||||||
|
maxIndicators: z.number(),
|
||||||
|
maxStrategies: z.number(),
|
||||||
|
maxBacktestDays: z.number(),
|
||||||
|
realtimeData: z.boolean(),
|
||||||
|
customExecutors: z.boolean(),
|
||||||
|
apiAccess: z.boolean(),
|
||||||
|
}),
|
||||||
|
resourceLimits: z.object({
|
||||||
|
maxConcurrentSessions: z.number(),
|
||||||
|
maxMessagesPerDay: z.number(),
|
||||||
|
maxTokensPerMessage: z.number(),
|
||||||
|
rateLimitPerMinute: z.number(),
|
||||||
|
}),
|
||||||
|
mcpServerUrl: z.string().url(),
|
||||||
|
preferredModel: ModelPreferenceSchema.optional(),
|
||||||
|
expiresAt: z.date().optional(),
|
||||||
|
createdAt: z.date(),
|
||||||
|
updatedAt: z.date(),
|
||||||
|
});
|
||||||
|
|
||||||
|
export type UserLicense = z.infer<typeof UserLicenseSchema>;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Channel types for multi-channel support
|
||||||
|
*/
|
||||||
|
export enum ChannelType {
|
||||||
|
WEBSOCKET = 'websocket',
|
||||||
|
TELEGRAM = 'telegram',
|
||||||
|
SLACK = 'slack',
|
||||||
|
DISCORD = 'discord',
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Authentication context per channel
|
||||||
|
*/
|
||||||
|
export const AuthContextSchema = z.object({
|
||||||
|
userId: z.string(),
|
||||||
|
channelType: z.nativeEnum(ChannelType),
|
||||||
|
channelUserId: z.string(), // Platform-specific ID (telegram_id, discord_id, etc)
|
||||||
|
sessionId: z.string(),
|
||||||
|
license: UserLicenseSchema,
|
||||||
|
authenticatedAt: z.date(),
|
||||||
|
});
|
||||||
|
|
||||||
|
export type AuthContext = z.infer<typeof AuthContextSchema>;
|
||||||
253
gateway/src/workflows/README.md
Normal file
253
gateway/src/workflows/README.md
Normal file
@@ -0,0 +1,253 @@
|
|||||||
|
# LangGraph Workflows for Trading
|
||||||
|
|
||||||
|
Complex, stateful workflows built with LangGraph for trading-specific tasks.
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
LangGraph provides:
|
||||||
|
- **Stateful execution**: Workflow state persists across failures
|
||||||
|
- **Conditional branching**: Route based on market conditions, backtest results, etc.
|
||||||
|
- **Human-in-the-loop**: Pause for user approval before executing trades
|
||||||
|
- **Loops & retries**: Backtest with different parameters, retry failed operations
|
||||||
|
- **Multi-agent**: Different LLMs for different tasks (analysis, risk, execution)
|
||||||
|
|
||||||
|
## Workflows
|
||||||
|
|
||||||
|
### Strategy Analysis (`strategy-analysis.ts`)
|
||||||
|
|
||||||
|
Multi-step pipeline for analyzing trading strategies:
|
||||||
|
|
||||||
|
```typescript
|
||||||
|
import { buildStrategyAnalysisWorkflow } from './workflows/strategy-analysis.js';
|
||||||
|
|
||||||
|
const workflow = buildStrategyAnalysisWorkflow(model, logger, mcpBacktestFn);
|
||||||
|
|
||||||
|
const result = await workflow.invoke({
|
||||||
|
strategyCode: userStrategy,
|
||||||
|
ticker: 'BTC/USDT',
|
||||||
|
timeframe: '1h',
|
||||||
|
});
|
||||||
|
|
||||||
|
console.log(result.recommendation); // Go/no-go decision
|
||||||
|
```
|
||||||
|
|
||||||
|
**Steps:**
|
||||||
|
1. **Code Review** - LLM analyzes strategy code for bugs, logic errors
|
||||||
|
2. **Backtest** - Runs backtest via user's MCP server
|
||||||
|
3. **Risk Assessment** - LLM evaluates results (drawdown, Sharpe, etc.)
|
||||||
|
4. **Human Approval** - Pauses for user review
|
||||||
|
5. **Recommendation** - Final go/no-go decision
|
||||||
|
|
||||||
|
**Benefits:**
|
||||||
|
- Stateful: Can resume if server restarts
|
||||||
|
- Human-in-the-loop: User must approve before deployment
|
||||||
|
- Multi-step reasoning: Each step builds on previous
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Future Workflows
|
||||||
|
|
||||||
|
### Market Scanner
|
||||||
|
|
||||||
|
Scan multiple tickers for trading opportunities:
|
||||||
|
|
||||||
|
```typescript
|
||||||
|
const scanner = buildMarketScannerWorkflow(model, logger);
|
||||||
|
|
||||||
|
const result = await scanner.invoke({
|
||||||
|
tickers: ['BTC/USDT', 'ETH/USDT', 'SOL/USDT'],
|
||||||
|
strategies: ['momentum', 'mean_reversion'],
|
||||||
|
timeframe: '1h',
|
||||||
|
});
|
||||||
|
|
||||||
|
// Returns ranked opportunities
|
||||||
|
```
|
||||||
|
|
||||||
|
**Steps:**
|
||||||
|
1. **Fetch Data** - Get OHLC for all tickers
|
||||||
|
2. **Apply Strategies** - Run each strategy on each ticker (parallel)
|
||||||
|
3. **Rank Signals** - Score by confidence, risk/reward
|
||||||
|
4. **Filter** - Apply user's risk limits
|
||||||
|
5. **Return Top N** - Best opportunities
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Portfolio Optimization
|
||||||
|
|
||||||
|
Optimize position sizing across multiple strategies:
|
||||||
|
|
||||||
|
```typescript
|
||||||
|
const optimizer = buildPortfolioOptimizerWorkflow(model, logger);
|
||||||
|
|
||||||
|
const result = await optimizer.invoke({
|
||||||
|
strategies: [strategy1, strategy2, strategy3],
|
||||||
|
totalCapital: 100000,
|
||||||
|
maxRiskPerTrade: 0.02,
|
||||||
|
});
|
||||||
|
|
||||||
|
// Returns optimal allocation
|
||||||
|
```
|
||||||
|
|
||||||
|
**Steps:**
|
||||||
|
1. **Backtest All** - Run backtests for each strategy
|
||||||
|
2. **Correlation Analysis** - Check strategy correlation
|
||||||
|
3. **Monte Carlo** - Simulate portfolio performance
|
||||||
|
4. **Optimize** - Find optimal weights (Sharpe maximization)
|
||||||
|
5. **Risk Check** - Validate against user limits
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Trade Execution Monitor
|
||||||
|
|
||||||
|
Monitor trade execution and adapt to market conditions:
|
||||||
|
|
||||||
|
```typescript
|
||||||
|
const monitor = buildTradeExecutionWorkflow(model, logger, exchange);
|
||||||
|
|
||||||
|
const result = await monitor.invoke({
|
||||||
|
tradeId: 'xyz',
|
||||||
|
targetPrice: 45000,
|
||||||
|
maxSlippage: 0.001,
|
||||||
|
timeLimit: 60, // seconds
|
||||||
|
});
|
||||||
|
```
|
||||||
|
|
||||||
|
**Steps:**
|
||||||
|
1. **Place Order** - Submit order to exchange
|
||||||
|
2. **Monitor Fill** - Check fill status every second
|
||||||
|
3. **Adapt** - If not filling, adjust price (within slippage)
|
||||||
|
4. **Retry Logic** - If rejected, retry with backoff
|
||||||
|
5. **Timeout** - Cancel if time limit exceeded
|
||||||
|
6. **Report** - Final execution report
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Using Workflows in Gateway
|
||||||
|
|
||||||
|
### Simple Chat vs Complex Workflow
|
||||||
|
|
||||||
|
```typescript
|
||||||
|
// gateway/src/orchestrator.ts
|
||||||
|
|
||||||
|
export class MessageOrchestrator {
|
||||||
|
async handleMessage(msg: InboundMessage) {
|
||||||
|
// Route based on complexity
|
||||||
|
if (this.isSimpleQuery(msg)) {
|
||||||
|
// Use agent harness for streaming chat
|
||||||
|
return this.harness.streamMessage(msg);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (this.isWorkflowRequest(msg)) {
|
||||||
|
// Use LangGraph for complex analysis
|
||||||
|
return this.executeWorkflow(msg);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
async executeWorkflow(msg: InboundMessage) {
|
||||||
|
const { type, params } = this.parseWorkflowRequest(msg);
|
||||||
|
|
||||||
|
switch (type) {
|
||||||
|
case 'analyze_strategy':
|
||||||
|
const workflow = buildStrategyAnalysisWorkflow(...);
|
||||||
|
return await workflow.invoke(params);
|
||||||
|
|
||||||
|
case 'scan_market':
|
||||||
|
const scanner = buildMarketScannerWorkflow(...);
|
||||||
|
return await scanner.invoke(params);
|
||||||
|
|
||||||
|
// ... more workflows
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Benefits for Trading
|
||||||
|
|
||||||
|
### vs Simple LLM Calls
|
||||||
|
|
||||||
|
| Scenario | Simple LLM | LangGraph Workflow |
|
||||||
|
|----------|-----------|-------------------|
|
||||||
|
| "What's the RSI?" | ✅ Fast, streaming | ❌ Overkill |
|
||||||
|
| "Analyze this strategy" | ❌ Limited context | ✅ Multi-step analysis |
|
||||||
|
| "Backtest 10 param combos" | ❌ No loops | ✅ Conditional loops |
|
||||||
|
| "Execute if approved" | ❌ No state | ✅ Human-in-the-loop |
|
||||||
|
| Server crashes mid-analysis | ❌ Lost progress | ✅ Resume from checkpoint |
|
||||||
|
|
||||||
|
### When to Use Workflows
|
||||||
|
|
||||||
|
**Use LangGraph when:**
|
||||||
|
- Multi-step analysis (backtest → risk → approval)
|
||||||
|
- Conditional logic (if bullish → momentum, else → mean-reversion)
|
||||||
|
- Human approval required (pause workflow)
|
||||||
|
- Loops needed (try different parameters)
|
||||||
|
- Long-running (can survive restarts)
|
||||||
|
|
||||||
|
**Use Agent Harness when:**
|
||||||
|
- Simple Q&A ("What is RSI?")
|
||||||
|
- Fast response needed (streaming chat)
|
||||||
|
- Single tool call ("Get my watchlist")
|
||||||
|
- Real-time interaction (Telegram, WebSocket)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Implementation Notes
|
||||||
|
|
||||||
|
### State Persistence
|
||||||
|
|
||||||
|
LangGraph can persist state to database:
|
||||||
|
|
||||||
|
```typescript
|
||||||
|
import { MemorySaver } from '@langchain/langgraph';
|
||||||
|
|
||||||
|
const checkpointer = new MemorySaver();
|
||||||
|
|
||||||
|
const workflow = graph.compile({ checkpointer });
|
||||||
|
|
||||||
|
// Resume from checkpoint
|
||||||
|
const result = await workflow.invoke(input, {
|
||||||
|
configurable: { thread_id: 'user-123-strategy-analysis' }
|
||||||
|
});
|
||||||
|
```
|
||||||
|
|
||||||
|
### Human-in-the-Loop
|
||||||
|
|
||||||
|
Pause workflow for user input:
|
||||||
|
|
||||||
|
```typescript
|
||||||
|
const workflow = graph
|
||||||
|
.addNode('human_approval', humanApprovalNode)
|
||||||
|
.interrupt('human_approval'); // Pauses here
|
||||||
|
|
||||||
|
// User reviews in UI
|
||||||
|
const approved = await getUserApproval(workflowId);
|
||||||
|
|
||||||
|
// Resume workflow
|
||||||
|
await workflow.resume(state, { approved });
|
||||||
|
```
|
||||||
|
|
||||||
|
### Multi-Agent
|
||||||
|
|
||||||
|
Use different models for different tasks:
|
||||||
|
|
||||||
|
```typescript
|
||||||
|
const analysisModel = new ChatAnthropic({ model: 'claude-3-opus' }); // Smart
|
||||||
|
const codeModel = new ChatOpenAI({ model: 'gpt-4o' }); // Good at code
|
||||||
|
const cheapModel = new ChatOpenAI({ model: 'gpt-4o-mini' }); // Fast
|
||||||
|
|
||||||
|
const workflow = graph
|
||||||
|
.addNode('analyze', (state) => analysisModel.invoke(...))
|
||||||
|
.addNode('code_review', (state) => codeModel.invoke(...))
|
||||||
|
.addNode('summarize', (state) => cheapModel.invoke(...));
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Next Steps
|
||||||
|
|
||||||
|
1. Implement remaining workflows (scanner, optimizer, execution)
|
||||||
|
2. Add state persistence (PostgreSQL checkpointer)
|
||||||
|
3. Integrate human-in-the-loop with WebSocket
|
||||||
|
4. Add workflow monitoring dashboard
|
||||||
|
5. Performance optimization (parallel execution)
|
||||||
162
gateway/src/workflows/strategy-analysis.ts
Normal file
162
gateway/src/workflows/strategy-analysis.ts
Normal file
@@ -0,0 +1,162 @@
|
|||||||
|
import { StateGraph, Annotation } from '@langchain/langgraph';
|
||||||
|
import type { BaseChatModel } from '@langchain/core/language_models/chat_models';
|
||||||
|
import { HumanMessage, SystemMessage } from '@langchain/core/messages';
|
||||||
|
import type { FastifyBaseLogger } from 'fastify';
|
||||||
|
|
||||||
|
/**
|
||||||
|
* State for strategy analysis workflow
|
||||||
|
*/
|
||||||
|
const StrategyAnalysisState = Annotation.Root({
|
||||||
|
strategyCode: Annotation<string>(),
|
||||||
|
ticker: Annotation<string>(),
|
||||||
|
timeframe: Annotation<string>(),
|
||||||
|
|
||||||
|
// Analysis steps
|
||||||
|
codeReview: Annotation<string | null>({
|
||||||
|
default: () => null,
|
||||||
|
}),
|
||||||
|
backtestResults: Annotation<Record<string, unknown> | null>({
|
||||||
|
default: () => null,
|
||||||
|
}),
|
||||||
|
riskAssessment: Annotation<string | null>({
|
||||||
|
default: () => null,
|
||||||
|
}),
|
||||||
|
humanApproved: Annotation<boolean>({
|
||||||
|
default: () => false,
|
||||||
|
}),
|
||||||
|
|
||||||
|
// Final output
|
||||||
|
recommendation: Annotation<string | null>({
|
||||||
|
default: () => null,
|
||||||
|
}),
|
||||||
|
});
|
||||||
|
|
||||||
|
type StrategyAnalysisStateType = typeof StrategyAnalysisState.State;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Build strategy analysis workflow using LangGraph
|
||||||
|
*
|
||||||
|
* Workflow steps:
|
||||||
|
* 1. Code review (LLM analyzes strategy code)
|
||||||
|
* 2. Backtest (calls user's MCP backtest tool)
|
||||||
|
* 3. Risk assessment (LLM evaluates results)
|
||||||
|
* 4. Human approval (pause for user review)
|
||||||
|
* 5. Final recommendation
|
||||||
|
*/
|
||||||
|
export function buildStrategyAnalysisWorkflow(
|
||||||
|
model: BaseChatModel,
|
||||||
|
logger: FastifyBaseLogger,
|
||||||
|
mcpBacktestFn: (strategy: string, ticker: string, timeframe: string) => Promise<Record<string, unknown>>
|
||||||
|
) {
|
||||||
|
// Node: Code Review
|
||||||
|
const codeReviewNode = async (state: StrategyAnalysisStateType) => {
|
||||||
|
logger.info('Strategy workflow: Code review');
|
||||||
|
|
||||||
|
const systemPrompt = `You are an expert trading strategy analyst.
|
||||||
|
Review the following strategy code for potential issues, bugs, or improvements.
|
||||||
|
Focus on: logic errors, edge cases, performance, and trading best practices.`;
|
||||||
|
|
||||||
|
const response = await model.invoke([
|
||||||
|
new SystemMessage(systemPrompt),
|
||||||
|
new HumanMessage(`Review this strategy:\n\n${state.strategyCode}`),
|
||||||
|
]);
|
||||||
|
|
||||||
|
return {
|
||||||
|
codeReview: response.content as string,
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
// Node: Backtest
|
||||||
|
const backtestNode = async (state: StrategyAnalysisStateType) => {
|
||||||
|
logger.info('Strategy workflow: Running backtest');
|
||||||
|
|
||||||
|
const results = await mcpBacktestFn(state.strategyCode, state.ticker, state.timeframe);
|
||||||
|
|
||||||
|
return {
|
||||||
|
backtestResults: results,
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
// Node: Risk Assessment
|
||||||
|
const riskAssessmentNode = async (state: StrategyAnalysisStateType) => {
|
||||||
|
logger.info('Strategy workflow: Risk assessment');
|
||||||
|
|
||||||
|
const systemPrompt = `You are a risk management expert for trading strategies.
|
||||||
|
Analyze the backtest results and provide a risk assessment.
|
||||||
|
Focus on: drawdown, win rate, Sharpe ratio, position sizing, and risk of ruin.`;
|
||||||
|
|
||||||
|
const response = await model.invoke([
|
||||||
|
new SystemMessage(systemPrompt),
|
||||||
|
new HumanMessage(
|
||||||
|
`Code review: ${state.codeReview}\n\nBacktest results: ${JSON.stringify(state.backtestResults, null, 2)}\n\nProvide risk assessment:`
|
||||||
|
),
|
||||||
|
]);
|
||||||
|
|
||||||
|
return {
|
||||||
|
riskAssessment: response.content as string,
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
// Node: Human Approval (placeholder - would integrate with UI)
|
||||||
|
const humanApprovalNode = async (state: StrategyAnalysisStateType) => {
|
||||||
|
logger.info('Strategy workflow: Awaiting human approval');
|
||||||
|
|
||||||
|
// In real implementation, this would pause and wait for user input
|
||||||
|
// For now, auto-approve
|
||||||
|
return {
|
||||||
|
humanApproved: true,
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
// Node: Final Recommendation
|
||||||
|
const recommendationNode = async (state: StrategyAnalysisStateType) => {
|
||||||
|
logger.info('Strategy workflow: Generating recommendation');
|
||||||
|
|
||||||
|
const systemPrompt = `Provide a final recommendation on whether to deploy this trading strategy.
|
||||||
|
Summarize the code review, backtest results, and risk assessment.
|
||||||
|
Give clear go/no-go decision with reasoning.`;
|
||||||
|
|
||||||
|
const response = await model.invoke([
|
||||||
|
new SystemMessage(systemPrompt),
|
||||||
|
new HumanMessage(
|
||||||
|
`Code review: ${state.codeReview}\n\nBacktest: ${JSON.stringify(state.backtestResults)}\n\nRisk: ${state.riskAssessment}\n\nApproved: ${state.humanApproved}\n\nYour recommendation:`
|
||||||
|
),
|
||||||
|
]);
|
||||||
|
|
||||||
|
return {
|
||||||
|
recommendation: response.content as string,
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
// Build graph
|
||||||
|
const workflow = new StateGraph(StrategyAnalysisState)
|
||||||
|
.addNode('code_review', codeReviewNode)
|
||||||
|
.addNode('backtest', backtestNode)
|
||||||
|
.addNode('risk_assessment', riskAssessmentNode)
|
||||||
|
.addNode('human_approval', humanApprovalNode)
|
||||||
|
.addNode('recommendation', recommendationNode)
|
||||||
|
.addEdge('__start__', 'code_review')
|
||||||
|
.addEdge('code_review', 'backtest')
|
||||||
|
.addEdge('backtest', 'risk_assessment')
|
||||||
|
.addEdge('risk_assessment', 'human_approval')
|
||||||
|
.addConditionalEdges('human_approval', (state) => {
|
||||||
|
return state.humanApproved ? 'recommendation' : '__end__';
|
||||||
|
})
|
||||||
|
.addEdge('recommendation', '__end__');
|
||||||
|
|
||||||
|
return workflow.compile();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Example usage:
|
||||||
|
*
|
||||||
|
* const workflow = buildStrategyAnalysisWorkflow(model, logger, mcpBacktestFn);
|
||||||
|
*
|
||||||
|
* const result = await workflow.invoke({
|
||||||
|
* strategyCode: "strategy code here",
|
||||||
|
* ticker: "BTC/USDT",
|
||||||
|
* timeframe: "1h",
|
||||||
|
* });
|
||||||
|
*
|
||||||
|
* console.log(result.recommendation);
|
||||||
|
*/
|
||||||
26
gateway/tsconfig.json
Normal file
26
gateway/tsconfig.json
Normal file
@@ -0,0 +1,26 @@
|
|||||||
|
{
|
||||||
|
"compilerOptions": {
|
||||||
|
"target": "ES2022",
|
||||||
|
"module": "ESNext",
|
||||||
|
"lib": ["ES2022"],
|
||||||
|
"moduleResolution": "bundler",
|
||||||
|
"resolveJsonModule": true,
|
||||||
|
"allowJs": false,
|
||||||
|
"outDir": "./dist",
|
||||||
|
"rootDir": "./src",
|
||||||
|
"strict": true,
|
||||||
|
"esModuleInterop": true,
|
||||||
|
"skipLibCheck": true,
|
||||||
|
"forceConsistentCasingInFileNames": true,
|
||||||
|
"declaration": true,
|
||||||
|
"declarationMap": true,
|
||||||
|
"sourceMap": true,
|
||||||
|
"noUnusedLocals": true,
|
||||||
|
"noUnusedParameters": true,
|
||||||
|
"noImplicitReturns": true,
|
||||||
|
"noFallthroughCasesInSwitch": true,
|
||||||
|
"allowSyntheticDefaultImports": true
|
||||||
|
},
|
||||||
|
"include": ["src/**/*"],
|
||||||
|
"exclude": ["node_modules", "dist"]
|
||||||
|
}
|
||||||
15
lifecycle-sidecar/.gitignore
vendored
Normal file
15
lifecycle-sidecar/.gitignore
vendored
Normal file
@@ -0,0 +1,15 @@
|
|||||||
|
# Binaries
|
||||||
|
lifecycle-sidecar
|
||||||
|
*.exe
|
||||||
|
*.dll
|
||||||
|
*.so
|
||||||
|
*.dylib
|
||||||
|
|
||||||
|
# Test binary
|
||||||
|
*.test
|
||||||
|
|
||||||
|
# Go workspace file
|
||||||
|
go.work
|
||||||
|
|
||||||
|
# Build output
|
||||||
|
dist/
|
||||||
40
lifecycle-sidecar/Dockerfile
Normal file
40
lifecycle-sidecar/Dockerfile
Normal file
@@ -0,0 +1,40 @@
|
|||||||
|
# Build stage
|
||||||
|
FROM golang:1.22-alpine AS builder
|
||||||
|
|
||||||
|
WORKDIR /app
|
||||||
|
|
||||||
|
# Install build dependencies
|
||||||
|
RUN apk add --no-cache git ca-certificates
|
||||||
|
|
||||||
|
# Copy go mod files
|
||||||
|
COPY go.mod go.sum ./
|
||||||
|
RUN go mod download
|
||||||
|
|
||||||
|
# Copy source
|
||||||
|
COPY main.go ./
|
||||||
|
|
||||||
|
# Build static binary
|
||||||
|
RUN CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build \
|
||||||
|
-ldflags="-w -s" \
|
||||||
|
-o lifecycle-sidecar \
|
||||||
|
main.go
|
||||||
|
|
||||||
|
# Runtime stage
|
||||||
|
FROM alpine:3.19
|
||||||
|
|
||||||
|
# Install procps for process monitoring (pgrep, kill)
|
||||||
|
RUN apk add --no-cache procps ca-certificates
|
||||||
|
|
||||||
|
# Create non-root user
|
||||||
|
RUN addgroup -g 1000 sidecar && \
|
||||||
|
adduser -D -u 1000 -G sidecar sidecar
|
||||||
|
|
||||||
|
WORKDIR /app
|
||||||
|
|
||||||
|
# Copy binary from builder
|
||||||
|
COPY --from=builder /app/lifecycle-sidecar /app/lifecycle-sidecar
|
||||||
|
|
||||||
|
# Run as non-root
|
||||||
|
USER sidecar
|
||||||
|
|
||||||
|
ENTRYPOINT ["/app/lifecycle-sidecar"]
|
||||||
94
lifecycle-sidecar/README.md
Normal file
94
lifecycle-sidecar/README.md
Normal file
@@ -0,0 +1,94 @@
|
|||||||
|
# Lifecycle Sidecar
|
||||||
|
|
||||||
|
A lightweight Kubernetes sidecar that monitors the main agent container and handles cleanup when the container exits with a specific exit code indicating idle shutdown.
|
||||||
|
|
||||||
|
## Purpose
|
||||||
|
|
||||||
|
User agent containers self-manage their lifecycle by:
|
||||||
|
1. Tracking their own activity (MCP calls, trigger status)
|
||||||
|
2. Exiting with code `42` when idle (no triggers + no recent activity)
|
||||||
|
3. Delegating deployment cleanup to this sidecar
|
||||||
|
|
||||||
|
The sidecar watches the main container and:
|
||||||
|
- On exit code `42`: Deletes the deployment (and optionally PVC)
|
||||||
|
- On any other exit code: Allows Kubernetes restart policy to handle it
|
||||||
|
|
||||||
|
## Architecture
|
||||||
|
|
||||||
|
```
|
||||||
|
┌─────────────────────────────────────────────────┐
|
||||||
|
│ Pod │
|
||||||
|
│ ┌────────────────┐ ┌──────────────────┐ │
|
||||||
|
│ │ Agent Container│ │ Lifecycle Sidecar│ │
|
||||||
|
│ │ │ │ │ │
|
||||||
|
│ │ - Track activity │ - Monitor agent │ │
|
||||||
|
│ │ - Track triggers │ - Watch exit code│ │
|
||||||
|
│ │ - Exit 42 if idle │ - Delete if 42 │ │
|
||||||
|
│ └────────────────┘ └──────────────────┘ │
|
||||||
|
│ │ │ │
|
||||||
|
│ │ writes exit_code │ │
|
||||||
|
│ └─────────►/var/run/agent/exit_code │
|
||||||
|
│ │ │
|
||||||
|
└───────────────────────────────────┼─────────────┘
|
||||||
|
│
|
||||||
|
▼ k8s API
|
||||||
|
┌──────────────────────┐
|
||||||
|
│ Delete Deployment │
|
||||||
|
│ (+ PVC if anonymous)│
|
||||||
|
└──────────────────────┘
|
||||||
|
```
|
||||||
|
|
||||||
|
## Environment Variables
|
||||||
|
|
||||||
|
| Variable | Required | Description |
|
||||||
|
|----------|----------|-------------|
|
||||||
|
| `NAMESPACE` | Yes | Kubernetes namespace (injected via downward API) |
|
||||||
|
| `DEPLOYMENT_NAME` | Yes | Name of the deployment to delete (from pod label) |
|
||||||
|
| `USER_TYPE` | No | User license tier: `anonymous`, `free`, `paid`, `enterprise` |
|
||||||
|
| `MAIN_CONTAINER_PID` | No | PID of main container (for precise monitoring) |
|
||||||
|
|
||||||
|
## Exit Code Contract
|
||||||
|
|
||||||
|
The agent container uses exit codes to signal intent:
|
||||||
|
|
||||||
|
| Exit Code | Meaning | Sidecar Action |
|
||||||
|
|-----------|---------|----------------|
|
||||||
|
| `42` | Clean idle shutdown | Delete deployment + optional PVC |
|
||||||
|
| Any other | Error or normal restart | Allow Kubernetes to restart |
|
||||||
|
|
||||||
|
## RBAC Requirements
|
||||||
|
|
||||||
|
The sidecar requires a ServiceAccount with permission to delete its own deployment:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
apiVersion: rbac.authorization.k8s.io/v1
|
||||||
|
kind: Role
|
||||||
|
rules:
|
||||||
|
- apiGroups: ["apps"]
|
||||||
|
resources: ["deployments"]
|
||||||
|
verbs: ["get", "delete"]
|
||||||
|
- apiGroups: [""]
|
||||||
|
resources: ["persistentvolumeclaims"]
|
||||||
|
verbs: ["get", "delete"]
|
||||||
|
```
|
||||||
|
|
||||||
|
See `deploy/k8s/base/lifecycle-sidecar-rbac.yaml` for the full RBAC configuration.
|
||||||
|
|
||||||
|
## Building
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker build -t ghcr.io/dexorder/lifecycle-sidecar:latest .
|
||||||
|
docker push ghcr.io/dexorder/lifecycle-sidecar:latest
|
||||||
|
```
|
||||||
|
|
||||||
|
## Example Usage
|
||||||
|
|
||||||
|
See `deploy/k8s/base/agent-deployment-example.yaml` for a complete example of how to configure an agent deployment with the lifecycle sidecar.
|
||||||
|
|
||||||
|
## Security Considerations
|
||||||
|
|
||||||
|
1. **Self-delete only**: The sidecar can only delete the deployment it's part of (enforced by label matching in admission policy)
|
||||||
|
2. **Non-privileged**: Runs as non-root user (UID 1000)
|
||||||
|
3. **Minimal permissions**: Only has `get` and `delete` on deployments/PVCs in the agents namespace
|
||||||
|
4. **No cross-namespace access**: Scoped to `dexorder-agents` namespace only
|
||||||
|
5. **Crash-safe**: Only triggers cleanup on exit code 42, never on crashes
|
||||||
16
lifecycle-sidecar/go.mod
Normal file
16
lifecycle-sidecar/go.mod
Normal file
@@ -0,0 +1,16 @@
|
|||||||
|
module github.com/dexorder/lifecycle-sidecar
|
||||||
|
|
||||||
|
go 1.22
|
||||||
|
|
||||||
|
require (
|
||||||
|
github.com/rs/zerolog v1.32.0
|
||||||
|
k8s.io/api v0.29.2
|
||||||
|
k8s.io/apimachinery v0.29.2
|
||||||
|
k8s.io/client-go v0.29.2
|
||||||
|
)
|
||||||
|
|
||||||
|
require (
|
||||||
|
github.com/mattn/go-colorable v0.1.13 // indirect
|
||||||
|
github.com/mattn/go-isatty v0.0.19 // indirect
|
||||||
|
golang.org/x/sys v0.17.0 // indirect
|
||||||
|
)
|
||||||
234
lifecycle-sidecar/main.go
Normal file
234
lifecycle-sidecar/main.go
Normal file
@@ -0,0 +1,234 @@
|
|||||||
|
package main
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
"os/exec"
|
||||||
|
"syscall"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/rs/zerolog"
|
||||||
|
"github.com/rs/zerolog/log"
|
||||||
|
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||||
|
"k8s.io/client-go/kubernetes"
|
||||||
|
"k8s.io/client-go/rest"
|
||||||
|
)
|
||||||
|
|
||||||
|
const (
|
||||||
|
// Exit code indicating clean idle shutdown
|
||||||
|
ExitCodeIdleShutdown = 42
|
||||||
|
|
||||||
|
// Poll interval for checking main container status
|
||||||
|
PollInterval = 5 * time.Second
|
||||||
|
)
|
||||||
|
|
||||||
|
func main() {
|
||||||
|
// Setup logging
|
||||||
|
zerolog.TimeFieldFormat = zerolog.TimeFormatUnix
|
||||||
|
log.Logger = log.Output(zerolog.ConsoleWriter{Out: os.Stderr})
|
||||||
|
|
||||||
|
log.Info().Msg("Lifecycle sidecar starting")
|
||||||
|
|
||||||
|
// Get environment configuration
|
||||||
|
namespace := os.Getenv("NAMESPACE")
|
||||||
|
deploymentName := os.Getenv("DEPLOYMENT_NAME")
|
||||||
|
userType := os.Getenv("USER_TYPE")
|
||||||
|
mainContainerPID := os.Getenv("MAIN_CONTAINER_PID")
|
||||||
|
|
||||||
|
if namespace == "" || deploymentName == "" {
|
||||||
|
log.Fatal().Msg("NAMESPACE and DEPLOYMENT_NAME environment variables are required")
|
||||||
|
}
|
||||||
|
|
||||||
|
log.Info().
|
||||||
|
Str("namespace", namespace).
|
||||||
|
Str("deployment", deploymentName).
|
||||||
|
Str("userType", userType).
|
||||||
|
Str("mainPID", mainContainerPID).
|
||||||
|
Msg("Configuration loaded")
|
||||||
|
|
||||||
|
// Create Kubernetes client
|
||||||
|
config, err := rest.InClusterConfig()
|
||||||
|
if err != nil {
|
||||||
|
log.Fatal().Err(err).Msg("Failed to get in-cluster config")
|
||||||
|
}
|
||||||
|
|
||||||
|
clientset, err := kubernetes.NewForConfig(config)
|
||||||
|
if err != nil {
|
||||||
|
log.Fatal().Err(err).Msg("Failed to create Kubernetes client")
|
||||||
|
}
|
||||||
|
|
||||||
|
// Wait for main container to exit
|
||||||
|
exitCode := waitForMainContainer()
|
||||||
|
|
||||||
|
log.Info().Int("exitCode", exitCode).Msg("Main container exited")
|
||||||
|
|
||||||
|
// Handle exit code
|
||||||
|
if exitCode == ExitCodeIdleShutdown {
|
||||||
|
log.Info().Msg("Detected idle shutdown (exit code 42) - cleaning up deployment")
|
||||||
|
|
||||||
|
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
|
||||||
|
defer cancel()
|
||||||
|
|
||||||
|
// Delete PVC if anonymous user
|
||||||
|
deletePVC := userType == "anonymous" || userType == "temporary"
|
||||||
|
|
||||||
|
if err := cleanupDeployment(ctx, clientset, namespace, deploymentName, deletePVC); err != nil {
|
||||||
|
log.Error().Err(err).Msg("Failed to cleanup deployment")
|
||||||
|
os.Exit(1)
|
||||||
|
}
|
||||||
|
|
||||||
|
log.Info().Msg("Cleanup complete - sidecar exiting")
|
||||||
|
os.Exit(0)
|
||||||
|
} else {
|
||||||
|
// Any other exit code - let Kubernetes restart policy handle it
|
||||||
|
log.Info().
|
||||||
|
Int("exitCode", exitCode).
|
||||||
|
Msg("Non-idle exit code - allowing Kubernetes to handle restart")
|
||||||
|
os.Exit(exitCode)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// waitForMainContainer monitors the main container process and returns its exit code
|
||||||
|
func waitForMainContainer() int {
|
||||||
|
// Try multiple methods to detect main container exit
|
||||||
|
// Method 1: Poll for process via shared PID namespace
|
||||||
|
mainPID := os.Getenv("MAIN_CONTAINER_PID")
|
||||||
|
if mainPID != "" {
|
||||||
|
return pollProcessExit(mainPID)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Method 2: Poll for agent process by name (fallback)
|
||||||
|
log.Info().Msg("MAIN_CONTAINER_PID not set, polling for 'agent' process")
|
||||||
|
return pollProcessByName("agent")
|
||||||
|
}
|
||||||
|
|
||||||
|
// pollProcessExit polls for process exit by PID
|
||||||
|
func pollProcessExit(pidStr string) int {
|
||||||
|
log.Info().Str("pid", pidStr).Msg("Monitoring main container process")
|
||||||
|
|
||||||
|
for {
|
||||||
|
// Check if process exists
|
||||||
|
cmd := exec.Command("kill", "-0", pidStr)
|
||||||
|
err := cmd.Run()
|
||||||
|
|
||||||
|
if err != nil {
|
||||||
|
// Process no longer exists - get exit code from /proc if available
|
||||||
|
log.Info().Msg("Main container process exited")
|
||||||
|
|
||||||
|
// Try to get actual exit code (this is a best-effort)
|
||||||
|
// In Kubernetes, we might not have access to the actual exit code
|
||||||
|
// So we check if the container restarted via container status
|
||||||
|
return getContainerExitCode()
|
||||||
|
}
|
||||||
|
|
||||||
|
time.Sleep(PollInterval)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// pollProcessByName polls for process exit by name
|
||||||
|
func pollProcessByName(name string) int {
|
||||||
|
log.Info().Str("name", name).Msg("Monitoring main container by name")
|
||||||
|
|
||||||
|
for {
|
||||||
|
cmd := exec.Command("pgrep", "-x", name)
|
||||||
|
err := cmd.Run()
|
||||||
|
|
||||||
|
if err != nil {
|
||||||
|
log.Info().Msg("Main container process exited")
|
||||||
|
return getContainerExitCode()
|
||||||
|
}
|
||||||
|
|
||||||
|
time.Sleep(PollInterval)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// getContainerExitCode attempts to retrieve the exit code of the main container
|
||||||
|
// This is challenging in Kubernetes without direct access to container runtime
|
||||||
|
// We use a fallback approach: check a shared file or default to 0
|
||||||
|
func getContainerExitCode() int {
|
||||||
|
// Check if main container wrote exit code to shared volume
|
||||||
|
exitCodeFile := "/var/run/agent/exit_code"
|
||||||
|
data, err := os.ReadFile(exitCodeFile)
|
||||||
|
if err == nil {
|
||||||
|
var exitCode int
|
||||||
|
_, err := fmt.Sscanf(string(data), "%d", &exitCode)
|
||||||
|
if err == nil {
|
||||||
|
log.Info().Int("exitCode", exitCode).Msg("Read exit code from shared file")
|
||||||
|
return exitCode
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Default to 0 if we can't determine exit code
|
||||||
|
// This is safe because non-42 codes allow restart
|
||||||
|
log.Warn().Msg("Could not determine exit code, defaulting to 0")
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
|
||||||
|
// cleanupDeployment deletes the deployment and optionally the PVC
|
||||||
|
func cleanupDeployment(ctx context.Context, clientset *kubernetes.Clientset, namespace, deploymentName string, deletePVC bool) error {
|
||||||
|
log.Info().
|
||||||
|
Str("namespace", namespace).
|
||||||
|
Str("deployment", deploymentName).
|
||||||
|
Bool("deletePVC", deletePVC).
|
||||||
|
Msg("Cleaning up deployment")
|
||||||
|
|
||||||
|
// Get deployment to find PVC name if needed
|
||||||
|
var pvcName string
|
||||||
|
if deletePVC {
|
||||||
|
deployment, err := clientset.AppsV1().Deployments(namespace).Get(ctx, deploymentName, metav1.GetOptions{})
|
||||||
|
if err != nil {
|
||||||
|
log.Warn().Err(err).Msg("Could not get deployment for PVC lookup")
|
||||||
|
} else {
|
||||||
|
// Find PVC from volume claim templates or volumes
|
||||||
|
if len(deployment.Spec.Template.Spec.Volumes) > 0 {
|
||||||
|
for _, vol := range deployment.Spec.Template.Spec.Volumes {
|
||||||
|
if vol.PersistentVolumeClaim != nil {
|
||||||
|
pvcName = vol.PersistentVolumeClaim.ClaimName
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Delete deployment
|
||||||
|
deletePolicy := metav1.DeletePropagationForeground
|
||||||
|
deleteOptions := metav1.DeleteOptions{
|
||||||
|
PropagationPolicy: &deletePolicy,
|
||||||
|
}
|
||||||
|
|
||||||
|
log.Info().Str("deployment", deploymentName).Msg("Deleting deployment")
|
||||||
|
err := clientset.AppsV1().Deployments(namespace).Delete(ctx, deploymentName, deleteOptions)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("failed to delete deployment: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
log.Info().Msg("Deployment deleted successfully")
|
||||||
|
|
||||||
|
// Delete PVC if requested and found
|
||||||
|
if deletePVC && pvcName != "" {
|
||||||
|
log.Info().Str("pvc", pvcName).Msg("Deleting PVC")
|
||||||
|
err := clientset.CoreV1().PersistentVolumeClaims(namespace).Delete(ctx, pvcName, metav1.DeleteOptions{})
|
||||||
|
if err != nil {
|
||||||
|
log.Warn().Err(err).Str("pvc", pvcName).Msg("Failed to delete PVC (non-fatal)")
|
||||||
|
} else {
|
||||||
|
log.Info().Msg("PVC deleted successfully")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// init starts a goroutine intended to make the sidecar exit cleanly on
// SIGTERM (pod shutdown) WITHOUT triggering deployment deletion — only the
// main container's idle exit (code 42) should cause cleanup.
func init() {
	// Register signal handler for graceful shutdown
	// If sidecar receives SIGTERM, just exit cleanly
	// Don't trigger deployment deletion on sidecar termination
	go func() {
		sigChan := make(chan os.Signal, 1)
		// BUG(review): syscall.Signal(syscall.SIGTERM) is only a type
		// conversion — it does NOT register sigChan for any signal, so the
		// receive below blocks forever and this handler never fires. The
		// intended call is signal.Notify(sigChan, syscall.SIGTERM) from
		// os/signal; that import must be added at the top of the file
		// (outside this view). As written, a SIGTERM to the sidecar is
		// handled by the Go runtime default instead of this goroutine.
		syscall.Signal(syscall.SIGTERM)
		<-sigChan
		log.Info().Msg("Received SIGTERM - sidecar exiting without cleanup")
		os.Exit(0)
	}()
}
|
||||||
Reference in New Issue
Block a user