container lifecycle management

This commit is contained in:
2026-03-12 15:13:38 -04:00
parent e99ef5d2dd
commit b9cc397e05
61 changed files with 6880 additions and 31 deletions

1
.idea/ai.iml generated
View File

@@ -8,6 +8,7 @@
<excludeFolder url="file://$MODULE_DIR$/.venv" /> <excludeFolder url="file://$MODULE_DIR$/.venv" />
<excludeFolder url="file://$MODULE_DIR$/backend.old/data" /> <excludeFolder url="file://$MODULE_DIR$/backend.old/data" />
<excludeFolder url="file://$MODULE_DIR$/doc.old" /> <excludeFolder url="file://$MODULE_DIR$/doc.old" />
<excludeFolder url="file://$MODULE_DIR$/backend.old" />
</content> </content>
<orderEntry type="jdk" jdkName="Python 3.12 (ai)" jdkType="Python SDK" /> <orderEntry type="jdk" jdkName="Python 3.12 (ai)" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" /> <orderEntry type="sourceFolder" forTests="false" />

15
AGENT.md Normal file
View File

@@ -0,0 +1,15 @@
We're building an AI-first trading platform by integrating user-facing TradingView charts and chat with an AI assistant that helps do research, develop indicators (signals), and write strategies, using the Dexorder trading framework we provide.
This monorepo has:
bin/ scripts, mostly build and deploy
deploy/ kubernetes deployment and configuration
doc/ documentation
flink/ Apache Flink application mode processes data from Kafka
iceberg/ Apache Iceberg for historical OHLC etc
ingestor/ Data sources publish to Kafka
kafka/ Apache Kafka
protobuf/ Messaging entities
relay/ Rust+ZeroMQ stateless router
web/ Vue 3 / Pinia / PrimeVue / TradingView
See doc/protocol.md for messaging architecture

View File

@@ -4,6 +4,7 @@
set -e set -e
DIR="$(cd "$(dirname "$0")" && pwd)" DIR="$(cd "$(dirname "$0")" && pwd)"
ROOT_DIR="$(cd "$DIR/.." && pwd)"
echo "Building all container images..." echo "Building all container images..."
echo echo
@@ -13,5 +14,31 @@ echo
"$DIR/build" ingestor "$@" "$DIR/build" ingestor "$@"
"$DIR/build" web "$@" "$DIR/build" web "$@"
# Build lifecycle-sidecar (Go binary, no protobuf sync needed)
echo "Building lifecycle-sidecar..."
cd "$ROOT_DIR/lifecycle-sidecar"
# Determine tag
if [ "$1" == "dev" ]; then
TAG="dev$(date +%Y%m%d%H%M%S)"
else
# Check for uncommitted changes
DIRTY="$(git status | grep 'Changes ' || true)"
if [ "$DIRTY" != "" ]; then
echo "lifecycle-sidecar has uncommitted changes."
echo "Use '$0 dev' to build a development-tagged version instead."
exit 1
fi
TAG="$(git log --oneline | head -1 | cut -d ' ' -f 1)"
fi
REMOTE=${REMOTE:-ghcr.io/dexorder}
docker build -t lifecycle-sidecar:latest -t lifecycle-sidecar:$TAG .
docker tag lifecycle-sidecar:$TAG $REMOTE/lifecycle-sidecar:$TAG
docker tag $REMOTE/lifecycle-sidecar:$TAG $REMOTE/lifecycle-sidecar:latest
echo "$(date)" built $REMOTE/lifecycle-sidecar:$TAG
echo echo
echo "All images built successfully!" echo "All images built successfully!"

17
bin/dev
View File

@@ -19,7 +19,7 @@ usage() {
echo "Commands:" echo "Commands:"
echo " start Start minikube and deploy all services" echo " start Start minikube and deploy all services"
echo " stop Stop minikube" echo " stop Stop minikube"
echo " restart [svc] Rebuild and redeploy all services, or just one (relay|ingestor|flink)" echo " restart [svc] Rebuild and redeploy all services, or just one (relay|ingestor|flink|sidecar)"
echo " rebuild [svc] Rebuild all custom images, or just one" echo " rebuild [svc] Rebuild all custom images, or just one"
echo " deploy [svc] Deploy/update all services, or just one" echo " deploy [svc] Deploy/update all services, or just one"
echo " status Show status of all services" echo " status Show status of all services"
@@ -127,12 +127,23 @@ rebuild_images() {
docker tag "dexorder/ai-flink:$FLINK_TAG" "dexorder/flink:$FLINK_TAG" docker tag "dexorder/ai-flink:$FLINK_TAG" "dexorder/flink:$FLINK_TAG"
fi fi
# Save the tags for deployment (all three, preserving any we didn't rebuild) # Build lifecycle-sidecar (Go binary)
if [ "$service" == "all" ] || [ "$service" == "lifecycle-sidecar" ] || [ "$service" == "sidecar" ]; then
echo -e "${GREEN}→${NC} Building lifecycle-sidecar..."
cd "$ROOT_DIR/lifecycle-sidecar"
SIDECAR_TAG="dev$(date +%Y%m%d%H%M%S)"
docker build -t lifecycle-sidecar:latest -t lifecycle-sidecar:$SIDECAR_TAG . || exit 1
echo -e "${GREEN}✓ Built lifecycle-sidecar:$SIDECAR_TAG${NC}"
cd "$ROOT_DIR"
fi
# Save the tags for deployment (all services, preserving any we didn't rebuild)
echo "RELAY_TAG=$RELAY_TAG" > "$ROOT_DIR/.dev-image-tag" echo "RELAY_TAG=$RELAY_TAG" > "$ROOT_DIR/.dev-image-tag"
echo "INGEST_TAG=$INGEST_TAG" >> "$ROOT_DIR/.dev-image-tag" echo "INGEST_TAG=$INGEST_TAG" >> "$ROOT_DIR/.dev-image-tag"
echo "FLINK_TAG=$FLINK_TAG" >> "$ROOT_DIR/.dev-image-tag" echo "FLINK_TAG=$FLINK_TAG" >> "$ROOT_DIR/.dev-image-tag"
echo "SIDECAR_TAG=$SIDECAR_TAG" >> "$ROOT_DIR/.dev-image-tag"
echo -e "${GREEN}✓ Images built: relay=$RELAY_TAG, ingestor=$INGEST_TAG, flink=$FLINK_TAG${NC}" echo -e "${GREEN}✓ Images built: relay=$RELAY_TAG, ingestor=$INGEST_TAG, flink=$FLINK_TAG, sidecar=$SIDECAR_TAG${NC}"
} }
deploy_services() { deploy_services() {

View File

@@ -0,0 +1,230 @@
"""
Container lifecycle manager for agent containers.
Tracks activity and triggers to determine when the container should shut down.
Exits with code 42 to signal clean idle shutdown to the lifecycle sidecar.
"""
import asyncio
import logging
import os
import signal
import sys
from datetime import datetime, timedelta
from pathlib import Path
from typing import Optional, Set
logger = logging.getLogger(__name__)

# Exit code to signal clean idle shutdown to the lifecycle sidecar.
EXIT_CODE_IDLE_SHUTDOWN = 42

# Shared file where the exit code is written for the sidecar to read.
EXIT_CODE_FILE = Path("/var/run/agent/exit_code")


class LifecycleManager:
    """
    Manages container lifecycle based on activity and triggers.

    The container shuts itself down when:
      1. There are no active triggers (data subscriptions, CEP patterns, etc.)
      2. There has been no recent user activity (MCP calls)
      3. The idle timeout has elapsed

    Shutdown is signaled by writing EXIT_CODE_IDLE_SHUTDOWN to
    EXIT_CODE_FILE and exiting the process with that code.
    """

    def __init__(
        self,
        idle_timeout_minutes: int = 15,
        check_interval_seconds: int = 60,
        enable_shutdown: bool = True,
    ):
        """
        Initialize lifecycle manager.

        Args:
            idle_timeout_minutes: Minutes of inactivity before shutdown.
            check_interval_seconds: Interval between idle checks.
            enable_shutdown: If False, only log idle state without exiting
                (for testing).
        """
        self.idle_timeout = timedelta(minutes=idle_timeout_minutes)
        self.check_interval = check_interval_seconds
        self.enable_shutdown = enable_shutdown
        # NOTE: idle tracking is wall-clock based; large system clock
        # adjustments will shift the idle window accordingly.
        self.last_activity: datetime = datetime.now()
        self.active_triggers: Set[str] = set()
        self._running = False
        self._check_task: Optional[asyncio.Task] = None
        logger.info(
            "Lifecycle manager initialized: idle_timeout=%dm, check_interval=%ds, shutdown_enabled=%s",
            idle_timeout_minutes,
            check_interval_seconds,
            enable_shutdown,
        )

    def record_activity(self) -> None:
        """
        Record user activity (called on MCP tool/resource/prompt invocations).
        Resets the idle timer.
        """
        self.last_activity = datetime.now()
        logger.debug("Activity recorded, idle timer reset")

    def update_triggers(self, triggers: Set[str]) -> None:
        """
        Update the set of active triggers.

        Args:
            triggers: Set of active trigger IDs (data subscriptions,
                CEP patterns, etc.)
        """
        if triggers != self.active_triggers:
            added = triggers - self.active_triggers
            removed = self.active_triggers - triggers
            if added:
                logger.info("Triggers added: %s", added)
            if removed:
                logger.info("Triggers removed: %s", removed)
            # Defensive copy: do NOT alias the caller's set, otherwise
            # later mutation of the argument would silently change our
            # internal state (and could keep an idle container alive).
            self.active_triggers = set(triggers)
            logger.info("Active triggers: %d", len(self.active_triggers))

    def add_trigger(self, trigger_id: str) -> None:
        """Add a single trigger."""
        if trigger_id not in self.active_triggers:
            self.active_triggers.add(trigger_id)
            logger.info("Trigger added: %s (total: %d)", trigger_id, len(self.active_triggers))

    def remove_trigger(self, trigger_id: str) -> None:
        """Remove a single trigger."""
        if trigger_id in self.active_triggers:
            self.active_triggers.remove(trigger_id)
            logger.info("Trigger removed: %s (total: %d)", trigger_id, len(self.active_triggers))

    def is_idle(self) -> bool:
        """
        Check if container is idle and should shut down.

        Returns:
            True if there are no triggers and the idle timeout is exceeded.
        """
        has_triggers = len(self.active_triggers) > 0
        idle_time = datetime.now() - self.last_activity
        is_past_timeout = idle_time > self.idle_timeout
        if has_triggers:
            logger.debug("Not idle: has %d active triggers", len(self.active_triggers))
            return False
        if not is_past_timeout:
            logger.debug(
                "Not idle: last activity %s ago (timeout: %s)",
                idle_time,
                self.idle_timeout,
            )
            return False
        logger.info(
            "Container is idle: no triggers and %s since last activity", idle_time
        )
        return True

    async def start(self) -> None:
        """Start the lifecycle manager background task."""
        if self._running:
            logger.warning("Lifecycle manager already running")
            return
        self._running = True
        self._check_task = asyncio.create_task(self._check_loop())
        logger.info("Lifecycle manager started")

    async def stop(self) -> None:
        """Stop the lifecycle manager."""
        self._running = False
        if self._check_task:
            self._check_task.cancel()
            try:
                await self._check_task
            except asyncio.CancelledError:
                pass
            # Drop the reference so a subsequent start() gets a clean slate
            # and the finished task can be garbage-collected.
            self._check_task = None
        logger.info("Lifecycle manager stopped")

    async def _check_loop(self) -> None:
        """Background task that periodically checks if container should shut down."""
        while self._running:
            try:
                await asyncio.sleep(self.check_interval)
                if self.is_idle():
                    if self.enable_shutdown:
                        logger.info("Initiating idle shutdown (exit code %d)", EXIT_CODE_IDLE_SHUTDOWN)
                        self._write_exit_code(EXIT_CODE_IDLE_SHUTDOWN)
                        # Give sidecar a moment to see the exit code file
                        await asyncio.sleep(1)
                        # os._exit skips interpreter cleanup; intentional here
                        # because we've already flushed what the sidecar needs.
                        os._exit(EXIT_CODE_IDLE_SHUTDOWN)
                    else:
                        logger.info(
                            "Container is idle but shutdown is disabled (testing mode)"
                        )
            except asyncio.CancelledError:
                logger.info("Check loop cancelled")
                raise
            except Exception as e:
                logger.error("Error in lifecycle check loop: %s", e, exc_info=True)

    def _write_exit_code(self, code: int) -> None:
        """Write exit code to shared file for sidecar to read."""
        try:
            EXIT_CODE_FILE.parent.mkdir(parents=True, exist_ok=True)
            EXIT_CODE_FILE.write_text(str(code))
            logger.debug("Wrote exit code %d to %s", code, EXIT_CODE_FILE)
        except Exception as e:
            # Best effort: the sidecar also observes the process exit code,
            # so a failed write is logged but not fatal.
            logger.warning("Failed to write exit code file: %s", e)

    def setup_signal_handlers(self) -> None:
        """
        Setup signal handlers for graceful shutdown.
        On SIGTERM/SIGINT, exit normally (not with code 42) to allow restart.
        """
        def signal_handler(signum, frame):
            logger.info("Received signal %d, exiting normally", signum)
            sys.exit(0)

        signal.signal(signal.SIGTERM, signal_handler)
        signal.signal(signal.SIGINT, signal_handler)
# Module-level singleton so any part of the application can reach the
# one lifecycle manager for this container.
_lifecycle_manager: Optional[LifecycleManager] = None


def get_lifecycle_manager() -> LifecycleManager:
    """Get or create the global lifecycle manager instance."""
    global _lifecycle_manager
    if _lifecycle_manager is not None:
        return _lifecycle_manager
    # First call: build the singleton from environment configuration.
    env = os.environ
    _lifecycle_manager = LifecycleManager(
        idle_timeout_minutes=int(env.get("IDLE_TIMEOUT_MINUTES", "15")),
        check_interval_seconds=int(env.get("IDLE_CHECK_INTERVAL_SECONDS", "60")),
        enable_shutdown=env.get("ENABLE_IDLE_SHUTDOWN", "true").lower() == "true",
    )
    return _lifecycle_manager


async def start_lifecycle_manager() -> LifecycleManager:
    """Initialize and start the lifecycle manager."""
    mgr = get_lifecycle_manager()
    mgr.setup_signal_handlers()
    await mgr.start()
    return mgr

View File

@@ -0,0 +1,43 @@
# openclaw/auth.py
class MCPAuthMiddleware:
"""Authenticates incoming MCP connections based on configured mode."""
def __init__(self, config: AuthConfig):
self.config = config
self._jwks_client = None # lazy-loaded for platform mode
async def authenticate(self, request) -> AuthContext:
match self.config.mode:
case "local":
# stdio transport or localhost-only binding
# No auth needed — if you can exec into the container,
# you're the user
return AuthContext(user_id=self.config.local_user_id,
source="local")
case "token":
# User-generated API key (standalone remote access)
token = extract_bearer_token(request)
if not verify_token_hash(token, self.config.tokens):
raise AuthError("Invalid API token")
return AuthContext(user_id=self.config.local_user_id,
source="api_key")
case "platform":
# JWT signed by the OpenClaw platform
token = extract_bearer_token(request)
claims = await self._verify_platform_jwt(token)
if claims["sub"] != self.config.expected_user_id:
raise AuthError("User ID mismatch")
return AuthContext(user_id=claims["sub"],
source="platform",
scopes=claims.get("scopes", []))
async def _verify_platform_jwt(self, token: str) -> dict:
if not self._jwks_client:
self._jwks_client = JWKSClient(self.config.platform_jwks_url)
signing_key = await self._jwks_client.get_signing_key_from_jwt(token)
return jwt.decode(token, signing_key.key,
algorithms=["RS256"],
audience="openclaw-mcp")

View File

@@ -0,0 +1,110 @@
# ValidatingAdmissionPolicy to restrict images in dexorder-agents namespace
# Requires Kubernetes 1.30+ (or 1.28+ with feature gate)
# This is the critical security control that prevents arbitrary image execution
# even if the gateway is compromised.
---
apiVersion: admissionregistration.k8s.io/v1
kind: ValidatingAdmissionPolicy
metadata:
name: dexorder-agent-image-policy
spec:
failurePolicy: Fail
matchConstraints:
namespaceSelector:
matchLabels:
dexorder.io/type: agents
resourceRules:
- apiGroups: ["apps"]
apiVersions: ["v1"]
resources: ["deployments"]
operations: ["CREATE", "UPDATE"]
validations:
# Only allow images from our approved registry with agent prefix
- expression: |
object.spec.template.spec.containers.all(c,
c.image.startsWith('ghcr.io/dexorder/agent:') ||
c.image.startsWith('ghcr.io/dexorder/agent-'))
message: "Only approved dexorder agent images are allowed in the agents namespace"
reason: Forbidden
# No privileged containers
- expression: |
object.spec.template.spec.containers.all(c,
!has(c.securityContext) ||
!has(c.securityContext.privileged) ||
c.securityContext.privileged == false)
message: "Privileged containers are not allowed"
reason: Forbidden
# No hostPath volumes
- expression: |
!has(object.spec.template.spec.volumes) ||
object.spec.template.spec.volumes.all(v,
!has(v.hostPath))
message: "hostPath volumes are not allowed"
reason: Forbidden
# No hostNetwork
- expression: |
!has(object.spec.template.spec.hostNetwork) ||
object.spec.template.spec.hostNetwork == false
message: "hostNetwork is not allowed"
reason: Forbidden
# No hostPID
- expression: |
!has(object.spec.template.spec.hostPID) ||
object.spec.template.spec.hostPID == false
message: "hostPID is not allowed"
reason: Forbidden
# Containers must run as non-root
- expression: |
object.spec.template.spec.containers.all(c,
has(c.securityContext) &&
has(c.securityContext.runAsNonRoot) &&
c.securityContext.runAsNonRoot == true)
message: "Containers must run as non-root"
reason: Forbidden
# Must drop all capabilities
- expression: |
object.spec.template.spec.containers.all(c,
has(c.securityContext) &&
has(c.securityContext.capabilities) &&
has(c.securityContext.capabilities.drop) &&
c.securityContext.capabilities.drop.exists(cap, cap == 'ALL'))
message: "Containers must drop all capabilities"
reason: Forbidden
# Read-only root filesystem
- expression: |
object.spec.template.spec.containers.all(c,
has(c.securityContext) &&
has(c.securityContext.readOnlyRootFilesystem) &&
c.securityContext.readOnlyRootFilesystem == true)
message: "Containers must have read-only root filesystem"
reason: Forbidden
# Resource limits must be set
- expression: |
object.spec.template.spec.containers.all(c,
has(c.resources) &&
has(c.resources.limits) &&
has(c.resources.limits.memory) &&
has(c.resources.limits.cpu))
message: "Containers must have resource limits set"
reason: Forbidden
---
apiVersion: admissionregistration.k8s.io/v1
kind: ValidatingAdmissionPolicyBinding
metadata:
name: dexorder-agent-image-policy-binding
spec:
policyName: dexorder-agent-image-policy
validationActions:
- Deny
matchResources:
namespaceSelector:
matchLabels:
dexorder.io/type: agents

View File

@@ -0,0 +1,221 @@
# Example agent deployment with lifecycle sidecar
# This would be created by the gateway for each user
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: agent-user-abc123
namespace: dexorder-agents
labels:
app.kubernetes.io/name: agent
app.kubernetes.io/component: user-agent
dexorder.io/component: agent
dexorder.io/user-id: user-abc123
dexorder.io/deployment: agent-user-abc123
spec:
replicas: 1
selector:
matchLabels:
dexorder.io/user-id: user-abc123
template:
metadata:
labels:
dexorder.io/component: agent
dexorder.io/user-id: user-abc123
dexorder.io/deployment: agent-user-abc123
spec:
serviceAccountName: agent-lifecycle
# Share PID namespace so sidecar can monitor main container
shareProcessNamespace: true
# Security context
securityContext:
runAsNonRoot: true
runAsUser: 1000
fsGroup: 1000
seccompProfile:
type: RuntimeDefault
containers:
# Main agent container
- name: agent
image: ghcr.io/dexorder/agent:latest
imagePullPolicy: Always
# Security context (required by admission policy)
securityContext:
allowPrivilegeEscalation: false
runAsNonRoot: true
runAsUser: 1000
readOnlyRootFilesystem: true
capabilities:
drop:
- ALL
# Resource limits (required by admission policy)
resources:
requests:
memory: "256Mi"
cpu: "100m"
limits:
memory: "1Gi"
cpu: "1000m"
# Environment variables
env:
- name: USER_ID
value: "user-abc123"
- name: IDLE_TIMEOUT_MINUTES
value: "15"
- name: IDLE_CHECK_INTERVAL_SECONDS
value: "60"
- name: ENABLE_IDLE_SHUTDOWN
value: "true"
- name: MCP_SERVER_PORT
value: "3000"
- name: ZMQ_CONTROL_PORT
value: "5555"
# Ports
ports:
- name: mcp
containerPort: 3000
protocol: TCP
- name: zmq-control
containerPort: 5555
protocol: TCP
# Volume mounts
volumeMounts:
- name: agent-data
mountPath: /app/data
- name: tmp
mountPath: /tmp
- name: shared-run
mountPath: /var/run/agent
# Liveness probe (agent's MCP server)
livenessProbe:
httpGet:
path: /health
port: mcp
initialDelaySeconds: 10
periodSeconds: 30
timeoutSeconds: 5
# Readiness probe
readinessProbe:
httpGet:
path: /ready
port: mcp
initialDelaySeconds: 5
periodSeconds: 10
# Lifecycle sidecar
- name: lifecycle-sidecar
image: ghcr.io/dexorder/lifecycle-sidecar:latest
imagePullPolicy: Always
# Security context
securityContext:
allowPrivilegeEscalation: false
runAsNonRoot: true
runAsUser: 1000
readOnlyRootFilesystem: true
capabilities:
drop:
- ALL
# Resource limits
resources:
requests:
memory: "32Mi"
cpu: "10m"
limits:
memory: "64Mi"
cpu: "50m"
# Environment variables (injected via downward API)
env:
- name: NAMESPACE
valueFrom:
fieldRef:
fieldPath: metadata.namespace
- name: DEPLOYMENT_NAME
valueFrom:
fieldRef:
fieldPath: metadata.labels['dexorder.io/deployment']
- name: USER_TYPE
value: "free" # Gateway sets this based on license
- name: MAIN_CONTAINER_PID
value: "1" # NOTE(review): with shareProcessNamespace enabled, PID 1 is normally the pod's pause (infra) container, not the main container — verify what PID the agent process actually gets
# Volume mounts
volumeMounts:
- name: shared-run
mountPath: /var/run/agent
readOnly: true
# Volumes
volumes:
# Persistent data (user files, state)
- name: agent-data
persistentVolumeClaim:
claimName: agent-user-abc123-data
# Temporary writable filesystem (read-only rootfs)
- name: tmp
emptyDir:
medium: Memory
sizeLimit: 128Mi
# Shared between main container and sidecar
- name: shared-run
emptyDir:
medium: Memory
sizeLimit: 1Mi
# Restart policy
restartPolicy: Always
# Termination grace period
terminationGracePeriodSeconds: 30
---
# PVC for agent persistent data
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: agent-user-abc123-data
namespace: dexorder-agents
labels:
dexorder.io/user-id: user-abc123
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: 1Gi
storageClassName: standard # Or your preferred storage class
---
# Service to expose agent MCP endpoint
apiVersion: v1
kind: Service
metadata:
name: agent-user-abc123
namespace: dexorder-agents
labels:
dexorder.io/user-id: user-abc123
spec:
type: ClusterIP
selector:
dexorder.io/user-id: user-abc123
ports:
- name: mcp
port: 3000
targetPort: mcp
protocol: TCP
- name: zmq-control
port: 5555
targetPort: zmq-control
protocol: TCP

View File

@@ -0,0 +1,53 @@
# Resource constraints for the dexorder-agents namespace
# These limits apply regardless of what the gateway requests
---
# LimitRange: per-container defaults and maximums
apiVersion: v1
kind: LimitRange
metadata:
name: agent-limits
namespace: dexorder-agents
spec:
limits:
# Default limits applied if deployment doesn't specify
- type: Container
default:
memory: "512Mi"
cpu: "500m"
defaultRequest:
memory: "256Mi"
cpu: "100m"
# Maximum any single container can request
max:
memory: "2Gi"
cpu: "2000m"
min:
memory: "64Mi"
cpu: "50m"
# PVC size limits
- type: PersistentVolumeClaim
max:
storage: "10Gi"
min:
storage: "100Mi"
---
# ResourceQuota: total namespace limits
# Prevents a compromised gateway from exhausting cluster resources
apiVersion: v1
kind: ResourceQuota
metadata:
name: agent-quota
namespace: dexorder-agents
spec:
hard:
# Total compute limits for all agents combined
requests.cpu: "20"
requests.memory: "40Gi"
limits.cpu: "40"
limits.memory: "80Gi"
# Object count limits
pods: "100"
persistentvolumeclaims: "100"
services: "100"
# Storage limits
requests.storage: "500Gi"

View File

@@ -0,0 +1,65 @@
# RBAC for gateway to CREATE agent deployments only
# Principle of least privilege: gateway can ONLY create deployments/services/PVCs
# in the dexorder-agents namespace. Deletion is handled by the lifecycle sidecar.
# No pods, secrets, exec, or cross-namespace access.
---
apiVersion: v1
kind: ServiceAccount
metadata:
name: gateway
namespace: dexorder-system
---
# Role scoped to dexorder-agents namespace only
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
name: agent-creator
namespace: dexorder-agents
rules:
# Deployments: create, read, and update — no delete (deletion handled by sidecar)
- apiGroups: ["apps"]
resources: ["deployments"]
verbs: ["create", "get", "list", "watch", "patch", "update"]
# PVCs: create and read (deletion handled by sidecar)
- apiGroups: [""]
resources: ["persistentvolumeclaims"]
verbs: ["create", "get", "list", "watch"]
# Services: create and manage agent MCP endpoints
- apiGroups: [""]
resources: ["services"]
verbs: ["create", "get", "list", "watch", "patch", "update"]
# Read-only pod access for status checks (no exec!)
- apiGroups: [""]
resources: ["pods"]
verbs: ["get", "list", "watch"]
# Pod logs for debugging (read-only)
- apiGroups: [""]
resources: ["pods/log"]
verbs: ["get"]
# Explicitly NOT included:
# - deployments/delete - handled by lifecycle sidecar
# - pvc/delete - handled by lifecycle sidecar
# - services/delete - handled by lifecycle sidecar
# - pods (create/delete) - must go through deployments
# - pods/exec, pods/attach - no shell access
# - secrets, configmaps - no credential access
# - any resources in other namespaces
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: gateway-agent-creator
namespace: dexorder-agents
subjects:
- kind: ServiceAccount
name: gateway
namespace: dexorder-system
roleRef:
kind: Role
name: agent-creator
apiGroup: rbac.authorization.k8s.io

View File

@@ -1,3 +1,6 @@
# Runtime and security initialization for dexorder AI platform
# Apply this first: kubectl apply -f init.yaml
---
apiVersion: node.k8s.io/v1 apiVersion: node.k8s.io/v1
kind: RuntimeClass kind: RuntimeClass
metadata: metadata:

View File

@@ -1,5 +1,26 @@
apiVersion: kustomize.config.k8s.io/v1beta1 apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization kind: Kustomization
resources: [] resources:
# ingress.yaml - removed until we have services to expose # Core initialization (runtime classes)
- init.yaml
# Namespace definitions with PodSecurity labels
- namespaces.yaml
# RBAC for gateway to create agents (creation only)
- gateway-rbac.yaml
# RBAC for lifecycle sidecar (self-deletion)
- lifecycle-sidecar-rbac.yaml
# Admission policies (image restriction, security requirements)
- admission-policy.yaml
# Resource quotas and limits for agents namespace
- agent-quotas.yaml
# Network isolation policies
- network-policies.yaml
# Gateway service (uncomment when ready)
# - gateway.yaml
# Example agent deployment (for reference, not applied by default)
# - agent-deployment-example.yaml
# Services (uncomment as needed)
# - backend.yaml
# - web.yaml
# - ingress.yaml

View File

@@ -0,0 +1,53 @@
# RBAC for lifecycle sidecar - allows self-deletion only
# Each agent pod gets this ServiceAccount and can only delete its own deployment
---
apiVersion: v1
kind: ServiceAccount
metadata:
name: agent-lifecycle
namespace: dexorder-agents
---
# Role allowing deletion of deployments and PVCs
# This is scoped to the dexorder-agents namespace
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
name: agent-self-delete
namespace: dexorder-agents
rules:
# Allow getting and deleting deployments
- apiGroups: ["apps"]
resources: ["deployments"]
verbs: ["get", "delete"]
# Allow getting and deleting PVCs (for anonymous users)
- apiGroups: [""]
resources: ["persistentvolumeclaims"]
verbs: ["get", "delete"]
# Read-only access to pods (for status checking)
- apiGroups: [""]
resources: ["pods"]
verbs: ["get", "list"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: agent-self-delete
namespace: dexorder-agents
subjects:
- kind: ServiceAccount
name: agent-lifecycle
namespace: dexorder-agents
roleRef:
kind: Role
name: agent-self-delete
apiGroup: rbac.authorization.k8s.io
---
# Additional security: ValidatingWebhookConfiguration to restrict deletion
# This ensures sidecars can only delete their own deployment
# Requires a validating webhook server (can be added later)
# For now, we rely on:
# 1. Sidecar only knowing its own deployment name (from env)
# 2. RBAC limiting to dexorder-agents namespace
# 3. Admission policy restricting deployment creation (already defined)

View File

@@ -0,0 +1,24 @@
# Namespace definitions for dexorder AI platform
# - dexorder-system: gateway, flink, kafka, and other infrastructure
# - dexorder-agents: user agent containers (isolated, restricted)
---
apiVersion: v1
kind: Namespace
metadata:
name: dexorder-system
labels:
app.kubernetes.io/part-of: dexorder
dexorder.io/type: system
---
apiVersion: v1
kind: Namespace
metadata:
name: dexorder-agents
labels:
app.kubernetes.io/part-of: dexorder
dexorder.io/type: agents
# Enforce restricted pod security standards
pod-security.kubernetes.io/enforce: restricted
pod-security.kubernetes.io/enforce-version: latest
pod-security.kubernetes.io/audit: restricted
pod-security.kubernetes.io/warn: restricted

View File

@@ -0,0 +1,121 @@
# Network policies for agent isolation
# Agents can only communicate with specific services, not with each other
# or with the Kubernetes API
---
# Default deny all ingress and egress in agents namespace
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
name: default-deny-all
namespace: dexorder-agents
spec:
podSelector: {}
policyTypes:
- Ingress
- Egress
---
# Allow agents to receive connections from gateway (MCP)
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
name: allow-gateway-ingress
namespace: dexorder-agents
spec:
podSelector:
matchLabels:
dexorder.io/component: agent
policyTypes:
- Ingress
ingress:
- from:
- namespaceSelector:
matchLabels:
dexorder.io/type: system
podSelector:
matchLabels:
app: gateway
ports:
- protocol: TCP
port: 3000 # MCP server port
- protocol: TCP
port: 5555 # ZeroMQ control channel
---
# Allow agents to connect to required services
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
name: allow-agent-egress
namespace: dexorder-agents
spec:
podSelector:
matchLabels:
dexorder.io/component: agent
policyTypes:
- Egress
egress:
# DNS resolution (required)
- to:
- namespaceSelector: {}
podSelector:
matchLabels:
k8s-app: kube-dns
ports:
- protocol: UDP
port: 53
- protocol: TCP
port: 53
# Gateway in system namespace (for callbacks)
- to:
- namespaceSelector:
matchLabels:
dexorder.io/type: system
podSelector:
matchLabels:
app: gateway
ports:
- protocol: TCP
port: 8080
# Kafka/Redpanda for data subscriptions
- to:
- namespaceSelector:
matchLabels:
dexorder.io/type: system
podSelector:
matchLabels:
app: redpanda
ports:
- protocol: TCP
port: 9092
# External HTTPS (for exchange APIs, LLM APIs)
- to:
- ipBlock:
cidr: 0.0.0.0/0
except:
# Block all RFC-1918 private/cluster-internal ranges (this also covers the k8s API server)
- 10.0.0.0/8
- 172.16.0.0/12
- 192.168.0.0/16
ports:
- protocol: TCP
port: 443
---
# System namespace: allow ingress from agents
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
name: allow-agent-callbacks
namespace: dexorder-system
spec:
podSelector:
matchLabels:
app: gateway
policyTypes:
- Ingress
ingress:
- from:
- namespaceSelector:
matchLabels:
dexorder.io/type: agents
ports:
- protocol: TCP
port: 8080

View File

@@ -0,0 +1,97 @@
# Dev admission policy: allow local registry images
# In dev, we also allow images from localhost/minikube registry
---
apiVersion: admissionregistration.k8s.io/v1
kind: ValidatingAdmissionPolicy
metadata:
name: dexorder-agent-image-policy
spec:
failurePolicy: Fail
matchConstraints:
namespaceSelector:
matchLabels:
dexorder.io/type: agents
resourceRules:
- apiGroups: ["apps"]
apiVersions: ["v1"]
resources: ["deployments"]
operations: ["CREATE", "UPDATE"]
validations:
# Allow local dev images in addition to production registry
- expression: |
object.spec.template.spec.containers.all(c,
c.image.startsWith('ghcr.io/dexorder/agent:') ||
c.image.startsWith('ghcr.io/dexorder/agent-') ||
c.image.startsWith('localhost:5000/dexorder/agent') ||
c.image.startsWith('dexorder/agent'))
message: "Only approved dexorder agent images are allowed"
reason: Forbidden
# No privileged containers
- expression: |
object.spec.template.spec.containers.all(c,
!has(c.securityContext) ||
!has(c.securityContext.privileged) ||
c.securityContext.privileged == false)
message: "Privileged containers are not allowed"
reason: Forbidden
# No hostPath volumes
- expression: |
!has(object.spec.template.spec.volumes) ||
object.spec.template.spec.volumes.all(v,
!has(v.hostPath))
message: "hostPath volumes are not allowed"
reason: Forbidden
# No hostNetwork
- expression: |
!has(object.spec.template.spec.hostNetwork) ||
object.spec.template.spec.hostNetwork == false
message: "hostNetwork is not allowed"
reason: Forbidden
# No hostPID
- expression: |
!has(object.spec.template.spec.hostPID) ||
object.spec.template.spec.hostPID == false
message: "hostPID is not allowed"
reason: Forbidden
# Containers must run as non-root
- expression: |
object.spec.template.spec.containers.all(c,
has(c.securityContext) &&
has(c.securityContext.runAsNonRoot) &&
c.securityContext.runAsNonRoot == true)
message: "Containers must run as non-root"
reason: Forbidden
# Must drop all capabilities
- expression: |
object.spec.template.spec.containers.all(c,
has(c.securityContext) &&
has(c.securityContext.capabilities) &&
has(c.securityContext.capabilities.drop) &&
c.securityContext.capabilities.drop.exists(cap, cap == 'ALL'))
message: "Containers must drop all capabilities"
reason: Forbidden
# Read-only root filesystem
- expression: |
object.spec.template.spec.containers.all(c,
has(c.securityContext) &&
has(c.securityContext.readOnlyRootFilesystem) &&
c.securityContext.readOnlyRootFilesystem == true)
message: "Containers must have read-only root filesystem"
reason: Forbidden
# Resource limits must be set
- expression: |
object.spec.template.spec.containers.all(c,
has(c.resources) &&
has(c.resources.limits) &&
has(c.resources.limits.memory) &&
has(c.resources.limits.cpu))
message: "Containers must have resource limits set"
reason: Forbidden

View File

@@ -0,0 +1,19 @@
# Dev/minikube resource quota overrides
# Smaller limits appropriate for local development
---
apiVersion: v1
kind: ResourceQuota
metadata:
name: agent-quota
namespace: dexorder-agents
spec:
hard:
# Reduced for minikube
requests.cpu: "4"
requests.memory: "8Gi"
limits.cpu: "8"
limits.memory: "16Gi"
pods: "20"
persistentvolumeclaims: "20"
services: "20"
requests.storage: "50Gi"

View File

@@ -1,16 +1,20 @@
apiVersion: kustomize.config.k8s.io/v1beta1 apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization kind: Kustomization
namespace: default # Note: namespaces are defined in base; workloads go to dexorder-system
namespace: dexorder-system
# Base resources # Base resources (includes security policies)
resources: resources:
- ../base - ../base
- infrastructure.yaml - infrastructure.yaml
# No patches needed currently # Dev-specific patches
patches: [] patches:
# ingress-dev.yaml - removed until we have services to expose # Reduced resource quotas for minikube
- path: agent-quotas-patch.yaml
# Allow local registry images
- path: admission-policy-patch.yaml
# ConfigMaps for service configs # ConfigMaps for service configs
configMapGenerator: configMapGenerator:

View File

@@ -1,9 +1,10 @@
apiVersion: kustomize.config.k8s.io/v1beta1 apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization kind: Kustomization
namespace: default # Note: namespaces are defined in base; workloads go to dexorder-system
namespace: dexorder-system
# Base resources (backend, web, ingress, init/gVisor) # Base resources (includes all security policies)
resources: resources:
- ../base - ../base
@@ -38,3 +39,10 @@ images:
newTag: latest newTag: latest
- name: dexorder/ai-web - name: dexorder/ai-web
newTag: latest newTag: latest
- name: ghcr.io/dexorder/gateway
newTag: latest
- name: lifecycle-sidecar
newName: ghcr.io/dexorder/lifecycle-sidecar
newTag: latest
- name: ghcr.io/dexorder/agent
newTag: latest

21
doc/agent_harness_flow.md Normal file
View File

@@ -0,0 +1,21 @@
┌─────────────────────────────────────────────────┐
│ Agent Harness (your servers) │
│ │
│ on_message(user_id, message): │
│ 1. Look up user's MCP endpoint from Postgres │
│ 2. mcp.call("get_context_summary") │
│ 3. mcp.call("get_conversation_history", 20) │
│ 4. Build prompt: │
│ system = BASE_PROMPT │
│ + context_summary │
│ + user_agent_prompt (from MCP) │
│ messages = history + new message │
│ 5. LLM call (your API key) │
│ 6. While LLM wants tool calls: │
│ - Platform tools → handle locally │
│ - User tools → proxy to MCP │
│ - LLM call again with results │
│ 7. mcp.call("save_message", ...) │
│ 8. Return response to user │
│ │
└─────────────────────────────────────────────────┘

View File

@@ -1,9 +1,11 @@
Generally use skills instead of subagents, except for the analysis subagent. Generally use skills instead of subagents, except for the analysis subagent.
## User-specific files ## User-specific files and tools
* Indicators * Indicators
* Strategies * Strategies
* Watchlists * Watchlists
* Preferences * Preferences
* Trading style * Trading style
* Charting / colors * Charting / colors
* Executors (really just sub-strategies)
* tactical-level order generators e.g. TWAP, iceberg, etc.

View File

@@ -1,18 +0,0 @@
This file describes all the configuration options used by all components. All configuration is divided into regular config and secrets, and k8s will mount either or both as a yaml file accessible to the process.
# Configuration
* `flink_hostname`
* ... various zmq ports for flink ...
* `iceberg_catalog_hostname`
* `iceberg_catalog_port`
* `iceberg_catalog_database`
* etc
# Secrets
* `iceberg_catalog_username`
* `iceberg_catalog_password`
* etc.

View File

@@ -0,0 +1,313 @@
# Container Lifecycle Management
## Overview
User agent containers self-manage their lifecycle to optimize resource usage. Containers automatically shut down when idle (no triggers + no recent activity) and clean themselves up using a lifecycle sidecar.
## Architecture
```
┌──────────────────────────────────────────────────────────┐
│ Agent Pod │
│ ┌───────────────────┐ ┌──────────────────────┐ │
│ │ Agent Container │ │ Lifecycle Sidecar │ │
│ │ ─────────────── │ │ ────────────────── │ │
│ │ │ │ │ │
│ │ Lifecycle Manager │ │ Watches exit code │ │
│ │ - Track activity │ │ - Detects exit 42 │ │
│ │ - Track triggers │ │ - Calls k8s API │ │
│ │ - Exit 42 if idle │ │ - Deletes deployment │ │
│ └───────────────────┘ └──────────────────────┘ │
│ │ │ │
│ │ writes exit_code │ │
│ └────►/var/run/agent/exit_code │
│ │ │
└───────────────────────────────────────┼──────────────────┘
▼ k8s API (RBAC)
┌─────────────────────┐
│ Delete Deployment │
│ Delete PVC (if anon)│
└─────────────────────┘
```
## Components
### 1. Lifecycle Manager (Python)
**Location**: `client-py/dexorder/lifecycle_manager.py`
Runs inside the agent container and tracks:
- **Activity**: MCP tool/resource/prompt calls reset the idle timer
- **Triggers**: Data subscriptions, CEP patterns, etc.
- **Idle state**: No triggers + idle timeout exceeded
**Configuration** (via environment variables):
- `IDLE_TIMEOUT_MINUTES`: Minutes before shutdown (default: 15)
- `IDLE_CHECK_INTERVAL_SECONDS`: Check frequency (default: 60)
- `ENABLE_IDLE_SHUTDOWN`: Enable/disable shutdown (default: true)
**Usage in agent code**:
```python
from dexorder.lifecycle_manager import get_lifecycle_manager
# On startup
manager = get_lifecycle_manager()
await manager.start()
# On MCP calls (tool/resource/prompt)
manager.record_activity()
# When triggers change
manager.add_trigger("data_sub_BTC_USDT")
manager.remove_trigger("data_sub_BTC_USDT")
# Or batch update
manager.update_triggers({"trigger_1", "trigger_2"})
```
**Exit behavior**:
- Idle shutdown: Exit with code `42`
- Signal (SIGTERM/SIGINT): Exit with code `0` (allows restart)
- Errors/crashes: Exit with error code (allows restart)
### 2. Lifecycle Sidecar (Go)
**Location**: `lifecycle-sidecar/`
Runs alongside the agent container with shared PID namespace. Monitors the main container process and:
- On exit code `42`: Deletes deployment (and PVC if anonymous user)
- On any other exit code: Exits with same code (k8s restarts pod)
**Configuration** (via environment, injected by downward API):
- `NAMESPACE`: Pod's namespace
- `DEPLOYMENT_NAME`: Deployment name (from pod label)
- `USER_TYPE`: License tier (`anonymous`, `free`, `paid`, `enterprise`)
- `MAIN_CONTAINER_PID`: PID of main container (default: 1)
**RBAC**: Has permission to delete deployments and PVCs **only in dexorder-agents namespace**. Cannot delete other deployments due to:
1. Only knows its own deployment name (from env)
2. RBAC scoped to namespace
3. No cross-pod communication
### 3. Gateway (TypeScript)
**Location**: `gateway/src/harness/agent-harness.ts`
Creates agent deployments when users connect. Has permissions to:
- ✅ Create deployments, services, PVCs
- ✅ Read pod status and logs
- ✅ Update deployments (e.g., resource limits)
- ❌ Delete deployments (handled by sidecar)
- ❌ Exec into pods
- ❌ Access secrets
## Lifecycle States
```
┌─────────────┐
│ CREATED │ ← Gateway creates deployment
└──────┬──────┘
┌─────────────┐
│ RUNNING │ ← User interacts, has triggers
└──────┬──────┘
┌─────────────┐
│ IDLE │ ← No triggers + timeout exceeded
└──────┬──────┘
┌─────────────┐
│ SHUTDOWN │ ← Exit code 42
└──────┬──────┘
┌─────────────┐
│ DELETED │ ← Sidecar deletes deployment
└─────────────┘
```
## Idle Detection Logic
Container is **IDLE** when:
1. `active_triggers.isEmpty()` AND
2. `(now - last_activity) > idle_timeout`
Container is **ACTIVE** when:
1. Has any active triggers (data subscriptions, CEP patterns, etc.) OR
2. Recent user activity (MCP calls within timeout)
## Cleanup Policies by License Tier
| User Type | Idle Timeout | PVC Policy | Notes |
|--------------|--------------|------------|-------|
| Anonymous | 15 minutes | Delete | Ephemeral, no data retention |
| Free | 15 minutes | Retain | Can resume session |
| Paid | 60 minutes | Retain | Longer grace period |
| Enterprise | No shutdown | Retain | Always-on containers |
Configured via `USER_TYPE` env var in deployment.
## Security
### Principle of Least Privilege
**Gateway**:
- Can create agent resources
- Cannot delete agent resources
- Cannot access other namespaces
- Cannot exec into pods
**Lifecycle Sidecar**:
- Can delete its own deployment only
- Cannot delete other deployments
- Scoped to dexorder-agents namespace
- No exec, no secrets access
### Admission Control
All deployments in `dexorder-agents` namespace are subject to:
- Image allowlist (only approved images)
- Security context enforcement (non-root, drop caps, read-only rootfs)
- Resource limits required
- PodSecurity standards (restricted profile)
See `deploy/k8s/base/admission-policy.yaml`
### Network Isolation
Agents are network-isolated via NetworkPolicy:
- Can connect to gateway (MCP)
- Can connect to Redpanda (data streams)
- Can make outbound HTTPS (exchanges, LLM APIs)
- Cannot access k8s API
- Cannot access system namespace
- Cannot access other agent pods
See `deploy/k8s/base/network-policies.yaml`
## Deployment
### 1. Apply Security Policies
```bash
kubectl apply -k deploy/k8s/dev # or prod
```
This creates:
- Namespaces (`dexorder-system`, `dexorder-agents`)
- RBAC (gateway, lifecycle sidecar)
- Admission policies
- Network policies
- Resource quotas
### 2. Build and Push Lifecycle Sidecar
```bash
cd lifecycle-sidecar
docker build -t ghcr.io/dexorder/lifecycle-sidecar:latest .
docker push ghcr.io/dexorder/lifecycle-sidecar:latest
```
### 3. Gateway Creates Agent Deployments
When a user connects, the gateway creates:
- Deployment with agent + sidecar
- PVC for persistent data
- Service for MCP endpoint
See `deploy/k8s/base/agent-deployment-example.yaml` for template.
## Testing
### Test Lifecycle Manager Locally
```python
from dexorder.lifecycle_manager import LifecycleManager
# Disable actual shutdown for testing
manager = LifecycleManager(
idle_timeout_minutes=1,
check_interval_seconds=10,
enable_shutdown=False # Only log, don't exit
)
await manager.start()
# Simulate activity
manager.record_activity()
# Simulate triggers
manager.add_trigger("test_trigger")
await asyncio.sleep(70) # Wait past timeout
manager.remove_trigger("test_trigger")
await asyncio.sleep(70) # Should detect idle
await manager.stop()
```
### Test Sidecar Locally
```bash
# Build
cd lifecycle-sidecar
go build -o lifecycle-sidecar main.go
# Run (requires k8s config)
export NAMESPACE=dexorder-agents
export DEPLOYMENT_NAME=agent-test
export USER_TYPE=free
./lifecycle-sidecar
```
### Integration Test
1. Deploy test agent with sidecar
2. Verify agent starts and is healthy
3. Stop sending MCP calls and remove all triggers
4. Wait for idle timeout + check interval
5. Verify deployment is deleted
## Troubleshooting
### Container not shutting down when idle
Check logs:
```bash
kubectl logs -n dexorder-agents agent-user-abc123 -c agent
```
Verify:
- `ENABLE_IDLE_SHUTDOWN=true`
- No active triggers: `manager.active_triggers` should be empty
- Idle timeout exceeded
### Sidecar not deleting deployment
Check sidecar logs:
```bash
kubectl logs -n dexorder-agents agent-user-abc123 -c lifecycle-sidecar
```
Verify:
- Exit code file exists: `/var/run/agent/exit_code` contains `42`
- RBAC permissions: `kubectl auth can-i delete deployments --as=system:serviceaccount:dexorder-agents:agent-lifecycle -n dexorder-agents`
- Deployment name matches: Check `DEPLOYMENT_NAME` env var
### Gateway can't create deployments
Check gateway logs and verify:
- ServiceAccount exists: `kubectl get sa gateway -n dexorder-system`
- RoleBinding exists: `kubectl get rolebinding gateway-agent-creator -n dexorder-agents`
- Admission policy allows image: Check image name matches allowlist in `admission-policy.yaml`
## Future Enhancements
1. **Graceful shutdown notifications**: Warn users before shutdown via websocket
2. **Predictive scaling**: Keep frequently-used containers warm
3. **Tiered storage**: Move old PVCs to cheaper storage class
4. **Metrics**: Expose lifecycle metrics (idle rate, shutdown count, etc.)
5. **Cost allocation**: Track resource usage per user/license tier

View File

@@ -0,0 +1,286 @@
# Gateway Container Creation
## Overview
The gateway automatically provisions user agent containers when users authenticate. This ensures each user has their own isolated environment running their MCP server with persistent storage.
## Authentication Flow with Container Creation
```
User connects (WebSocket/Telegram)
Send "Authenticating..." status
Verify token/channel link
Lookup user license from DB
Send "Starting workspace..." status
┌────────────────────────────────────┐
│ ContainerManager.ensureRunning() │
│ ┌──────────────────────────────┐ │
│ │ Check if deployment exists │ │
│ └──────────────────────────────┘ │
│ ↓ │
│ Does it exist? │
│ ↙ ↘ │
│ Yes No │
│ │ │ │
│ │ ┌──────────────────┐ │
│ │ │ Create deployment│ │
│ │ │ Create PVC │ │
│ │ │ Create service │ │
│ │ └──────────────────┘ │
│ │ │ │
│ └────────────┘ │
│ ↓ │
│ Wait for deployment ready │
│ (polls every 2s, timeout 2min) │
│ ↓ │
│ Compute MCP endpoint URL │
│ (internal k8s service DNS) │
└────────────────────────────────────┘
Update license.mcpServerUrl
Send "Connected" status
Initialize AgentHarness
Connect to user's MCP server
Ready for messages
```
## Container Naming Convention
All resources follow a consistent naming pattern based on `userId`:
```typescript
userId: "user-abc123"
deploymentName: "agent-user-abc123"
serviceName: "agent-user-abc123"
pvcName: "agent-user-abc123-data"
mcpEndpoint: "http://agent-user-abc123.dexorder-agents.svc.cluster.local:3000"
```
User IDs are sanitized to be Kubernetes-compliant (lowercase alphanumeric + hyphens).
## Templates by License Tier
Templates are located in `gateway/src/k8s/templates/`:
- `free-tier.yaml`
- `pro-tier.yaml`
- `enterprise-tier.yaml`
### Variable Substitution
Templates use simple string replacement:
- `{{userId}}` - User ID
- `{{deploymentName}}` - Computed deployment name
- `{{serviceName}}` - Computed service name
- `{{pvcName}}` - Computed PVC name
- `{{agentImage}}` - Agent container image (from env)
- `{{sidecarImage}}` - Lifecycle sidecar image (from env)
- `{{storageClass}}` - Kubernetes storage class (from env)
### Resource Limits
| Tier | Memory Request | Memory Limit | CPU Request | CPU Limit | Storage | Idle Timeout |
|------|----------------|--------------|-------------|-----------|---------|--------------|
| **Free** | 256Mi | 512Mi | 100m | 500m | 1Gi | 15min |
| **Pro** | 512Mi | 2Gi | 250m | 2000m | 10Gi | 60min |
| **Enterprise** | 1Gi | 4Gi | 500m | 4000m | 50Gi | Never (shutdown disabled) |
## Components
### KubernetesClient (`gateway/src/k8s/client.ts`)
Low-level k8s API wrapper:
- `deploymentExists(name)` - Check if deployment exists
- `createAgentDeployment(spec)` - Create deployment/service/PVC from template
- `waitForDeploymentReady(name, timeout)` - Poll until ready
- `getServiceEndpoint(name)` - Get service URL
- `deleteAgentDeployment(userId)` - Cleanup (for testing)
Static helpers:
- `getDeploymentName(userId)` - Generate deployment name
- `getServiceName(userId)` - Generate service name
- `getPvcName(userId)` - Generate PVC name
- `getMcpEndpoint(userId, namespace)` - Compute internal service URL
### ContainerManager (`gateway/src/k8s/container-manager.ts`)
High-level orchestration:
- `ensureContainerRunning(userId, license)` - Main entry point
- Returns: `{ mcpEndpoint, wasCreated }`
- Creates deployment if missing
- Waits for ready state
- Returns endpoint URL
- `getContainerStatus(userId)` - Check status without creating
- `deleteContainer(userId)` - Manual cleanup
### Authenticator (`gateway/src/auth/authenticator.ts`)
Updated to call container manager:
- `authenticateWebSocket()` - Calls `ensureContainerRunning()` before returning `AuthContext`
- `authenticateTelegram()` - Same for Telegram webhooks
### WebSocketHandler (`gateway/src/channels/websocket-handler.ts`)
Multi-phase connection protocol:
1. Send `{type: 'status', status: 'authenticating'}`
2. Authenticate (may take 30-120s if creating container)
3. Send `{type: 'status', status: 'initializing'}`
4. Initialize agent harness
5. Send `{type: 'connected', ...}`
This gives the client visibility into the startup process.
## Configuration
Environment variables:
```bash
# Kubernetes
KUBERNETES_NAMESPACE=dexorder-agents
KUBERNETES_IN_CLUSTER=true # false for local dev
KUBERNETES_CONTEXT=minikube # for local dev only
# Container images
AGENT_IMAGE=ghcr.io/dexorder/agent:latest
SIDECAR_IMAGE=ghcr.io/dexorder/lifecycle-sidecar:latest
# Storage
AGENT_STORAGE_CLASS=standard
```
## Security
The gateway uses a restricted ServiceAccount with RBAC:
**Can do:**
- ✅ Create deployments in `dexorder-agents` namespace
- ✅ Create services in `dexorder-agents` namespace
- ✅ Create PVCs in `dexorder-agents` namespace
- ✅ Read pod status and logs (debugging)
- ✅ Update deployments (future: resource scaling)
**Cannot do:**
- ❌ Delete deployments (handled by lifecycle sidecar)
- ❌ Delete PVCs (handled by lifecycle sidecar)
- ❌ Exec into pods
- ❌ Access secrets or configmaps
- ❌ Create resources in other namespaces
- ❌ Access Kubernetes API from agent containers (blocked by NetworkPolicy)
See `deploy/k8s/base/gateway-rbac.yaml` for full configuration.
## Lifecycle
### Container Creation (Gateway)
- User authenticates
- Gateway checks if deployment exists
- If missing, creates from template
- Waits for ready (2min timeout)
- Returns MCP endpoint
### Container Deletion (Lifecycle Sidecar)
- Container tracks activity and triggers
- When idle (no triggers + timeout), exits with code 42
- Sidecar detects exit code 42
- Sidecar deletes deployment + optional PVC via k8s API
- Gateway creates fresh container on next authentication
See `doc/container_lifecycle_management.md` for full lifecycle details.
## Error Handling
| Error | Gateway Action | User Experience |
|-------|----------------|-----------------|
| Deployment creation fails | Log error, return auth failure | "Authentication failed" |
| Wait timeout (image pull, etc.) | Log warning, return 503 | "Service unavailable, retry" |
| Service not found | Retry with backoff | Transparent retry |
| MCP connection fails | Return error | "Failed to connect to workspace" |
| Existing deployment not ready | Wait 30s, continue if still not ready | May connect to partially-ready container |
## Local Development
For local development (outside k8s):
1. Start minikube:
```bash
minikube start
minikube addons enable storage-provisioner
```
2. Apply security policies:
```bash
kubectl apply -k deploy/k8s/dev
```
3. Configure gateway for local k8s:
```bash
# .env
KUBERNETES_IN_CLUSTER=false
KUBERNETES_CONTEXT=minikube
KUBERNETES_NAMESPACE=dexorder-agents
```
4. Run gateway:
```bash
cd gateway
npm run dev
```
5. Connect via WebSocket:
```bash
wscat -c "ws://localhost:3000/ws/chat" -H "Authorization: Bearer your-jwt"
```
The gateway will create deployments in minikube. View with:
```bash
kubectl get deployments -n dexorder-agents
kubectl get pods -n dexorder-agents
kubectl logs -n dexorder-agents agent-user-abc123 -c agent
```
## Production Deployment
1. Build and push gateway image:
```bash
cd gateway
docker build -t ghcr.io/dexorder/gateway:latest .
docker push ghcr.io/dexorder/gateway:latest
```
2. Deploy to k8s:
```bash
kubectl apply -k deploy/k8s/prod
```
3. Gateway runs in `dexorder-system` namespace
4. Creates agent containers in `dexorder-agents` namespace
5. Admission policies enforce image allowlist and security constraints
## Monitoring
Useful metrics to track:
- Container creation latency (time from auth to ready)
- Container creation failure rate
- Active containers by license tier
- Resource usage per tier
- Idle shutdown rate
These can be exported via Prometheus or logged to monitoring service.
## Future Enhancements
1. **Pre-warming**: Create containers for active users before they connect
2. **Image updates**: Handle agent image version migrations with user consent
3. **Multi-region**: Geo-distributed container placement
4. **Cost tracking**: Per-user resource usage and billing
5. **Auto-scaling**: Scale down to 0 replicas instead of deletion (faster restart)
6. **Container pools**: Shared warm containers for anonymous users

View File

@@ -0,0 +1,80 @@
Mode A: Platform Harness → Hosted Container (internal)
Auth: mTLS + platform-signed user claim
Network: k8s internal, never hits the internet
Mode B: Platform Harness → External User Container (remote)
Auth: OAuth2 token issued by your platform
Network: public internet, TLS required
Mode C: Third-party MCP Client → External User Container (standalone)
Auth: User-managed API key or local-only (no network)
Network: localhost or user's own network
┌──────────────────────────────────────────────────────────┐
│ Platform (Postgres) │
│ │
│ users │
│ ├── id, email, password_hash, plan_tier │
│ │ │
│ containers │
│ ├── user_id │
│ ├── type: "hosted" | "external" │
│ ├── mcp_endpoint: "internal-svc:3100" | "https://..." │
│ ├── auth_method: "mtls" | "platform_token" | "api_key" │
│ └── public_key_fingerprint (for pinning external certs) │
│ │
│ api_tokens │
│ ├── user_id │
│ ├── token_hash │
│ ├── scopes: ["mcp:tools", "mcp:resources", "data:read"] │
│ ├── expires_at │
│ └── issued_for: "platform_harness" | "user_direct" │
│ │
└──────────────────────────────────────────────────────────┘
## Mode A
Harness ──mTLS──▶ k8s Service ──▶ User Container MCP
Validates: source is platform namespace
Extracts: user_id from forwarded header
## Mode B
Registration flow (one-time):
1. User provides their MCP endpoint URL in platform settings
2. Platform generates a scoped token (JWT, short-lived, auto-refreshed)
3. User configures their MCP server to accept tokens signed by your platform
4. Platform stores the endpoint + auth method
Runtime:
┌──────────┐ HTTPS + Bearer token ┌────────────────────┐
│ Harness │ ─────────────────────────▶ │ External MCP Server│
│ │ Authorization: │ │
│ │ Bearer <platform_jwt> │ Validates: │
│ │ │ - JWT signature │
│ │ │ (your public │
│ │ │ key, JWKS) │
│ │ │ - user_id claim │
│ │ │ matches self │
│ │ │ - not expired │
└──────────┘ └────────────────────┘
## Mode C
```yaml
# openclaw/config.yaml
auth:
# For local-only use (Claude Desktop, Cursor, etc via stdio)
mode: "local" # no network auth needed
# OR for remote access
mode: "token"
tokens:
- name: "my-laptop"
hash: "sha256:..." # generated by `openclaw token create`
# OR for platform integration
mode: "platform"
platform_jwks_url: "https://api.openclaw.io/.well-known/jwks.json"
expected_user_id: "user_abc123"
```

View File

@@ -0,0 +1,29 @@
MCP Tools (User Container)
├── Memory
│ ├── get_conversation_history(limit)
│ ├── save_message(role, content)
│ ├── search_memory(query) ← semantic search over past conversations
│ └── get_context_summary() ← "who is this user, what do they care about"
├── Strategies & Indicators
│ ├── list_strategies()
│ ├── read_strategy(name)
│ ├── write_strategy(name, code)
│ ├── list_indicators()
│ ├── read_indicator(name)
│ ├── write_indicator(name, code)
│ └── run_backtest(strategy, params)
├── Preferences
│ ├── get_preferences()
│ ├── set_preference(key, value)
│ └── get_agent_prompt() ← user's custom system prompt additions
├── Trading
│ ├── get_watchlist()
│ ├── execute_trade(params)
│ ├── get_positions()
│ └── get_trade_history()
└── Sandbox
└── run_python(code) ← datascience toolset, matplotlib, etc.

472
doc/user_mcp_resources.md Normal file
View File

@@ -0,0 +1,472 @@
# User MCP Server - Resource Architecture
The user's MCP server container owns **all** conversation history, RAG, and contextual data. The platform gateway is a thin, stateless orchestrator that only holds the Anthropic API key.
## Architecture Principle
**User Container = Fat Context**
- Conversation history (PostgreSQL/SQLite)
- RAG system (embeddings, vector search)
- User preferences and custom prompts
- Trading context (positions, watchlists, alerts)
- All user-specific data
**Platform Gateway = Thin Orchestrator**
- Anthropic API key (platform pays for LLM)
- Session management (WebSocket/Telegram connections)
- MCP client connection pooling
- Tool routing (platform vs user tools)
- **Zero conversation state stored**
## MCP Resources for Context Injection
Resources are **read-only** data sources that provide context to the LLM. They're fetched before each Claude API call and embedded in the conversation.
### Standard Context Resources
#### 1. `context://user-profile`
**Purpose:** User's trading background and preferences
**MIME Type:** `text/plain`
**Example Content:**
```
User Profile:
- Trading experience: Intermediate
- Preferred timeframes: 1h, 4h, 1d
- Risk tolerance: Medium
- Focus: Swing trading with technical indicators
- Favorite indicators: RSI, MACD, Bollinger Bands
- Active pairs: BTC/USDT, ETH/USDT, SOL/USDT
```
**Implementation Notes:**
- Stored in user's database `user_preferences` table
- Updated via preference management tools
- Includes inferred data from usage patterns
---
#### 2. `context://conversation-summary`
**Purpose:** Semantic summary of recent conversation with RAG-enhanced context
**MIME Type:** `text/plain`
**Example Content:**
```
Recent Conversation Summary:
Last 10 messages (summarized):
- User asked about moving average crossover strategies
- Discussed backtesting parameters for BTC/USDT
- Reviewed risk management with 2% position sizing
- Explored adding RSI filter to reduce false signals
Relevant past discussions (RAG search):
- 2 weeks ago: Similar strategy development on ETH/USDT
- 1 month ago: User prefers simple strategies over complex ones
- Past preference: Avoid strategies with >5 indicators
Current focus: Optimizing MA crossover with momentum filter
```
**Implementation Notes:**
- Last N messages stored in `conversation_history` table
- RAG search against embeddings of past conversations
- Semantic search using user's current message as query
- ChromaDB/pgvector for embedding storage
- Summary generated on-demand (can be cached for 1-5 minutes)
**RAG Integration:**
```python
async def get_conversation_summary() -> str:
    """Summarize the recent conversation, enriched with related past context.

    Pulls the last 50 messages, uses the most recent one as the semantic
    query for the RAG index, and delegates formatting to build_summary.
    """
    recent = await db.get_recent_messages(limit=50)
    # The newest message drives the relevance search.
    latest_text = recent[-1].content
    related = await rag.search_conversation_history(
        query=latest_text,
        limit=5,
        min_score=0.7,
    )
    return build_summary(recent[-10:], related)
```
---
#### 3. `context://workspace-state`
**Purpose:** Current trading workspace (chart, positions, watchlist)
**MIME Type:** `application/json`
**Example Content:**
```json
{
"currentChart": {
"ticker": "BINANCE:BTC/USDT",
"timeframe": "1h",
"indicators": ["SMA(20)", "RSI(14)", "MACD(12,26,9)"]
},
"watchlist": ["BTC/USDT", "ETH/USDT", "SOL/USDT"],
"openPositions": [
{
"ticker": "BTC/USDT",
"side": "long",
"size": 0.1,
"entryPrice": 45000,
"currentPrice": 46500,
"unrealizedPnL": 150
}
],
"recentAlerts": [
{
"type": "price_alert",
"message": "BTC/USDT crossed above $46,000",
"timestamp": "2025-01-15T10:30:00Z"
}
]
}
```
**Implementation Notes:**
- Synced from web client chart state
- Updated via WebSocket sync protocol
- Includes active indicators on current chart
- Position data from trading system
---
#### 4. `context://system-prompt`
**Purpose:** User's custom instructions and preferences for AI behavior
**MIME Type:** `text/plain`
**Example Content:**
```
Custom Instructions:
- Be concise and data-driven
- Always show risk/reward ratios
- Prefer simple strategies over complex ones
- When suggesting trades, include stop-loss and take-profit levels
- Explain your reasoning in trading decisions
```
**Implementation Notes:**
- User-editable in preferences UI
- Appended **last** to system prompt (highest priority)
- Can override platform defaults
- Stored in `user_preferences.custom_prompt` field
---
## MCP Tools for Actions
Tools are for **actions** that have side effects. These are **not** used for context fetching.
### Conversation Management
- `save_message(role, content, timestamp)` - Save message to history
- `search_conversation(query, limit)` - Explicit semantic search (for user queries like "what did we discuss about BTC?")
### Strategy & Indicators
- `list_strategies()` - List user's strategies
- `read_strategy(name)` - Get strategy code
- `write_strategy(name, code)` - Save strategy
- `run_backtest(strategy, params)` - Execute backtest
### Trading
- `get_watchlist()` - Get watchlist (action that may trigger sync)
- `execute_trade(params)` - Execute trade order
- `get_positions()` - Fetch current positions from exchange
### Sandbox
- `run_python(code)` - Execute Python code with data science libraries
---
## Gateway Harness Flow
```typescript
// gateway/src/harness/agent-harness.ts
// Handles one inbound user message end-to-end: fetch context from the user's
// MCP server, call Claude with that context embedded, persist both sides of
// the exchange back to the MCP server, then return the reply.
async handleMessage(message: InboundMessage): Promise<OutboundMessage> {
  // 1. Fetch context resources from user's MCP
  const contextResources = await fetchContextResources([
    'context://user-profile',
    'context://conversation-summary', // <-- RAG happens here
    'context://workspace-state',
    'context://system-prompt',
  ]);
  // 2. Build system prompt from resources
  const systemPrompt = buildSystemPrompt(contextResources);
  // 3. Build messages with embedded conversation context
  const messages = buildMessages(message, contextResources);
  // 4. Get tools from MCP
  const tools = await mcpClient.listTools();
  // 5. Call Claude with embedded context
  const response = await anthropic.messages.create({
    model: 'claude-3-5-sonnet-20241022',
    system: systemPrompt, // <-- User profile + workspace + custom prompt
    messages, // <-- Conversation summary from RAG
    tools,
  });
  // 6. Save to user's MCP (tool call)
  await mcpClient.callTool('save_message', { role: 'user', content: message.content });
  // NOTE(review): this saves the full API response object as 'content' —
  // presumably the text blocks are wanted; confirm against save_message's schema.
  await mcpClient.callTool('save_message', { role: 'assistant', content: response });
  return response;
}
```
---
## User MCP Server Implementation (Python)
### Resource Handler
```python
# user-mcp/src/resources.py
from mcp.server import Server
from mcp.types import Resource, ResourceTemplate
import asyncpg
server = Server("dexorder-user")
@server.list_resources()
async def list_resources() -> list[Resource]:
    """Advertise the read-only context resources this server exposes."""
    # (uri, name, description, mime type) for each context resource.
    catalog = (
        ("context://user-profile", "User Profile",
         "Trading style, preferences, and background", "text/plain"),
        ("context://conversation-summary", "Conversation Summary",
         "Recent conversation with RAG-enhanced context", "text/plain"),
        ("context://workspace-state", "Workspace State",
         "Current chart, watchlist, positions", "application/json"),
        ("context://system-prompt", "Custom System Prompt",
         "User's custom AI instructions", "text/plain"),
    )
    return [
        Resource(uri=uri, name=name, description=description, mimeType=mime)
        for uri, name, description, mime in catalog
    ]
@server.read_resource()
async def read_resource(uri: str) -> str:
    """Resolve a context resource URI to its current content.

    Raises:
        ValueError: if the URI is not one of the advertised resources.
    """
    # NOTE(review): context.py defines build_conversation_summary(user_id) —
    # confirm the zero-arg call here matches the actual signature.
    builders = {
        "context://user-profile": build_user_profile,
        "context://conversation-summary": build_conversation_summary,
        "context://workspace-state": build_workspace_state,
        "context://system-prompt": get_custom_prompt,
    }
    builder = builders.get(uri)
    if builder is None:
        raise ValueError(f"Unknown resource: {uri}")
    return await builder()
```
### RAG Integration
```python
# user-mcp/src/rag.py
import chromadb
from sentence_transformers import SentenceTransformer
class ConversationRAG:
    """Semantic (RAG) index over a user's conversation history.

    Uses a persistent ChromaDB collection for vector storage and a local
    sentence-transformers model for embeddings.
    """

    def __init__(self, db_path: str):
        self.chroma = chromadb.PersistentClient(path=db_path)
        # Use cosine space so query distances are bounded and map cleanly to
        # similarities (similarity = 1 - distance). Chroma's default is l2,
        # whose unbounded distances would make the min_score filter below
        # meaningless. NOTE(review): a pre-existing collection created with a
        # different space keeps its original metric — migration may be needed.
        self.collection = self.chroma.get_or_create_collection(
            "conversations",
            metadata={"hnsw:space": "cosine"},
        )
        self.embedder = SentenceTransformer('all-MiniLM-L6-v2')

    async def search_conversation_history(
        self,
        query: str,
        limit: int = 5,
        min_score: float = 0.7
    ) -> list[dict]:
        """Semantic search over conversation history.

        Args:
            query: Natural-language query text.
            limit: Maximum number of results to return.
            min_score: Minimum cosine *similarity* (not distance) to keep.

        Returns:
            Matches as dicts with 'content', 'metadata', and 'score'
            (cosine similarity, higher = more relevant).
        """
        # Embed query
        query_embedding = self.embedder.encode(query).tolist()
        # Search
        results = self.collection.query(
            query_embeddings=[query_embedding],
            n_results=limit,
        )
        # BUG FIX: collection.query returns *distances* (smaller = closer);
        # the original filtered `distance >= min_score`, which kept the LEAST
        # relevant hits. Convert distance to similarity before thresholding.
        relevant = []
        for i, distance in enumerate(results['distances'][0]):
            score = 1.0 - distance
            if score >= min_score:
                relevant.append({
                    'content': results['documents'][0][i],
                    'metadata': results['metadatas'][0][i],
                    'score': score,
                })
        return relevant

    async def add_message(self, message_id: str, role: str, content: str, metadata: dict):
        """Embed and add a message to the RAG index."""
        embedding = self.embedder.encode(content).tolist()
        self.collection.add(
            ids=[message_id],
            embeddings=[embedding],
            documents=[content],
            metadatas=[{
                'role': role,
                'timestamp': metadata.get('timestamp'),
                **metadata
            }]
        )
```
### Conversation Summary Builder
```python
# user-mcp/src/context.py
async def build_conversation_summary(user_id: str) -> str:
    """Build a conversation summary combining recent messages with RAG hits.

    Args:
        user_id: Owner of the conversation; also selects the RAG store path.

    Returns:
        A human-readable summary: the 10 most recent messages (oldest
        first), relevant past discussions surfaced by RAG, and an inferred
        current focus. A placeholder string when there is no history.
    """
    # 1. Get recent messages (order='desc' => newest first)
    recent_messages = await db.get_messages(
        user_id=user_id,
        limit=50,
        order='desc'
    )
    # 2. Current focus = most recent user message (first match, list is
    #    newest-first)
    last_user_msg = next(
        (m for m in recent_messages if m.role == 'user'),
        None
    )
    if not last_user_msg:
        return "No recent conversation history."
    # 3. RAG search for relevant context
    rag = ConversationRAG(f"/data/users/{user_id}/rag")
    relevant_context = await rag.search_conversation_history(
        query=last_user_msg.content,
        limit=5,
        min_score=0.7
    )
    # 4. Build summary
    summary = "Recent Conversation Summary:\n\n"
    # Last 10 messages. The list is newest-first, so the 10 most recent sit
    # at the *front*; the previous code sliced [-10:], which picked the
    # oldest messages of the fetched window. Reverse so they read
    # oldest -> newest.
    summary += "Last 10 messages:\n"
    for msg in reversed(recent_messages[:10]):
        summary += f"- {msg.role}: {msg.content[:100]}...\n"
    # Relevant past context
    if relevant_context:
        summary += "\nRelevant past discussions (RAG):\n"
        for ctx in relevant_context:
            timestamp = ctx['metadata'].get('timestamp', 'unknown')
            summary += f"- [{timestamp}] {ctx['content'][:150]}...\n"
    # Inferred focus
    summary += f"\nCurrent focus: {infer_topic(last_user_msg.content)}\n"
    return summary
def infer_topic(message: str) -> str:
    """Best-effort topic label for a message via keyword matching.

    Returns one of 'strategy', 'indicator', 'analysis', 'risk', or the
    fallback 'general trading discussion'. Matching is case-insensitive
    and checks topics in declaration order, first hit wins.
    """
    topic_keywords = (
        ('strategy', ('strategy', 'backtest', 'trading system')),
        ('indicator', ('indicator', 'rsi', 'macd', 'moving average')),
        ('analysis', ('analyze', 'chart', 'price action')),
        ('risk', ('risk', 'position size', 'stop loss')),
    )
    lowered = message.lower()
    for topic, needles in topic_keywords:
        for needle in needles:
            if needle in lowered:
                return topic
    return 'general trading discussion'
```
---
## Benefits of This Architecture
1. **Privacy**: Conversation history never leaves user's container
2. **Customization**: Each user controls their RAG, embeddings, prompt engineering
3. **Scalability**: Platform harness is stateless - horizontally scalable
4. **Cost Control**: Platform pays for Claude, users pay for their compute/storage
5. **Portability**: Users can export/migrate their entire context
6. **Development**: Users can test prompts/context locally without platform involvement
---
## Future Enhancements
### Dynamic Resource URIs
Support parameterized resources:
```
context://conversation/{session_id}
context://strategy/{strategy_name}
context://backtest/{backtest_id}/results
```
### Resource Templates
MCP supports resource templates for dynamic discovery:
```python
@server.list_resource_templates()
async def list_templates() -> list[ResourceTemplate]:
return [
ResourceTemplate(
uriTemplate="context://strategy/{name}",
name="Strategy Context",
description="Context for specific strategy",
)
]
```
### Streaming Resources
For large context (e.g., full backtest results), support streaming:
```python
@server.read_resource()
async def read_resource(uri: str) -> AsyncIterator[str]:
if uri.startswith("context://backtest/"):
async for chunk in stream_backtest_results(uri):
yield chunk
```
---
## Migration Path
For users with existing conversation history in platform DB:
1. **Export script**: Migrate platform history → user container DB
2. **RAG indexing**: Embed all historical messages into ChromaDB
3. **Preference migration**: Copy user preferences to container
4. **Cutover**: Switch to resource-based context fetching
Platform can keep read-only archive for compliance, but active context lives in user container.

9
gateway/.dockerignore Normal file
View File

@@ -0,0 +1,9 @@
node_modules
dist
.env
.env.*
!.env.example
*.log
.git
.gitignore
README.md

39
gateway/.env.example Normal file
View File

@@ -0,0 +1,39 @@
# Server configuration
PORT=3000
HOST=0.0.0.0
LOG_LEVEL=info
CORS_ORIGIN=*
# Database
DATABASE_URL=postgresql://postgres:postgres@localhost:5432/dexorder
# LLM Provider API Keys (configure at least one)
# Anthropic Claude
ANTHROPIC_API_KEY=sk-ant-xxxxx
# OpenAI GPT
OPENAI_API_KEY=sk-xxxxx
# Google Gemini
GOOGLE_API_KEY=xxxxx
# OpenRouter (access to 300+ models with one key)
OPENROUTER_API_KEY=sk-or-xxxxx
# Default model (if user has no preference)
DEFAULT_MODEL_PROVIDER=anthropic
DEFAULT_MODEL=claude-3-5-sonnet-20241022
# Telegram (optional)
TELEGRAM_BOT_TOKEN=
# Kubernetes configuration
KUBERNETES_NAMESPACE=dexorder-agents
KUBERNETES_IN_CLUSTER=false
KUBERNETES_CONTEXT=minikube
AGENT_IMAGE=ghcr.io/dexorder/agent:latest
SIDECAR_IMAGE=ghcr.io/dexorder/lifecycle-sidecar:latest
AGENT_STORAGE_CLASS=standard
# Redis (for session management - future)
# REDIS_URL=redis://localhost:6379

6
gateway/.gitignore vendored Normal file
View File

@@ -0,0 +1,6 @@
node_modules
dist
.env
.env.local
*.log
.DS_Store

313
gateway/ARCHITECTURE.md Normal file
View File

@@ -0,0 +1,313 @@
# Gateway Architecture: LangChain.js + LangGraph
## Why LangChain.js (Not Vercel AI SDK or Direct Anthropic SDK)?
### The Decision
After evaluating Vercel AI SDK and LangChain.js, we chose **LangChain.js + LangGraph** for these reasons:
1. **Multi-model support**: 300+ models via OpenRouter, plus direct integrations
2. **Complex workflows**: LangGraph for stateful trading analysis pipelines
3. **No vendor lock-in**: Switch between Anthropic, OpenAI, Google with one line
4. **Streaming**: Same as Vercel AI SDK (`.stream()` method)
5. **Tool calling**: Unified across all providers
6. **Trading-specific**: State management, conditional branching, human-in-the-loop
**We don't need Vercel AI SDK because:**
- ❌ We use Vue (not React) - don't need React hooks
- ❌ We have Node.js servers (not edge) - don't need edge runtime
- ✅ **DO need** complex workflows (strategy analysis, backtesting, approvals)
- ✅ **DO need** stateful execution (resume from failures)
---
## Architecture Layers
### Layer 1: Model Abstraction (`src/llm/`)
**Provider Factory** (`provider.ts`)
```typescript
const factory = new LLMProviderFactory(config, logger);
// Create any model
const claude = factory.createModel({
provider: 'anthropic',
model: 'claude-3-5-sonnet-20241022',
});
const gpt4 = factory.createModel({
provider: 'openai',
model: 'gpt-4o',
});
```
**Model Router** (`router.ts`)
```typescript
const router = new ModelRouter(factory, logger);
// Intelligently route based on:
// - User license (free → Gemini Flash, pro → GPT-4, enterprise → Claude)
// - Query complexity (simple → cheap, complex → smart)
// - User preference (if set in license.preferredModel)
// - Cost optimization (always use cheapest)
const model = await router.route(
message.content,
userLicense,
RoutingStrategy.COMPLEXITY
);
```
---
### Layer 2: Agent Harness (`src/harness/`)
**Stateless Orchestrator**
The harness has **ZERO conversation state**. Everything lives in user's MCP container.
**Flow:**
```typescript
async handleMessage(message: InboundMessage) {
// 1. Fetch context from user's MCP (resources, not tools)
const resources = await mcpClient.listResources();
const context = await Promise.all([
mcpClient.readResource('context://user-profile'), // Trading style
mcpClient.readResource('context://conversation-summary'), // RAG summary
mcpClient.readResource('context://workspace-state'), // Current chart
mcpClient.readResource('context://system-prompt'), // Custom instructions
]);
// 2. Route to appropriate model
const model = await modelRouter.route(message, license);
// 3. Build messages with embedded context
const messages = buildLangChainMessages(systemPrompt, context);
// 4. Call LLM
const response = await model.invoke(messages);
// 5. Save to user's MCP (tool call)
await mcpClient.callTool('save_message', { role: 'user', content: message });
await mcpClient.callTool('save_message', { role: 'assistant', content: response });
return response;
}
```
**Streaming variant:**
```typescript
async *streamMessage(message: InboundMessage) {
const model = await modelRouter.route(message, license);
const messages = buildMessages(context, message);
const stream = await model.stream(messages);
let fullResponse = '';
for await (const chunk of stream) {
fullResponse += chunk.content;
yield chunk.content; // Stream to WebSocket/Telegram
}
// Save after streaming completes
await mcpClient.callTool('save_message', { /* ... */ });
}
```
---
### Layer 3: Workflows (`src/workflows/`)
**LangGraph for Complex Trading Analysis**
```typescript
// Example: Strategy Analysis Pipeline
const workflow = new StateGraph(StrategyAnalysisState)
.addNode('code_review', async (state) => {
const model = new ChatAnthropic({ model: 'claude-3-opus' });
const review = await model.invoke(`Review: ${state.strategyCode}`);
return { codeReview: review.content };
})
.addNode('backtest', async (state) => {
// Call user's MCP backtest tool
const results = await mcpClient.callTool('run_backtest', {
strategy: state.strategyCode,
ticker: state.ticker,
});
return { backtestResults: results };
})
.addNode('risk_assessment', async (state) => {
const model = new ChatAnthropic({ model: 'claude-3-5-sonnet' });
const assessment = await model.invoke(
`Analyze risk: ${JSON.stringify(state.backtestResults)}`
);
return { riskAssessment: assessment.content };
})
.addNode('human_approval', async (state) => {
// Pause for user review (human-in-the-loop)
return { humanApproved: await waitForUserApproval(state) };
})
.addConditionalEdges('human_approval', (state) => {
return state.humanApproved ? 'deploy' : 'reject';
})
.compile();
// Execute
const result = await workflow.invoke({
strategyCode: userCode,
ticker: 'BTC/USDT',
timeframe: '1h',
});
```
**Benefits:**
- **Stateful**: Resume if server crashes mid-analysis
- **Conditional**: Route based on results (if Sharpe > 2 → deploy, else → reject)
- **Human-in-the-loop**: Pause for user approval
- **Multi-step**: Each node can use different models
---
## User Context Architecture
### MCP Resources (Not Tools)
**User's MCP server exposes resources** (read-only context):
```
context://user-profile → Trading style, preferences
context://conversation-summary → RAG-generated summary
context://workspace-state → Current chart, positions
context://system-prompt → User's custom AI instructions
```
**Gateway fetches and embeds in LLM call:**
```typescript
const userProfile = await mcpClient.readResource('context://user-profile');
const conversationSummary = await mcpClient.readResource('context://conversation-summary');
// User's MCP server runs RAG search and returns summary
// Gateway embeds this in Claude/GPT prompt
```
**Why resources, not tools?**
- Resources = context injection (read-only)
- Tools = actions (write operations)
- Context should be fetched **before** LLM call, not during
---
## Model Routing Strategies
### 1. User Preference
```typescript
// User's license has preferred model
{
"preferredModel": {
"provider": "anthropic",
"model": "claude-3-5-sonnet-20241022"
}
}
// Router uses this if set
```
### 2. Complexity-Based
```typescript
const isComplex = message.includes('backtest') || message.length > 200;
if (isComplex) {
return { provider: 'anthropic', model: 'claude-3-opus' }; // Smart
} else {
return { provider: 'openai', model: 'gpt-4o-mini' }; // Fast
}
```
### 3. License Tier
```typescript
switch (license.licenseType) {
case 'free':
return { provider: 'google', model: 'gemini-2.0-flash-exp' }; // Cheap
case 'pro':
return { provider: 'openai', model: 'gpt-4o' }; // Balanced
case 'enterprise':
return { provider: 'anthropic', model: 'claude-3-5-sonnet' }; // Premium
}
```
### 4. Cost-Optimized
```typescript
return { provider: 'google', model: 'gemini-2.0-flash-exp' }; // Always cheapest
```
---
## When to Use What
### Simple Chat → Agent Harness
```typescript
// User: "What's the RSI on BTC?"
// → Fast streaming response via harness.streamMessage()
```
### Complex Analysis → LangGraph Workflow
```typescript
// User: "Analyze this strategy and backtest it"
// → Multi-step workflow: code review → backtest → risk → approval
```
### Direct Tool Call → MCP Client
```typescript
// User: "Get my watchlist"
// → Direct MCP tool call, no LLM needed
```
---
## Data Flow
```
User Message ("Analyze my strategy")
Gateway → Route to workflow (not harness)
LangGraph Workflow:
├─ Node 1: Code Review (Claude Opus)
│ └─ Analyzes strategy code
├─ Node 2: Backtest (MCP tool call)
│ └─ User's container runs backtest
├─ Node 3: Risk Assessment (Claude Sonnet)
│ └─ Evaluates results
├─ Node 4: Human Approval (pause)
│ └─ User reviews in UI
└─ Node 5: Recommendation (GPT-4o-mini)
└─ Final decision
Result → Return to user
```
---
## Benefits Summary
| Feature | LangChain.js | Vercel AI SDK | Direct Anthropic SDK |
|---------|--------------|---------------|----------------------|
| Multi-model | ✅ 300+ models | ✅ 100+ models | ❌ Anthropic only |
| Streaming | ✅ `.stream()` | ✅ `streamText()` | ✅ `.stream()` |
| Tool calling | ✅ Unified | ✅ Unified | ✅ Anthropic format |
| Complex workflows | ✅ LangGraph | ❌ Limited | ❌ DIY |
| Stateful agents | ✅ LangGraph | ❌ No | ❌ No |
| Human-in-the-loop | ✅ LangGraph | ❌ No | ❌ No |
| React hooks | ❌ N/A | ✅ `useChat()` | ❌ N/A |
| Bundle size | Large (101kb) | Small (30kb) | Medium (60kb) |
| **Dexorder needs** | **✅ Perfect fit** | **❌ Missing workflows** | **❌ Vendor lock-in** |
---
## Next Steps
1. **Implement tool calling** in agent harness (bind MCP tools to LangChain)
2. **Add state persistence** for LangGraph (PostgreSQL checkpointer)
3. **Build more workflows**: market scanner, portfolio optimizer
4. **Add monitoring**: Track model usage, costs, latency
5. **User container**: Implement Python MCP server with resources

40
gateway/Dockerfile Normal file
View File

@@ -0,0 +1,40 @@
FROM node:22-alpine AS builder
WORKDIR /app
# Copy package files
COPY package*.json ./
COPY tsconfig.json ./
# Install dependencies
RUN npm ci
# Copy source
COPY src ./src
# Build
RUN npm run build
# Production image
FROM node:22-alpine
WORKDIR /app
# Copy package files
COPY package*.json ./
# Install production dependencies only
RUN npm ci --omit=dev
# Copy built application
COPY --from=builder /app/dist ./dist
# Create non-root user
RUN addgroup -g 1001 -S nodejs && \
adduser -S nodejs -u 1001
USER nodejs
EXPOSE 3000
CMD ["node", "dist/main.js"]

212
gateway/README.md Normal file
View File

@@ -0,0 +1,212 @@
# Dexorder Gateway
Multi-channel gateway with agent harness for the Dexorder AI platform.
## Architecture
```
┌─────────────────────────────────────────────────────────┐
│ Platform Gateway │
│ (Node.js/Fastify) │
│ │
│ ┌────────────────────────────────────────────────┐ │
│ │ Channels │ │
│ │ - WebSocket (/ws/chat) │ │
│ │ - Telegram Webhook (/webhook/telegram) │ │
│ └────────────────────────────────────────────────┘ │
│ ↕ │
│ ┌────────────────────────────────────────────────┐ │
│ │ Authenticator │ │
│ │ - JWT verification (WebSocket) │ │
│ │ - Channel linking (Telegram) │ │
│ │ - User license lookup (PostgreSQL) │ │
│ └────────────────────────────────────────────────┘ │
│ ↕ │
│ ┌────────────────────────────────────────────────┐ │
│ │ Agent Harness (per-session) │ │
│ │ - Claude API integration │ │
│ │ - MCP client connector │ │
│ │ - Conversation state │ │
│ └────────────────────────────────────────────────┘ │
│ ↕ │
│ ┌────────────────────────────────────────────────┐ │
│ │ MCP Client │ │
│ │ - User container connection │ │
│ │ - Tool routing │ │
│ └────────────────────────────────────────────────┘ │
└─────────────────────────────────────────────────────────┘
┌───────────────────────────────┐
│  User MCP Server (Python)     │
│  - Strategies, indicators     │
│  - Memory, preferences        │
│  - Backtest sandbox           │
└───────────────────────────────┘
```
## Features
- **Automatic container provisioning**: Creates user agent containers on-demand via Kubernetes
- **Multi-channel support**: WebSocket and Telegram webhooks
- **Per-channel authentication**: JWT for web, channel linking for chat apps
- **User license management**: Feature flags and resource limits from PostgreSQL
- **Container lifecycle management**: Auto-shutdown on idle (handled by container sidecar)
- **License-based resources**: Different memory/CPU/storage limits per tier
- **Multi-model LLM support**: Anthropic Claude, OpenAI GPT, Google Gemini, OpenRouter (300+ models)
- **Zero vendor lock-in**: Switch models with one line, powered by LangChain.js
- **Intelligent routing**: Auto-select models based on complexity, license tier, or user preference
- **Streaming responses**: Real-time chat with WebSocket and Telegram
- **Complex workflows**: LangGraph for stateful trading analysis (backtest → risk → approval)
- **Agent harness**: Stateless orchestrator (all context lives in user's MCP container)
- **MCP resource integration**: User's RAG, conversation history, and preferences
## Container Management
When a user authenticates, the gateway:
1. **Checks for existing container**: Queries Kubernetes for deployment
2. **Creates if missing**: Renders YAML template based on license tier
3. **Waits for ready**: Polls deployment status until healthy
4. **Returns MCP endpoint**: Computed from service name
5. **Connects to MCP server**: Proceeds with normal authentication flow
Container templates by license tier:
| Tier | Memory | CPU | Storage | Idle Timeout |
|------|--------|-----|---------|--------------|
| Free | 512Mi | 500m | 1Gi | 15min |
| Pro | 2Gi | 2000m | 10Gi | 60min |
| Enterprise | 4Gi | 4000m | 50Gi | Never |
Containers self-manage their lifecycle using the lifecycle sidecar (see `../lifecycle-sidecar/`).
## Setup
### Prerequisites
- Node.js >= 22.0.0
- PostgreSQL database
- At least one LLM provider API key:
- Anthropic Claude
- OpenAI GPT
- Google Gemini
- OpenRouter (one key for 300+ models)
### Development
1. Install dependencies:
```bash
npm install
```
2. Copy environment template:
```bash
cp .env.example .env
```
3. Configure `.env` (see `.env.example`):
```bash
DATABASE_URL=postgresql://postgres:postgres@localhost:5432/dexorder
# Configure at least one provider
ANTHROPIC_API_KEY=sk-ant-xxxxx
# OPENAI_API_KEY=sk-xxxxx
# GOOGLE_API_KEY=xxxxx
# OPENROUTER_API_KEY=sk-or-xxxxx
# Optional: Set default model
DEFAULT_MODEL_PROVIDER=anthropic
DEFAULT_MODEL=claude-3-5-sonnet-20241022
```
4. Run development server:
```bash
npm run dev
```
### Production Build
```bash
npm run build
npm start
```
### Docker
```bash
docker build -t dexorder/gateway:latest .
docker run -p 3000:3000 --env-file .env dexorder/gateway:latest
```
## Database Schema
Required PostgreSQL tables (will be documented separately):
### `user_licenses`
- `user_id` (text, primary key)
- `email` (text)
- `license_type` (text: 'free', 'pro', 'enterprise')
- `features` (jsonb)
- `resource_limits` (jsonb)
- `mcp_server_url` (text)
- `expires_at` (timestamp, nullable)
- `created_at` (timestamp)
- `updated_at` (timestamp)
### `user_channel_links`
- `id` (serial, primary key)
- `user_id` (text, foreign key)
- `channel_type` (text: 'telegram', 'slack', 'discord')
- `channel_user_id` (text)
- `created_at` (timestamp)
## API Endpoints
### WebSocket
**`GET /ws/chat`**
- WebSocket connection for web client
- Auth: Bearer token in headers
- Protocol: JSON messages
Example:
```javascript
const ws = new WebSocket('ws://localhost:3000/ws/chat', {
headers: {
'Authorization': 'Bearer your-jwt-token'
}
});
ws.on('message', (data) => {
const msg = JSON.parse(data);
console.log(msg);
});
ws.send(JSON.stringify({
type: 'message',
content: 'Hello, AI!'
}));
```
### Telegram Webhook
**`POST /webhook/telegram`**
- Telegram bot webhook endpoint
- Auth: Telegram user linked to platform user
- Automatically processes incoming messages
### Health Check
**`GET /health`**
- Returns server health status
## TODO
- [ ] Implement JWT verification with JWKS
- [ ] Implement MCP HTTP/SSE transport
- [ ] Add Redis for session persistence
- [ ] Add rate limiting per user license
- [ ] Add message usage tracking
- [ ] Add streaming responses for WebSocket
- [ ] Add Slack and Discord channel handlers
- [ ] Add session cleanup/timeout logic

42
gateway/package.json Normal file
View File

@@ -0,0 +1,42 @@
{
"name": "@dexorder/gateway",
"version": "0.1.0",
"type": "module",
"private": true,
"description": "Multi-channel gateway with agent harness for Dexorder AI platform",
"scripts": {
"dev": "tsx watch src/main.ts",
"build": "tsc",
"start": "node dist/main.js",
"typecheck": "tsc --noEmit"
},
"dependencies": {
"@fastify/cors": "^10.0.1",
"@fastify/websocket": "^11.0.1",
"@kubernetes/client-node": "^0.21.0",
"@langchain/anthropic": "^0.3.8",
"@langchain/core": "^0.3.24",
"@langchain/google-genai": "^0.1.6",
"@langchain/langgraph": "^0.2.26",
"@langchain/openai": "^0.3.21",
"@langchain/openrouter": "^0.1.2",
"@modelcontextprotocol/sdk": "^1.0.4",
"fastify": "^5.2.0",
"ioredis": "^5.4.2",
"js-yaml": "^4.1.0",
"pg": "^8.13.1",
"pino": "^9.6.0",
"pino-pretty": "^13.0.0",
"zod": "^3.24.1"
},
"devDependencies": {
"@types/js-yaml": "^4.0.9",
"@types/node": "^22.10.2",
"@types/pg": "^8.11.10",
"tsx": "^4.19.2",
"typescript": "^5.7.2"
},
"engines": {
"node": ">=22.0.0"
}
}

79
gateway/schema.sql Normal file
View File

@@ -0,0 +1,79 @@
-- User license and authorization schema
CREATE TABLE IF NOT EXISTS user_licenses (
user_id TEXT PRIMARY KEY,
email TEXT,
license_type TEXT NOT NULL CHECK (license_type IN ('free', 'pro', 'enterprise')),
features JSONB NOT NULL DEFAULT '{
"maxIndicators": 5,
"maxStrategies": 3,
"maxBacktestDays": 30,
"realtimeData": false,
"customExecutors": false,
"apiAccess": false
}',
resource_limits JSONB NOT NULL DEFAULT '{
"maxConcurrentSessions": 1,
"maxMessagesPerDay": 100,
"maxTokensPerMessage": 4096,
"rateLimitPerMinute": 10
}',
mcp_server_url TEXT NOT NULL,
preferred_model JSONB DEFAULT NULL,
expires_at TIMESTAMP WITH TIME ZONE,
created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW(),
updated_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW()
);
COMMENT ON COLUMN user_licenses.preferred_model IS 'Optional model preference: {"provider": "anthropic", "model": "claude-3-5-sonnet-20241022", "temperature": 0.7}';
CREATE INDEX idx_user_licenses_expires_at ON user_licenses(expires_at)
WHERE expires_at IS NOT NULL;
-- Channel linking for multi-channel support
CREATE TABLE IF NOT EXISTS user_channel_links (
id SERIAL PRIMARY KEY,
user_id TEXT NOT NULL REFERENCES user_licenses(user_id) ON DELETE CASCADE,
channel_type TEXT NOT NULL CHECK (channel_type IN ('telegram', 'slack', 'discord', 'websocket')),
channel_user_id TEXT NOT NULL,
metadata JSONB,
created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW(),
UNIQUE(channel_type, channel_user_id)
);
CREATE INDEX idx_user_channel_links_user_id ON user_channel_links(user_id);
CREATE INDEX idx_user_channel_links_channel ON user_channel_links(channel_type, channel_user_id);
-- Example data for development
INSERT INTO user_licenses (user_id, email, license_type, mcp_server_url, features, resource_limits, preferred_model)
VALUES (
'dev-user-001',
'dev@example.com',
'pro',
'http://localhost:8080/mcp',
'{
"maxIndicators": 50,
"maxStrategies": 20,
"maxBacktestDays": 365,
"realtimeData": true,
"customExecutors": true,
"apiAccess": true
}',
'{
"maxConcurrentSessions": 5,
"maxMessagesPerDay": 1000,
"maxTokensPerMessage": 8192,
"rateLimitPerMinute": 60
}',
'{
"provider": "anthropic",
"model": "claude-3-5-sonnet-20241022",
"temperature": 0.7
}'
)
ON CONFLICT (user_id) DO NOTHING;
-- Example Telegram link
INSERT INTO user_channel_links (user_id, channel_type, channel_user_id)
VALUES ('dev-user-001', 'telegram', '123456789')
ON CONFLICT (channel_type, channel_user_id) DO NOTHING;

View File

@@ -0,0 +1,146 @@
import type { FastifyRequest, FastifyBaseLogger } from 'fastify';
import { UserService } from '../db/user-service.js';
import { ChannelType, type AuthContext } from '../types/user.js';
import type { ContainerManager } from '../k8s/container-manager.js';
/** Dependencies injected into the Authenticator. */
export interface AuthenticatorConfig {
  /** Looks up licenses and channel links, verifies web tokens. */
  userService: UserService;
  /** Provisions and inspects per-user Kubernetes containers. */
  containerManager: ContainerManager;
  /** Structured (pino-style) logger. */
  logger: FastifyBaseLogger;
}
/**
* Multi-channel authenticator
* Handles authentication for WebSocket, Telegram, and other channels
*/
/**
 * Multi-channel authenticator
 * Handles authentication for WebSocket, Telegram, and other channels.
 *
 * Every channel entry point resolves a platform user id for its channel,
 * then runs the shared tail (license lookup, container provisioning,
 * AuthContext assembly) in buildAuthContext().
 */
export class Authenticator {
  private config: AuthenticatorConfig;

  constructor(config: AuthenticatorConfig) {
    this.config = config;
  }

  /**
   * Authenticate WebSocket connection via JWT token.
   * Also ensures the user's container is running.
   *
   * @returns AuthContext on success, null on any auth failure (logged).
   */
  async authenticateWebSocket(
    request: FastifyRequest
  ): Promise<AuthContext | null> {
    try {
      const token = this.extractBearerToken(request);
      if (!token) {
        this.config.logger.warn('No bearer token in WebSocket connection');
        return null;
      }
      const userId = await this.config.userService.verifyWebToken(token);
      if (!userId) {
        this.config.logger.warn('Invalid JWT token');
        return null;
      }
      // For WebSocket, the channel user id is the platform user id.
      return await this.buildAuthContext(
        userId,
        ChannelType.WEBSOCKET,
        userId,
        `ws_${userId}`
      );
    } catch (error) {
      this.config.logger.error({ error }, 'WebSocket authentication error');
      return null;
    }
  }

  /**
   * Authenticate Telegram webhook.
   * Also ensures the user's container is running.
   *
   * @param telegramUserId Telegram numeric user id as a string.
   * @returns AuthContext on success, null when unlinked or on failure.
   */
  async authenticateTelegram(telegramUserId: string): Promise<AuthContext | null> {
    try {
      const userId = await this.config.userService.getUserIdFromChannel(
        'telegram',
        telegramUserId
      );
      if (!userId) {
        this.config.logger.warn(
          { telegramUserId },
          'Telegram user not linked to platform user'
        );
        return null;
      }
      return await this.buildAuthContext(
        userId,
        ChannelType.TELEGRAM,
        telegramUserId,
        `tg_${telegramUserId}`
      );
    } catch (error) {
      this.config.logger.error({ error }, 'Telegram authentication error');
      return null;
    }
  }

  /**
   * Shared tail of every channel authentication: load the user's license,
   * ensure their container is running (may block while a new container is
   * created), and assemble the AuthContext.
   *
   * @returns null when the user has no license row.
   */
  private async buildAuthContext(
    userId: string,
    channelType: ChannelType,
    channelUserId: string,
    sessionPrefix: string
  ): Promise<AuthContext | null> {
    const license = await this.config.userService.getUserLicense(userId);
    if (!license) {
      this.config.logger.warn({ userId }, 'User license not found');
      return null;
    }
    // Ensure container is running (may take time if creating new container)
    this.config.logger.info({ userId }, 'Ensuring user container is running');
    const { mcpEndpoint, wasCreated } = await this.config.containerManager.ensureContainerRunning(
      userId,
      license
    );
    this.config.logger.info(
      { userId, mcpEndpoint, wasCreated },
      'Container is ready'
    );
    // Update license with actual MCP endpoint
    license.mcpServerUrl = mcpEndpoint;
    return {
      userId,
      channelType,
      channelUserId,
      sessionId: `${sessionPrefix}_${Date.now()}`,
      license,
      authenticatedAt: new Date(),
    };
  }

  /**
   * Extract bearer token from request headers.
   */
  private extractBearerToken(request: FastifyRequest): string | null {
    const auth = request.headers.authorization;
    if (!auth || !auth.startsWith('Bearer ')) {
      return null;
    }
    return auth.substring(7);
  }
}

View File

@@ -0,0 +1,163 @@
import type { FastifyInstance, FastifyRequest, FastifyReply } from 'fastify';
import type { Authenticator } from '../auth/authenticator.js';
import { AgentHarness } from '../harness/agent-harness.js';
import type { InboundMessage } from '../types/messages.js';
import { randomUUID } from 'crypto';
import type { ProviderConfig } from '../llm/provider.js';
/** Dependencies injected into the TelegramHandler. */
export interface TelegramHandlerConfig {
  /** Resolves Telegram users to platform users and licenses. */
  authenticator: Authenticator;
  /** LLM provider settings passed to each AgentHarness. */
  providerConfig: ProviderConfig;
  /** Bot token used for outbound sendMessage calls. */
  telegramBotToken: string;
}
/** Subset of the Telegram Bot API Update payload that this handler reads. */
interface TelegramUpdate {
  update_id: number;
  /** Present for message updates; absent for edits, callbacks, etc. */
  message?: {
    message_id: number;
    from: {
      id: number;
      first_name: string;
      username?: string;
    };
    chat: {
      id: number;
      type: string;
    };
    /** Text content; undefined for media-only messages. */
    text?: string;
    /** Photo size variants; currently ignored by handleWebhook. */
    photo?: Array<{
      file_id: string;
      file_size: number;
    }>;
  };
}
/**
* Telegram webhook handler
*/
/**
 * Telegram webhook handler
 *
 * Receives Bot API updates, authenticates the sender against the platform,
 * runs the message through a per-user AgentHarness, and replies via the
 * Bot API.
 */
export class TelegramHandler {
  private config: TelegramHandlerConfig;
  // Harness cache keyed by Telegram user id. NOTE: authenticateTelegram()
  // mints a fresh sessionId (suffixed with Date.now()) on every webhook,
  // so keying this map by sessionId — as the original did — never hit the
  // cache and leaked one initialized harness per incoming message. The
  // stable channel user id fixes both the miss and the leak.
  private sessions = new Map<string, AgentHarness>();

  constructor(config: TelegramHandlerConfig) {
    this.config = config;
  }

  /**
   * Register Telegram webhook routes
   */
  register(app: FastifyInstance): void {
    app.post('/webhook/telegram', async (request: FastifyRequest, reply: FastifyReply) => {
      await this.handleWebhook(request, reply, app);
    });
  }

  /**
   * Handle Telegram webhook: authenticate, route through the harness,
   * and reply. Always answers 200 to Telegram for handled updates so the
   * Bot API does not retry them.
   */
  private async handleWebhook(
    request: FastifyRequest,
    reply: FastifyReply,
    app: FastifyInstance
  ): Promise<void> {
    const logger = app.log;
    try {
      const update = request.body as TelegramUpdate;
      if (!update.message?.text) {
        // Ignore non-text messages for now
        reply.code(200).send({ ok: true });
        return;
      }
      const telegramUserId = update.message.from.id.toString();
      const chatId = update.message.chat.id;
      const text = update.message.text;
      logger.info({ telegramUserId, chatId, text }, 'Received Telegram message');
      // Authenticate
      const authContext = await this.config.authenticator.authenticateTelegram(telegramUserId);
      if (!authContext) {
        logger.warn({ telegramUserId }, 'Telegram user not authenticated');
        await this.sendTelegramMessage(
          chatId,
          'Please link your Telegram account to Dexorder first.'
        );
        reply.code(200).send({ ok: true });
        return;
      }
      // Get or create harness, cached per Telegram user (see field comment)
      let harness = this.sessions.get(telegramUserId);
      if (!harness) {
        harness = new AgentHarness({
          userId: authContext.userId,
          sessionId: authContext.sessionId,
          license: authContext.license,
          providerConfig: this.config.providerConfig,
          logger,
        });
        await harness.initialize();
        this.sessions.set(telegramUserId, harness);
      }
      // Process message
      const inboundMessage: InboundMessage = {
        messageId: randomUUID(),
        userId: authContext.userId,
        sessionId: authContext.sessionId,
        content: text,
        timestamp: new Date(),
      };
      const response = await harness.handleMessage(inboundMessage);
      // Send response back to Telegram
      await this.sendTelegramMessage(chatId, response.content);
      reply.code(200).send({ ok: true });
    } catch (error) {
      logger.error({ error }, 'Error handling Telegram webhook');
      reply.code(500).send({ ok: false, error: 'Internal server error' });
    }
  }

  /**
   * Send message to Telegram chat via the Bot API sendMessage method.
   * Throws on network failure or a non-2xx API response; callers decide
   * how to surface the failure.
   */
  private async sendTelegramMessage(chatId: number, text: string): Promise<void> {
    const url = `https://api.telegram.org/bot${this.config.telegramBotToken}/sendMessage`;
    try {
      const response = await fetch(url, {
        method: 'POST',
        headers: {
          'Content-Type': 'application/json',
        },
        body: JSON.stringify({
          chat_id: chatId,
          text,
          parse_mode: 'Markdown',
        }),
      });
      if (!response.ok) {
        throw new Error(`Telegram API error: ${response.statusText}`);
      }
    } catch (error) {
      // console here because no Fastify logger is in scope in this helper;
      // the error is rethrown for the caller to handle.
      console.error('Failed to send Telegram message:', error);
      throw error;
    }
  }

  /**
   * Cleanup old sessions (call periodically)
   */
  async cleanupSessions(maxAgeMs = 30 * 60 * 1000): Promise<void> {
    // TODO: Track session last activity and cleanup
    // For now, sessions persist until server restart
  }
}

View File

@@ -0,0 +1,161 @@
import type { FastifyInstance, FastifyRequest } from 'fastify';
import type { WebSocket } from '@fastify/websocket';
import type { Authenticator } from '../auth/authenticator.js';
import { AgentHarness } from '../harness/agent-harness.js';
import type { InboundMessage } from '../types/messages.js';
import { randomUUID } from 'crypto';
import type { ProviderConfig } from '../llm/provider.js';
/** Dependencies injected into the WebSocketHandler. */
export interface WebSocketHandlerConfig {
  /** Authenticates connections and ensures the user container is running. */
  authenticator: Authenticator;
  /** LLM provider settings passed to each AgentHarness. */
  providerConfig: ProviderConfig;
}
/**
 * WebSocket channel handler.
 *
 * Registers the /ws/chat route, authenticates each connection, creates one
 * AgentHarness per session, and relays JSON frames between the socket and
 * the harness. Sessions live only in this process's memory (see the
 * `sessions` map), so they do not survive a restart.
 */
export class WebSocketHandler {
  private config: WebSocketHandlerConfig;
  // sessionId -> harness; populated after harness.initialize() succeeds,
  // removed when the socket closes.
  private sessions = new Map<string, AgentHarness>();

  constructor(config: WebSocketHandlerConfig) {
    this.config = config;
  }

  /**
   * Register WebSocket routes
   */
  register(app: FastifyInstance): void {
    app.get(
      '/ws/chat',
      { websocket: true },
      async (socket: WebSocket, request: FastifyRequest) => {
        await this.handleConnection(socket, request, app);
      }
    );
  }

  /**
   * Handle a single WebSocket connection for its whole lifetime.
   *
   * Frame protocol (JSON): server emits {type: 'status' | 'connected' |
   * 'message' | 'error'}; the client sends {type: 'message', content,
   * attachments?}.
   *
   * @param socket - The upgraded WebSocket.
   * @param request - Original upgrade request, used for authentication.
   * @param app - Fastify instance; only its logger is used here.
   */
  private async handleConnection(
    socket: WebSocket,
    request: FastifyRequest,
    app: FastifyInstance
  ): Promise<void> {
    const logger = app.log;

    // Send initial connecting message
    socket.send(
      JSON.stringify({
        type: 'status',
        status: 'authenticating',
        message: 'Authenticating...',
      })
    );

    // Authenticate (this may take time if creating container).
    // NOTE(review): a thrown (vs. null) auth failure is not caught here and
    // would reject the route handler — confirm Fastify handles that path.
    const authContext = await this.config.authenticator.authenticateWebSocket(request);
    if (!authContext) {
      logger.warn('WebSocket authentication failed');
      socket.send(
        JSON.stringify({
          type: 'error',
          message: 'Authentication failed',
        })
      );
      // 1008 = policy violation (RFC 6455), the conventional auth-failure code.
      socket.close(1008, 'Authentication failed');
      return;
    }

    logger.info(
      { userId: authContext.userId, sessionId: authContext.sessionId },
      'WebSocket connection authenticated'
    );

    // Send workspace starting message
    socket.send(
      JSON.stringify({
        type: 'status',
        status: 'initializing',
        message: 'Starting your workspace...',
      })
    );

    // Create agent harness (one per connection/session)
    const harness = new AgentHarness({
      userId: authContext.userId,
      sessionId: authContext.sessionId,
      license: authContext.license,
      providerConfig: this.config.providerConfig,
      logger,
    });

    try {
      await harness.initialize();
      this.sessions.set(authContext.sessionId, harness);

      // Send connected message
      socket.send(
        JSON.stringify({
          type: 'connected',
          sessionId: authContext.sessionId,
          userId: authContext.userId,
          licenseType: authContext.license.licenseType,
          message: 'Connected to Dexorder AI',
        })
      );

      // Handle messages. NOTE(review): frames are processed concurrently as
      // they arrive (no per-session queue), so two rapid messages may
      // interleave harness calls — confirm whether ordering matters.
      socket.on('message', async (data: Buffer) => {
        try {
          const payload = JSON.parse(data.toString());
          if (payload.type === 'message') {
            const inboundMessage: InboundMessage = {
              messageId: randomUUID(),
              userId: authContext.userId,
              sessionId: authContext.sessionId,
              content: payload.content,
              attachments: payload.attachments,
              timestamp: new Date(),
            };
            const response = await harness.handleMessage(inboundMessage);
            socket.send(
              JSON.stringify({
                type: 'message',
                ...response,
              })
            );
          }
        } catch (error) {
          logger.error({ error }, 'Error handling WebSocket message');
          socket.send(
            JSON.stringify({
              type: 'error',
              message: 'Failed to process message',
            })
          );
        }
      });

      // Handle disconnection: tear down the harness and drop the session.
      socket.on('close', async () => {
        logger.info({ sessionId: authContext.sessionId }, 'WebSocket disconnected');
        await harness.cleanup();
        this.sessions.delete(authContext.sessionId);
      });

      socket.on('error', (error) => {
        logger.error({ error, sessionId: authContext.sessionId }, 'WebSocket error');
      });
    } catch (error) {
      logger.error({ error }, 'Failed to initialize agent harness');
      // 1011 = unexpected server condition; cleanup still runs afterwards.
      socket.close(1011, 'Internal server error');
      await harness.cleanup();
    }
  }
}

View File

@@ -0,0 +1,107 @@
import { Pool, PoolClient } from 'pg';
import type { UserLicense } from '../types/user.js';
import { UserLicenseSchema } from '../types/user.js';
/**
 * Data-access layer for user licenses and channel-identity lookups,
 * backed by a pg connection pool.
 */
export class UserService {
  private pool: Pool;

  /**
   * @param connectionString - Postgres connection string. The pool holds at
   *   most 20 clients, recycles idle clients after 30s, and fails connection
   *   attempts after 2s.
   */
  constructor(connectionString: string) {
    this.pool = new Pool({
      connectionString,
      max: 20,
      idleTimeoutMillis: 30000,
      connectionTimeoutMillis: 2000,
    });
  }

  /**
   * Get user license by user ID.
   *
   * Licenses whose expires_at is in the past are treated as absent.
   *
   * @returns The schema-validated license, or null if none is current.
   * @throws If the query fails or the row does not satisfy UserLicenseSchema.
   */
  async getUserLicense(userId: string): Promise<UserLicense | null> {
    const client = await this.pool.connect();
    try {
      const result = await client.query(
        `SELECT
user_id as "userId",
email,
license_type as "licenseType",
features,
resource_limits as "resourceLimits",
mcp_server_url as "mcpServerUrl",
preferred_model as "preferredModel",
expires_at as "expiresAt",
created_at as "createdAt",
updated_at as "updatedAt"
FROM user_licenses
WHERE user_id = $1
AND (expires_at IS NULL OR expires_at > NOW())`,
        [userId]
      );
      if (result.rows.length === 0) {
        return null;
      }
      const row = result.rows[0];
      // Parse and validate so callers always get a well-formed UserLicense.
      return UserLicenseSchema.parse({
        userId: row.userId,
        email: row.email,
        licenseType: row.licenseType,
        features: row.features,
        resourceLimits: row.resourceLimits,
        mcpServerUrl: row.mcpServerUrl,
        preferredModel: row.preferredModel,
        expiresAt: row.expiresAt,
        createdAt: row.createdAt,
        updatedAt: row.updatedAt,
      });
    } finally {
      client.release();
    }
  }

  /**
   * Get user ID from channel-specific identifier (e.g. a Telegram user ID).
   *
   * @returns The linked platform user ID, or null if no link exists.
   */
  async getUserIdFromChannel(channelType: string, channelUserId: string): Promise<string | null> {
    const client = await this.pool.connect();
    try {
      const result = await client.query(
        `SELECT user_id
FROM user_channel_links
WHERE channel_type = $1 AND channel_user_id = $2`,
        [channelType, channelUserId]
      );
      return result.rows.length > 0 ? result.rows[0].user_id : null;
    } finally {
      client.release();
    }
  }

  /**
   * Verify JWT token from web client.
   * TODO: Implement JWT verification with JWKS.
   *
   * Currently decodes the payload WITHOUT signature verification
   * (INSECURE - FOR DEV ONLY) and returns its `sub` claim.
   *
   * @returns The `sub` claim, or null for malformed/unsuitable tokens.
   */
  async verifyWebToken(token: string): Promise<string | null> {
    try {
      // A JWT is exactly three dot-separated segments; reject anything else
      // rather than attempting to decode garbage.
      const parts = token.split('.');
      if (parts.length !== 3) {
        return null;
      }
      // JWT segments are base64url-encoded (RFC 7515); decoding with plain
      // 'base64' silently mis-decodes payloads containing '-' or '_'.
      const payload = JSON.parse(
        Buffer.from(parts[1], 'base64url').toString('utf8')
      );
      // `sub` must be a string; a missing or non-string claim is rejected.
      return typeof payload.sub === 'string' ? payload.sub : null;
    } catch {
      return null;
    }
  }

  /**
   * Close database pool. Call once during shutdown.
   */
  async close(): Promise<void> {
    await this.pool.end();
  }
}

View File

@@ -0,0 +1,306 @@
import type { BaseChatModel } from '@langchain/core/language_models/chat_models';
import type { BaseMessage } from '@langchain/core/messages';
import { HumanMessage, AIMessage, SystemMessage } from '@langchain/core/messages';
import type { FastifyBaseLogger } from 'fastify';
import type { UserLicense } from '../types/user.js';
import type { InboundMessage, OutboundMessage } from '../types/messages.js';
import { MCPClientConnector } from './mcp-client.js';
import { CONTEXT_URIS, type ResourceContent } from '../types/resources.js';
import { LLMProviderFactory, type ProviderConfig } from '../llm/provider.js';
import { ModelRouter, RoutingStrategy } from '../llm/router.js';
/** Construction parameters for AgentHarness. */
export interface AgentHarnessConfig {
  /** Platform user ID the harness acts on behalf of. */
  userId: string;
  /** Conversation session ID (one harness instance per session). */
  sessionId: string;
  /** User's license; supplies mcpServerUrl and model entitlements. */
  license: UserLicense;
  /** LLM provider settings forwarded to LLMProviderFactory. */
  providerConfig: ProviderConfig;
  /** Structured logger (Fastify/pino-compatible). */
  logger: FastifyBaseLogger;
}
/**
 * Agent harness orchestrates between LLM and user's MCP server.
 *
 * This is a STATELESS orchestrator - all conversation history, RAG, and context
 * lives in the user's MCP server container. The harness only:
 * 1. Fetches context from user's MCP resources
 * 2. Routes to appropriate LLM model
 * 3. Calls LLM with embedded context
 * 4. Routes tool calls to user's MCP or platform tools
 * 5. Saves messages back to user's MCP
 */
export class AgentHarness {
  private config: AgentHarnessConfig;
  private modelFactory: LLMProviderFactory;
  private modelRouter: ModelRouter;
  // Connection to this user's per-container MCP server.
  private mcpClient: MCPClientConnector;

  constructor(config: AgentHarnessConfig) {
    this.config = config;
    this.modelFactory = new LLMProviderFactory(config.providerConfig, config.logger);
    this.modelRouter = new ModelRouter(this.modelFactory, config.logger);
    this.mcpClient = new MCPClientConnector({
      userId: config.userId,
      mcpServerUrl: config.license.mcpServerUrl,
      logger: config.logger,
    });
  }

  /**
   * Initialize harness and connect to user's MCP server.
   *
   * Must be awaited before handleMessage/streamMessage.
   * @throws Rethrows any connection error from the MCP client.
   */
  async initialize(): Promise<void> {
    this.config.logger.info(
      { userId: this.config.userId, sessionId: this.config.sessionId },
      'Initializing agent harness'
    );
    try {
      await this.mcpClient.connect();
      this.config.logger.info('Agent harness initialized');
    } catch (error) {
      this.config.logger.error({ error }, 'Failed to initialize agent harness');
      throw error;
    }
  }

  /**
   * Handle incoming message from user.
   *
   * Pipeline: fetch MCP context resources -> build system prompt and message
   * list -> route to a model -> invoke the LLM -> persist both sides of the
   * exchange via the MCP 'save_message' tool.
   *
   * @returns The assistant's reply as an OutboundMessage.
   * @throws Rethrows LLM or MCP errors after logging.
   */
  async handleMessage(message: InboundMessage): Promise<OutboundMessage> {
    this.config.logger.info(
      { messageId: message.messageId, userId: message.userId },
      'Processing user message'
    );
    try {
      // 1. Fetch context resources from user's MCP server
      this.config.logger.debug('Fetching context resources from MCP');
      const contextResources = await this.fetchContextResources();
      // 2. Build system prompt from resources
      const systemPrompt = this.buildSystemPrompt(contextResources);
      // 3. Build messages with conversation context from MCP
      const messages = this.buildMessages(message, contextResources);
      // 4. Route to appropriate model
      const model = await this.modelRouter.route(
        message.content,
        this.config.license,
        RoutingStrategy.COMPLEXITY
      );
      // 5. Build LangChain messages
      const langchainMessages = this.buildLangChainMessages(systemPrompt, messages);
      // 6. Call LLM with streaming
      this.config.logger.debug('Invoking LLM');
      const response = await model.invoke(langchainMessages);
      // 7. Extract text response (tool handling TODO)
      const assistantMessage = response.content as string;
      // 8. Save messages to user's MCP server. Persistence happens only
      // after a successful LLM call; an LLM failure saves nothing.
      this.config.logger.debug('Saving messages to MCP');
      await this.mcpClient.callTool('save_message', {
        role: 'user',
        content: message.content,
        timestamp: message.timestamp.toISOString(),
      });
      await this.mcpClient.callTool('save_message', {
        role: 'assistant',
        content: assistantMessage,
        timestamp: new Date().toISOString(),
      });
      return {
        // NOTE(review): Date.now() is not collision-safe if two replies
        // complete in the same millisecond — consider randomUUID().
        messageId: `msg_${Date.now()}`,
        sessionId: message.sessionId,
        content: assistantMessage,
        timestamp: new Date(),
      };
    } catch (error) {
      this.config.logger.error({ error }, 'Error processing message');
      throw error;
    }
  }

  /**
   * Stream response from LLM.
   *
   * Same pipeline as handleMessage, but yields content chunks as they
   * arrive. Both messages are saved only after the stream completes, so an
   * interrupted stream persists nothing.
   */
  async *streamMessage(message: InboundMessage): AsyncGenerator<string> {
    try {
      // Fetch context
      const contextResources = await this.fetchContextResources();
      const systemPrompt = this.buildSystemPrompt(contextResources);
      const messages = this.buildMessages(message, contextResources);
      // Route to model
      const model = await this.modelRouter.route(
        message.content,
        this.config.license,
        RoutingStrategy.COMPLEXITY
      );
      // Build messages
      const langchainMessages = this.buildLangChainMessages(systemPrompt, messages);
      // Stream response, accumulating the full text for persistence.
      const stream = await model.stream(langchainMessages);
      let fullResponse = '';
      for await (const chunk of stream) {
        const content = chunk.content as string;
        fullResponse += content;
        yield content;
      }
      // Save after streaming completes
      await this.mcpClient.callTool('save_message', {
        role: 'user',
        content: message.content,
        timestamp: message.timestamp.toISOString(),
      });
      await this.mcpClient.callTool('save_message', {
        role: 'assistant',
        content: fullResponse,
        timestamp: new Date().toISOString(),
      });
    } catch (error) {
      this.config.logger.error({ error }, 'Error streaming message');
      throw error;
    }
  }

  /**
   * Fetch context resources from user's MCP server.
   *
   * Failures degrade gracefully: an unreadable resource becomes an empty
   * text entry rather than aborting the whole request.
   */
  private async fetchContextResources(): Promise<ResourceContent[]> {
    const contextUris = [
      CONTEXT_URIS.USER_PROFILE,
      CONTEXT_URIS.CONVERSATION_SUMMARY,
      CONTEXT_URIS.WORKSPACE_STATE,
      CONTEXT_URIS.SYSTEM_PROMPT,
    ];
    const resources = await Promise.all(
      contextUris.map(async (uri) => {
        try {
          return await this.mcpClient.readResource(uri);
        } catch (error) {
          this.config.logger.warn({ error, uri }, 'Failed to fetch resource, using empty');
          return { uri, text: '' };
        }
      })
    );
    return resources;
  }

  /**
   * Build messages array with context from resources.
   *
   * Any conversation summary is injected as a synthetic user/assistant
   * exchange ahead of the current message so the model sees prior context.
   */
  private buildMessages(
    currentMessage: InboundMessage,
    contextResources: ResourceContent[]
  ): Array<{ role: string; content: string }> {
    const conversationSummary = contextResources.find(
      (r) => r.uri === CONTEXT_URIS.CONVERSATION_SUMMARY
    );
    const messages: Array<{ role: string; content: string }> = [];
    // Add conversation context as a system-like user message
    if (conversationSummary?.text) {
      messages.push({
        role: 'user',
        content: `[Previous Conversation Context]\n${conversationSummary.text}`,
      });
      messages.push({
        role: 'assistant',
        content: 'I understand the context from our previous conversations.',
      });
    }
    // Add current user message
    messages.push({
      role: 'user',
      content: currentMessage.content,
    });
    return messages;
  }

  /**
   * Convert to LangChain message format.
   *
   * Roles other than 'user'/'assistant' are silently dropped.
   */
  private buildLangChainMessages(
    systemPrompt: string,
    messages: Array<{ role: string; content: string }>
  ): BaseMessage[] {
    const langchainMessages: BaseMessage[] = [new SystemMessage(systemPrompt)];
    for (const msg of messages) {
      if (msg.role === 'user') {
        langchainMessages.push(new HumanMessage(msg.content));
      } else if (msg.role === 'assistant') {
        langchainMessages.push(new AIMessage(msg.content));
      }
    }
    return langchainMessages;
  }

  /**
   * Build system prompt from platform base + user resources.
   *
   * Order (later sections can override earlier ones for the model):
   * platform base -> user profile -> workspace state -> user instructions.
   */
  private buildSystemPrompt(contextResources: ResourceContent[]): string {
    const userProfile = contextResources.find((r) => r.uri === CONTEXT_URIS.USER_PROFILE);
    const customPrompt = contextResources.find((r) => r.uri === CONTEXT_URIS.SYSTEM_PROMPT);
    const workspaceState = contextResources.find((r) => r.uri === CONTEXT_URIS.WORKSPACE_STATE);
    // Base platform prompt
    let prompt = `You are a helpful AI assistant for Dexorder, an AI-first trading platform.
You help users research markets, develop indicators and strategies, and analyze trading data.
User license: ${this.config.license.licenseType}
Available features: ${JSON.stringify(this.config.license.features, null, 2)}`;
    // Add user profile context
    if (userProfile?.text) {
      prompt += `\n\n# User Profile\n${userProfile.text}`;
    }
    // Add workspace context
    if (workspaceState?.text) {
      prompt += `\n\n# Current Workspace\n${workspaceState.text}`;
    }
    // Add user's custom instructions (highest priority)
    if (customPrompt?.text) {
      prompt += `\n\n# User Instructions\n${customPrompt.text}`;
    }
    return prompt;
  }

  /**
   * Get platform tools (non-user-specific tools). Currently empty.
   */
  private getPlatformTools(): Array<{ name: string; description?: string }> {
    // Platform tools that don't need user's MCP
    return [
      // TODO: Add platform tools like market data queries, chart rendering, etc.
    ];
  }

  /**
   * Cleanup resources (disconnects the MCP client). Safe to call once at
   * session end.
   */
  async cleanup(): Promise<void> {
    this.config.logger.info('Cleaning up agent harness');
    await this.mcpClient.disconnect();
  }
}

View File

@@ -0,0 +1,259 @@
import { Client } from '@modelcontextprotocol/sdk/client/index.js';
import { StdioClientTransport } from '@modelcontextprotocol/sdk/client/stdio.js';
import type { FastifyBaseLogger } from 'fastify';
/** Construction parameters for MCPClientConnector. */
export interface MCPClientConfig {
  /** Platform user whose container this client talks to. */
  userId: string;
  /** Base URL of the user's MCP server. */
  mcpServerUrl: string;
  /** Platform JWT for authenticating to the container (unused until the HTTP transport lands). */
  platformJWT?: string;
  /** Structured logger. */
  logger: FastifyBaseLogger;
}
/**
 * MCP client connector for user's container.
 * Manages connection to user-specific MCP server.
 *
 * The transport is not yet implemented: connect() currently marks the
 * connection as established without opening one, and the tool/resource
 * methods return hard-coded placeholder data.
 */
export class MCPClientConnector {
  private client: Client | null = null;
  // True once connect() has run; gates all tool/resource calls.
  private connected = false;
  private config: MCPClientConfig;

  constructor(config: MCPClientConfig) {
    this.config = config;
  }

  /**
   * Connect to user's MCP server.
   * TODO: Implement HTTP/SSE transport instead of stdio for container communication.
   *
   * Idempotent: returns immediately if already connected.
   */
  async connect(): Promise<void> {
    if (this.connected) {
      return;
    }
    try {
      this.config.logger.info(
        { userId: this.config.userId, url: this.config.mcpServerUrl },
        'Connecting to user MCP server'
      );
      this.client = new Client(
        {
          name: 'dexorder-gateway',
          version: '0.1.0',
        },
        {
          capabilities: {
            tools: {},
            resources: {},
          },
        }
      );
      // TODO: Replace with HTTP transport when user containers are ready
      // For now, this is a placeholder structure
      // const transport = new HTTPTransport(this.config.mcpServerUrl, {
      //   headers: {
      //     'Authorization': `Bearer ${this.config.platformJWT}`
      //   }
      // });
      // Placeholder: will be replaced with actual container transport
      this.config.logger.warn(
        'MCP transport not yet implemented - using placeholder'
      );
      this.connected = true;
      this.config.logger.info('Connected to user MCP server');
    } catch (error) {
      this.config.logger.error(
        { error, userId: this.config.userId },
        'Failed to connect to user MCP server'
      );
      throw error;
    }
  }

  /**
   * Call a tool on the user's MCP server.
   *
   * @param name - Tool name (see listTools for the expected set).
   * @param args - Tool arguments, passed through as-is.
   * @returns Currently a static placeholder object, not a real result.
   * @throws Error if called before connect().
   */
  async callTool(name: string, args: Record<string, unknown>): Promise<unknown> {
    if (!this.client || !this.connected) {
      throw new Error('MCP client not connected');
    }
    try {
      this.config.logger.debug({ tool: name, args }, 'Calling MCP tool');
      // TODO: Implement when MCP client is connected
      // const result = await this.client.callTool({ name, arguments: args });
      // return result;
      // Placeholder response
      return { success: true, message: 'MCP tool call placeholder' };
    } catch (error) {
      this.config.logger.error({ error, tool: name }, 'MCP tool call failed');
      throw error;
    }
  }

  /**
   * List available tools from user's MCP server.
   *
   * @returns A hard-coded placeholder tool list until the transport exists.
   * @throws Error if called before connect().
   */
  async listTools(): Promise<Array<{ name: string; description?: string }>> {
    if (!this.client || !this.connected) {
      throw new Error('MCP client not connected');
    }
    try {
      // TODO: Implement when MCP client is connected
      // const tools = await this.client.listTools();
      // return tools;
      // Placeholder tools (actions only, not context)
      return [
        { name: 'save_message', description: 'Save message to conversation history' },
        { name: 'list_strategies', description: 'List user strategies' },
        { name: 'read_strategy', description: 'Read strategy code' },
        { name: 'write_strategy', description: 'Write strategy code' },
        { name: 'run_backtest', description: 'Run backtest on strategy' },
        { name: 'get_watchlist', description: 'Get user watchlist' },
        { name: 'execute_trade', description: 'Execute trade' },
      ];
    } catch (error) {
      this.config.logger.error({ error }, 'Failed to list MCP tools');
      throw error;
    }
  }

  /**
   * List available resources from user's MCP server.
   *
   * @returns A hard-coded placeholder resource list until the transport exists.
   * @throws Error if called before connect().
   */
  async listResources(): Promise<Array<{ uri: string; name: string; description?: string; mimeType?: string }>> {
    if (!this.client || !this.connected) {
      throw new Error('MCP client not connected');
    }
    try {
      // TODO: Implement when MCP client is connected
      // const resources = await this.client.listResources();
      // return resources;
      // Placeholder resources for user context
      return [
        {
          uri: 'context://user-profile',
          name: 'User Profile',
          description: 'User trading style, preferences, and background',
          mimeType: 'text/plain',
        },
        {
          uri: 'context://conversation-summary',
          name: 'Conversation Summary',
          description: 'Semantic summary of recent conversation history with RAG',
          mimeType: 'text/plain',
        },
        {
          uri: 'context://workspace-state',
          name: 'Workspace State',
          description: 'Current chart, watchlist, and open positions',
          mimeType: 'application/json',
        },
        {
          uri: 'context://system-prompt',
          name: 'Custom System Prompt',
          description: 'User custom instructions for the assistant',
          mimeType: 'text/plain',
        },
      ];
    } catch (error) {
      this.config.logger.error({ error }, 'Failed to list MCP resources');
      throw error;
    }
  }

  /**
   * Read a resource from user's MCP server.
   *
   * Returns canned placeholder content for the four known context:// URIs
   * and an empty-text stub for any other URI.
   *
   * @throws Error if called before connect().
   */
  async readResource(uri: string): Promise<{ uri: string; mimeType?: string; text?: string; blob?: string }> {
    if (!this.client || !this.connected) {
      throw new Error('MCP client not connected');
    }
    try {
      this.config.logger.debug({ uri }, 'Reading MCP resource');
      // TODO: Implement when MCP client is connected
      // const resource = await this.client.readResource({ uri });
      // return resource;
      // Placeholder resource content
      if (uri === 'context://user-profile') {
        return {
          uri,
          mimeType: 'text/plain',
          text: `User Profile:
- Trading experience: Intermediate
- Preferred timeframes: 1h, 4h, 1d
- Risk tolerance: Medium
- Focus: Swing trading with technical indicators`,
        };
      } else if (uri === 'context://conversation-summary') {
        return {
          uri,
          mimeType: 'text/plain',
          text: `Recent Conversation Summary:
[RAG-generated summary would go here]
User recently discussed:
- Moving average crossover strategies
- Backtesting on BTC/USDT
- Risk management techniques`,
        };
      } else if (uri === 'context://workspace-state') {
        return {
          uri,
          mimeType: 'application/json',
          text: JSON.stringify({
            currentChart: { ticker: 'BINANCE:BTC/USDT', timeframe: '1h' },
            watchlist: ['BTC/USDT', 'ETH/USDT', 'SOL/USDT'],
            openPositions: [],
          }, null, 2),
        };
      } else if (uri === 'context://system-prompt') {
        return {
          uri,
          mimeType: 'text/plain',
          text: `Custom Instructions:
- Be concise and data-driven
- Always show risk/reward ratios
- Prefer simple strategies over complex ones`,
        };
      }
      return { uri, text: '' };
    } catch (error) {
      this.config.logger.error({ error, uri }, 'MCP resource read failed');
      throw error;
    }
  }

  /**
   * Disconnect from MCP server. Errors are logged, never thrown.
   *
   * NOTE(review): if client.close() throws, `connected` stays true and a
   * later connect() becomes a no-op — confirm whether the flag should be
   * cleared in a finally block.
   */
  async disconnect(): Promise<void> {
    if (this.client && this.connected) {
      try {
        await this.client.close();
        this.connected = false;
        this.config.logger.info('Disconnected from user MCP server');
      } catch (error) {
        this.config.logger.error({ error }, 'Error disconnecting from MCP server');
      }
    }
  }

  /** Whether connect() has completed successfully. */
  isConnected(): boolean {
    return this.connected;
  }
}

327
gateway/src/k8s/client.ts Normal file
View File

@@ -0,0 +1,327 @@
import * as k8s from '@kubernetes/client-node';
import type { FastifyBaseLogger } from 'fastify';
import * as yaml from 'js-yaml';
import * as fs from 'fs/promises';
import * as path from 'path';
import { fileURLToPath } from 'url';
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
/** Configuration for KubernetesClient. */
export interface K8sClientConfig {
  /** Namespace all agent resources are created in and read from. */
  namespace: string;
  /** Load in-cluster service-account config when true; kubeconfig otherwise. */
  inCluster: boolean;
  /** Optional kubeconfig context to select. */
  context?: string; // For local dev
  /** Structured logger. */
  logger: FastifyBaseLogger;
}
/** Inputs for rendering and creating one user's agent deployment. */
export interface DeploymentSpec {
  /** Platform user ID (sanitized into Kubernetes resource names). */
  userId: string;
  /** License tier; selects the `<tier>-tier.yaml` template. */
  licenseType: 'free' | 'pro' | 'enterprise';
  /** Image reference for the agent container. */
  agentImage: string;
  /** Image reference for the lifecycle sidecar container. */
  sidecarImage: string;
  /** StorageClass name substituted into the PVC template. */
  storageClass: string;
}
/**
 * Kubernetes client wrapper for managing agent deployments.
 *
 * Renders per-tier YAML templates into Deployment/PVC/Service resources,
 * applies them in the configured namespace, and polls for readiness.
 *
 * NOTE(review): error handling reads `error.response?.statusCode`, which
 * matches the request-based @kubernetes/client-node API (<0.21) — confirm
 * the installed client version uses that error shape.
 */
export class KubernetesClient {
  private config: K8sClientConfig;
  private k8sConfig: k8s.KubeConfig;
  private appsApi: k8s.AppsV1Api;
  private coreApi: k8s.CoreV1Api;

  constructor(config: K8sClientConfig) {
    this.config = config;
    this.k8sConfig = new k8s.KubeConfig();
    if (config.inCluster) {
      // Running inside the cluster: use the pod's service account.
      this.k8sConfig.loadFromCluster();
      this.config.logger.info('Loaded in-cluster Kubernetes config');
    } else {
      // Local development: use ~/.kube/config, optionally pinning a context.
      this.k8sConfig.loadFromDefault();
      if (config.context) {
        this.k8sConfig.setCurrentContext(config.context);
        this.config.logger.info({ context: config.context }, 'Set Kubernetes context');
      }
      this.config.logger.info('Loaded Kubernetes config from default location');
    }
    this.appsApi = this.k8sConfig.makeApiClient(k8s.AppsV1Api);
    this.coreApi = this.k8sConfig.makeApiClient(k8s.CoreV1Api);
  }

  /**
   * Generate deployment name from user ID.
   *
   * NOTE(review): the sanitization maps distinct userIds (e.g. differing
   * only in case or punctuation) to the same name — confirm userIds are
   * already collision-free under this mapping.
   */
  static getDeploymentName(userId: string): string {
    // Sanitize userId to be k8s-compliant (lowercase alphanumeric + hyphens)
    const sanitized = userId.toLowerCase().replace(/[^a-z0-9-]/g, '-');
    return `agent-${sanitized}`;
  }

  /**
   * Generate service name (same as deployment).
   */
  static getServiceName(userId: string): string {
    return this.getDeploymentName(userId);
  }

  /**
   * Generate PVC name (deployment name + "-data").
   */
  static getPvcName(userId: string): string {
    return `${this.getDeploymentName(userId)}-data`;
  }

  /**
   * Compute MCP endpoint URL from service name.
   *
   * Uses the cluster-internal DNS form `<svc>.<ns>.svc.cluster.local:3000`.
   */
  static getMcpEndpoint(userId: string, namespace: string): string {
    const serviceName = this.getServiceName(userId);
    return `http://${serviceName}.${namespace}.svc.cluster.local:3000`;
  }

  /**
   * Check if deployment exists.
   *
   * @returns true if the Deployment is readable; false on a 404.
   * @throws Any non-404 API error.
   */
  async deploymentExists(deploymentName: string): Promise<boolean> {
    try {
      await this.appsApi.readNamespacedDeployment(deploymentName, this.config.namespace);
      return true;
    } catch (error: any) {
      if (error.response?.statusCode === 404) {
        return false;
      }
      throw error;
    }
  }

  /**
   * Create agent deployment from template.
   *
   * Loads `templates/<tier>-tier.yaml` next to this module, substitutes the
   * {{...}} variables, and applies every document (Deployment, PVC, Service)
   * it contains. Already-existing resources (409) are skipped, which makes
   * the call idempotent per resource.
   *
   * @throws On template read failure or any non-409 API error.
   */
  async createAgentDeployment(spec: DeploymentSpec): Promise<void> {
    const deploymentName = KubernetesClient.getDeploymentName(spec.userId);
    const serviceName = KubernetesClient.getServiceName(spec.userId);
    const pvcName = KubernetesClient.getPvcName(spec.userId);
    this.config.logger.info(
      { userId: spec.userId, licenseType: spec.licenseType, deploymentName },
      'Creating agent deployment'
    );
    // Load template based on license type
    const templatePath = path.join(
      __dirname,
      'templates',
      `${spec.licenseType}-tier.yaml`
    );
    const templateContent = await fs.readFile(templatePath, 'utf-8');
    // Substitute variables
    const rendered = templateContent
      .replace(/\{\{userId\}\}/g, spec.userId)
      .replace(/\{\{deploymentName\}\}/g, deploymentName)
      .replace(/\{\{serviceName\}\}/g, serviceName)
      .replace(/\{\{pvcName\}\}/g, pvcName)
      .replace(/\{\{agentImage\}\}/g, spec.agentImage)
      .replace(/\{\{sidecarImage\}\}/g, spec.sidecarImage)
      .replace(/\{\{storageClass\}\}/g, spec.storageClass);
    // Parse YAML documents (deployment, pvc, service)
    const documents = yaml.loadAll(rendered) as any[];
    // Apply each resource
    for (const doc of documents) {
      if (!doc || !doc.kind) continue;
      try {
        switch (doc.kind) {
          case 'Deployment':
            await this.appsApi.createNamespacedDeployment(this.config.namespace, doc);
            this.config.logger.info({ deploymentName }, 'Created deployment');
            break;
          case 'PersistentVolumeClaim':
            await this.coreApi.createNamespacedPersistentVolumeClaim(
              this.config.namespace,
              doc
            );
            this.config.logger.info({ pvcName }, 'Created PVC');
            break;
          case 'Service':
            await this.coreApi.createNamespacedService(this.config.namespace, doc);
            this.config.logger.info({ serviceName }, 'Created service');
            break;
          default:
            this.config.logger.warn({ kind: doc.kind }, 'Unknown resource kind in template');
        }
      } catch (error: any) {
        // If resource already exists, log warning but continue
        if (error.response?.statusCode === 409) {
          this.config.logger.warn(
            { kind: doc.kind, name: doc.metadata?.name },
            'Resource already exists, skipping'
          );
        } else {
          throw error;
        }
      }
    }
    this.config.logger.info({ deploymentName }, 'Agent deployment created successfully');
  }

  /**
   * Wait for deployment to be ready.
   *
   * Polls every 2s until at least one replica is both available and ready,
   * a Progressing=False condition appears, the deployment vanishes (404),
   * or the timeout elapses.
   *
   * @param deploymentName - Deployment to watch.
   * @param timeoutMs - Maximum time to wait (default 120s).
   * @returns true if ready; false on failure, 404, or timeout.
   */
  async waitForDeploymentReady(
    deploymentName: string,
    timeoutMs: number = 120000
  ): Promise<boolean> {
    const startTime = Date.now();
    const pollInterval = 2000; // 2 seconds
    this.config.logger.info(
      { deploymentName, timeoutMs },
      'Waiting for deployment to be ready'
    );
    while (Date.now() - startTime < timeoutMs) {
      try {
        const response = await this.appsApi.readNamespacedDeployment(
          deploymentName,
          this.config.namespace
        );
        const deployment = response.body;
        const status = deployment.status;
        // Check if deployment is ready
        if (
          status?.availableReplicas &&
          status.availableReplicas > 0 &&
          status.readyReplicas &&
          status.readyReplicas > 0
        ) {
          this.config.logger.info({ deploymentName }, 'Deployment is ready');
          return true;
        }
        // Check for failure conditions
        if (status?.conditions) {
          const failedCondition = status.conditions.find(
            (c) => c.type === 'Progressing' && c.status === 'False'
          );
          if (failedCondition) {
            this.config.logger.error(
              { deploymentName, reason: failedCondition.reason, message: failedCondition.message },
              'Deployment failed to progress'
            );
            return false;
          }
        }
        this.config.logger.debug(
          {
            deploymentName,
            replicas: status?.replicas,
            ready: status?.readyReplicas,
            available: status?.availableReplicas,
          },
          'Deployment not ready yet, waiting...'
        );
        await new Promise((resolve) => setTimeout(resolve, pollInterval));
      } catch (error: any) {
        if (error.response?.statusCode === 404) {
          this.config.logger.warn({ deploymentName }, 'Deployment not found');
          return false;
        }
        throw error;
      }
    }
    this.config.logger.warn({ deploymentName, timeoutMs }, 'Deployment readiness timeout');
    return false;
  }

  /**
   * Get service endpoint URL.
   *
   * Only ClusterIP services are supported; the port named 'mcp' is used,
   * defaulting to 3000 when absent.
   *
   * @returns Internal DNS URL, or null for missing/unsupported services.
   */
  async getServiceEndpoint(serviceName: string): Promise<string | null> {
    try {
      const response = await this.coreApi.readNamespacedService(
        serviceName,
        this.config.namespace
      );
      const service = response.body;
      // For ClusterIP services, return internal DNS name
      if (service.spec?.type === 'ClusterIP') {
        const port = service.spec.ports?.find((p) => p.name === 'mcp')?.port || 3000;
        return `http://${serviceName}.${this.config.namespace}.svc.cluster.local:${port}`;
      }
      // For other service types (NodePort, LoadBalancer), would need different logic
      this.config.logger.warn(
        { serviceName, type: service.spec?.type },
        'Unexpected service type'
      );
      return null;
    } catch (error: any) {
      if (error.response?.statusCode === 404) {
        this.config.logger.warn({ serviceName }, 'Service not found');
        return null;
      }
      throw error;
    }
  }

  /**
   * Delete deployment and associated resources.
   * (Used for cleanup/testing - normally handled by lifecycle sidecar.)
   *
   * Best-effort: each delete failure (other than 404) is logged, never
   * thrown, so remaining resources are still attempted.
   */
  async deleteAgentDeployment(userId: string): Promise<void> {
    const deploymentName = KubernetesClient.getDeploymentName(userId);
    const serviceName = KubernetesClient.getServiceName(userId);
    const pvcName = KubernetesClient.getPvcName(userId);
    this.config.logger.info({ userId, deploymentName }, 'Deleting agent deployment');
    // Delete deployment
    try {
      await this.appsApi.deleteNamespacedDeployment(deploymentName, this.config.namespace);
      this.config.logger.info({ deploymentName }, 'Deleted deployment');
    } catch (error: any) {
      if (error.response?.statusCode !== 404) {
        this.config.logger.warn({ deploymentName, error }, 'Failed to delete deployment');
      }
    }
    // Delete service
    try {
      await this.coreApi.deleteNamespacedService(serviceName, this.config.namespace);
      this.config.logger.info({ serviceName }, 'Deleted service');
    } catch (error: any) {
      if (error.response?.statusCode !== 404) {
        this.config.logger.warn({ serviceName, error }, 'Failed to delete service');
      }
    }
    // Delete PVC (destroys the user's persisted data)
    try {
      await this.coreApi.deleteNamespacedPersistentVolumeClaim(pvcName, this.config.namespace);
      this.config.logger.info({ pvcName }, 'Deleted PVC');
    } catch (error: any) {
      if (error.response?.statusCode !== 404) {
        this.config.logger.warn({ pvcName, error }, 'Failed to delete PVC');
      }
    }
  }
}

View File

@@ -0,0 +1,118 @@
import type { FastifyBaseLogger } from 'fastify';
import { KubernetesClient, type DeploymentSpec } from './client.js';
import type { UserLicense } from '../types/user.js';
/** Dependencies and settings for ContainerManager. */
export interface ContainerManagerConfig {
  /** Kubernetes client used for all cluster operations. */
  k8sClient: KubernetesClient;
  /** Agent container image reference. */
  agentImage: string;
  /** Lifecycle sidecar image reference. */
  sidecarImage: string;
  /** StorageClass for per-user PVCs. */
  storageClass: string;
  /** Namespace agent deployments live in (used to build MCP endpoints). */
  namespace: string;
  /** Structured logger. */
  logger: FastifyBaseLogger;
}
/** Snapshot of a user's container deployment state. */
export interface ContainerStatus {
  /** Whether the Deployment object exists in the namespace. */
  exists: boolean;
  /** Whether the Deployment reported ready replicas within the poll window. */
  ready: boolean;
  /** Cluster-internal MCP endpoint URL (computed even when not running). */
  mcpEndpoint: string;
}
/**
 * Container manager orchestrates agent container lifecycle.
 *
 * Thin policy layer over KubernetesClient: decides whether a user's agent
 * deployment must be created or merely awaited, and derives the MCP
 * endpoint the gateway should talk to.
 */
export class ContainerManager {
  private config: ContainerManagerConfig;

  constructor(config: ContainerManagerConfig) {
    this.config = config;
  }

  /**
   * Ensure user's container is running and ready.
   *
   * @param userId - Platform user whose container is required.
   * @param license - License driving tier selection for a new deployment.
   * @returns The MCP endpoint URL plus whether a deployment was created.
   * @throws Error when a freshly created deployment never becomes ready.
   */
  async ensureContainerRunning(
    userId: string,
    license: UserLicense
  ): Promise<{ mcpEndpoint: string; wasCreated: boolean }> {
    const name = KubernetesClient.getDeploymentName(userId);
    const endpoint = KubernetesClient.getMcpEndpoint(userId, this.config.namespace);
    const log = this.config.logger;

    log.info(
      { userId, licenseType: license.licenseType, deploymentName: name },
      'Ensuring container is running'
    );

    // Fast path: deployment already exists — give it a short window to
    // finish starting, then hand back the endpoint either way.
    if (await this.config.k8sClient.deploymentExists(name)) {
      log.info({ userId, deploymentName: name }, 'Container deployment already exists');
      const becameReady = await this.config.k8sClient.waitForDeploymentReady(name, 30000);
      if (!becameReady) {
        log.warn(
          { userId, deploymentName: name },
          'Existing deployment not ready within timeout'
        );
        // Continue anyway — might be an image pull or other transient issue.
      }
      return { mcpEndpoint: endpoint, wasCreated: false };
    }

    // Slow path: render and apply a new deployment for this user's tier.
    log.info({ userId, licenseType: license.licenseType }, 'Creating new container');
    await this.config.k8sClient.createAgentDeployment({
      userId,
      licenseType: license.licenseType,
      agentImage: this.config.agentImage,
      sidecarImage: this.config.sidecarImage,
      storageClass: this.config.storageClass,
    });

    // A brand-new deployment must actually come up before we return.
    if (!(await this.config.k8sClient.waitForDeploymentReady(name, 120000))) {
      throw new Error(
        `Container deployment failed to become ready within timeout: ${name}`
      );
    }

    log.info({ userId, mcpEndpoint: endpoint }, 'Container is ready');
    return { mcpEndpoint: endpoint, wasCreated: true };
  }

  /**
   * Check container status without creating it.
   *
   * Uses a short (5s) readiness poll so callers are never blocked for long.
   */
  async getContainerStatus(userId: string): Promise<ContainerStatus> {
    const name = KubernetesClient.getDeploymentName(userId);
    const endpoint = KubernetesClient.getMcpEndpoint(userId, this.config.namespace);

    if (!(await this.config.k8sClient.deploymentExists(name))) {
      return { exists: false, ready: false, mcpEndpoint: endpoint };
    }
    const ready = await this.config.k8sClient.waitForDeploymentReady(name, 5000);
    return { exists: true, ready, mcpEndpoint: endpoint };
  }

  /**
   * Delete container (for cleanup/testing).
   */
  async deleteContainer(userId: string): Promise<void> {
    await this.config.k8sClient.deleteAgentDeployment(userId);
  }
}

View File

@@ -0,0 +1,199 @@
# Enterprise tier agent deployment template
# Variables: {{userId}}, {{deploymentName}}, {{pvcName}}, {{serviceName}}
# Enterprise: No idle shutdown, larger resources
#
# All template substitutions are quoted so the rendered document always
# yields YAML strings: an all-digit userId (or tag-bearing image ref) would
# otherwise be parsed as a non-string scalar and rejected by the API server
# for label values and env values, which must be strings.
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: "{{deploymentName}}"
  namespace: dexorder-agents
  labels:
    app.kubernetes.io/name: agent
    app.kubernetes.io/component: user-agent
    dexorder.io/component: agent
    dexorder.io/user-id: "{{userId}}"
    dexorder.io/deployment: "{{deploymentName}}"
    dexorder.io/license-tier: enterprise
spec:
  replicas: 1
  selector:
    matchLabels:
      dexorder.io/user-id: "{{userId}}"
  template:
    metadata:
      labels:
        dexorder.io/component: agent
        dexorder.io/user-id: "{{userId}}"
        dexorder.io/deployment: "{{deploymentName}}"
        dexorder.io/license-tier: enterprise
    spec:
      serviceAccountName: agent-lifecycle
      # Shared PID namespace lets the lifecycle sidecar observe the agent process.
      shareProcessNamespace: true
      securityContext:
        runAsNonRoot: true
        runAsUser: 1000
        fsGroup: 1000
        seccompProfile:
          type: RuntimeDefault
      containers:
        - name: agent
          image: "{{agentImage}}"
          imagePullPolicy: Always
          securityContext:
            allowPrivilegeEscalation: false
            runAsNonRoot: true
            runAsUser: 1000
            readOnlyRootFilesystem: true
            capabilities:
              drop:
                - ALL
          resources:
            requests:
              memory: "1Gi"
              cpu: "500m"
            limits:
              memory: "4Gi"
              cpu: "4000m"
          env:
            - name: USER_ID
              value: "{{userId}}"
            # Enterprise tier: idle shutdown disabled entirely.
            - name: IDLE_TIMEOUT_MINUTES
              value: "0"
            - name: IDLE_CHECK_INTERVAL_SECONDS
              value: "60"
            - name: ENABLE_IDLE_SHUTDOWN
              value: "false"
            - name: MCP_SERVER_PORT
              value: "3000"
            - name: ZMQ_CONTROL_PORT
              value: "5555"
          ports:
            - name: mcp
              containerPort: 3000
              protocol: TCP
            - name: zmq-control
              containerPort: 5555
              protocol: TCP
          volumeMounts:
            - name: agent-data
              mountPath: /app/data
            - name: tmp
              mountPath: /tmp
            - name: shared-run
              mountPath: /var/run/agent
          livenessProbe:
            httpGet:
              path: /health
              port: mcp
            initialDelaySeconds: 10
            periodSeconds: 30
            timeoutSeconds: 5
          readinessProbe:
            httpGet:
              path: /ready
              port: mcp
            initialDelaySeconds: 5
            periodSeconds: 10
        - name: lifecycle-sidecar
          image: "{{sidecarImage}}"
          imagePullPolicy: Always
          securityContext:
            allowPrivilegeEscalation: false
            runAsNonRoot: true
            runAsUser: 1000
            readOnlyRootFilesystem: true
            capabilities:
              drop:
                - ALL
          resources:
            requests:
              memory: "32Mi"
              cpu: "10m"
            limits:
              memory: "64Mi"
              cpu: "50m"
          env:
            - name: NAMESPACE
              valueFrom:
                fieldRef:
                  fieldPath: metadata.namespace
            - name: DEPLOYMENT_NAME
              valueFrom:
                fieldRef:
                  fieldPath: metadata.labels['dexorder.io/deployment']
            - name: USER_TYPE
              value: "enterprise"
            # NOTE(review): with shareProcessNamespace enabled, PID 1 in the pod
            # is the pause (infra) container, not the agent process — confirm the
            # sidecar resolves the real agent PID rather than trusting this value.
            - name: MAIN_CONTAINER_PID
              value: "1"
          volumeMounts:
            - name: shared-run
              mountPath: /var/run/agent
              readOnly: true
      volumes:
        - name: agent-data
          persistentVolumeClaim:
            claimName: "{{pvcName}}"
        - name: tmp
          emptyDir:
            medium: Memory
            sizeLimit: 512Mi
        - name: shared-run
          emptyDir:
            medium: Memory
            sizeLimit: 1Mi
      restartPolicy: Always
      terminationGracePeriodSeconds: 30
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: "{{pvcName}}"
  namespace: dexorder-agents
  labels:
    dexorder.io/user-id: "{{userId}}"
    dexorder.io/license-tier: enterprise
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 50Gi
  storageClassName: "{{storageClass}}"
---
apiVersion: v1
kind: Service
metadata:
  name: "{{serviceName}}"
  namespace: dexorder-agents
  labels:
    dexorder.io/user-id: "{{userId}}"
    dexorder.io/license-tier: enterprise
spec:
  type: ClusterIP
  selector:
    dexorder.io/user-id: "{{userId}}"
  ports:
    - name: mcp
      port: 3000
      targetPort: mcp
      protocol: TCP
    - name: zmq-control
      port: 5555
      targetPort: zmq-control
      protocol: TCP

View File

@@ -0,0 +1,198 @@
# Free tier agent deployment template
# Variables: {{userId}}, {{deploymentName}}, {{pvcName}}, {{serviceName}}
#
# All template substitutions are quoted so the rendered document always
# yields YAML strings: an all-digit userId (or tag-bearing image ref) would
# otherwise be parsed as a non-string scalar and rejected by the API server
# for label values and env values, which must be strings.
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: "{{deploymentName}}"
  namespace: dexorder-agents
  labels:
    app.kubernetes.io/name: agent
    app.kubernetes.io/component: user-agent
    dexorder.io/component: agent
    dexorder.io/user-id: "{{userId}}"
    dexorder.io/deployment: "{{deploymentName}}"
    dexorder.io/license-tier: free
spec:
  replicas: 1
  selector:
    matchLabels:
      dexorder.io/user-id: "{{userId}}"
  template:
    metadata:
      labels:
        dexorder.io/component: agent
        dexorder.io/user-id: "{{userId}}"
        dexorder.io/deployment: "{{deploymentName}}"
        dexorder.io/license-tier: free
    spec:
      serviceAccountName: agent-lifecycle
      # Shared PID namespace lets the lifecycle sidecar observe the agent process.
      shareProcessNamespace: true
      securityContext:
        runAsNonRoot: true
        runAsUser: 1000
        fsGroup: 1000
        seccompProfile:
          type: RuntimeDefault
      containers:
        - name: agent
          image: "{{agentImage}}"
          imagePullPolicy: Always
          securityContext:
            allowPrivilegeEscalation: false
            runAsNonRoot: true
            runAsUser: 1000
            readOnlyRootFilesystem: true
            capabilities:
              drop:
                - ALL
          resources:
            requests:
              memory: "256Mi"
              cpu: "100m"
            limits:
              memory: "512Mi"
              cpu: "500m"
          env:
            - name: USER_ID
              value: "{{userId}}"
            # Free tier: shut the agent down after 15 idle minutes.
            - name: IDLE_TIMEOUT_MINUTES
              value: "15"
            - name: IDLE_CHECK_INTERVAL_SECONDS
              value: "60"
            - name: ENABLE_IDLE_SHUTDOWN
              value: "true"
            - name: MCP_SERVER_PORT
              value: "3000"
            - name: ZMQ_CONTROL_PORT
              value: "5555"
          ports:
            - name: mcp
              containerPort: 3000
              protocol: TCP
            - name: zmq-control
              containerPort: 5555
              protocol: TCP
          volumeMounts:
            - name: agent-data
              mountPath: /app/data
            - name: tmp
              mountPath: /tmp
            - name: shared-run
              mountPath: /var/run/agent
          livenessProbe:
            httpGet:
              path: /health
              port: mcp
            initialDelaySeconds: 10
            periodSeconds: 30
            timeoutSeconds: 5
          readinessProbe:
            httpGet:
              path: /ready
              port: mcp
            initialDelaySeconds: 5
            periodSeconds: 10
        - name: lifecycle-sidecar
          image: "{{sidecarImage}}"
          imagePullPolicy: Always
          securityContext:
            allowPrivilegeEscalation: false
            runAsNonRoot: true
            runAsUser: 1000
            readOnlyRootFilesystem: true
            capabilities:
              drop:
                - ALL
          resources:
            requests:
              memory: "32Mi"
              cpu: "10m"
            limits:
              memory: "64Mi"
              cpu: "50m"
          env:
            - name: NAMESPACE
              valueFrom:
                fieldRef:
                  fieldPath: metadata.namespace
            - name: DEPLOYMENT_NAME
              valueFrom:
                fieldRef:
                  fieldPath: metadata.labels['dexorder.io/deployment']
            - name: USER_TYPE
              value: "free"
            # NOTE(review): with shareProcessNamespace enabled, PID 1 in the pod
            # is the pause (infra) container, not the agent process — confirm the
            # sidecar resolves the real agent PID rather than trusting this value.
            - name: MAIN_CONTAINER_PID
              value: "1"
          volumeMounts:
            - name: shared-run
              mountPath: /var/run/agent
              readOnly: true
      volumes:
        - name: agent-data
          persistentVolumeClaim:
            claimName: "{{pvcName}}"
        - name: tmp
          emptyDir:
            medium: Memory
            sizeLimit: 128Mi
        - name: shared-run
          emptyDir:
            medium: Memory
            sizeLimit: 1Mi
      restartPolicy: Always
      terminationGracePeriodSeconds: 30
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: "{{pvcName}}"
  namespace: dexorder-agents
  labels:
    dexorder.io/user-id: "{{userId}}"
    dexorder.io/license-tier: free
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 1Gi
  storageClassName: "{{storageClass}}"
---
apiVersion: v1
kind: Service
metadata:
  name: "{{serviceName}}"
  namespace: dexorder-agents
  labels:
    dexorder.io/user-id: "{{userId}}"
    dexorder.io/license-tier: free
spec:
  type: ClusterIP
  selector:
    dexorder.io/user-id: "{{userId}}"
  ports:
    - name: mcp
      port: 3000
      targetPort: mcp
      protocol: TCP
    - name: zmq-control
      port: 5555
      targetPort: zmq-control
      protocol: TCP

View File

@@ -0,0 +1,198 @@
# Pro tier agent deployment template
# Variables: {{userId}}, {{deploymentName}}, {{pvcName}}, {{serviceName}}
#
# All template substitutions are quoted so the rendered document always
# yields YAML strings: an all-digit userId (or tag-bearing image ref) would
# otherwise be parsed as a non-string scalar and rejected by the API server
# for label values and env values, which must be strings.
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: "{{deploymentName}}"
  namespace: dexorder-agents
  labels:
    app.kubernetes.io/name: agent
    app.kubernetes.io/component: user-agent
    dexorder.io/component: agent
    dexorder.io/user-id: "{{userId}}"
    dexorder.io/deployment: "{{deploymentName}}"
    dexorder.io/license-tier: pro
spec:
  replicas: 1
  selector:
    matchLabels:
      dexorder.io/user-id: "{{userId}}"
  template:
    metadata:
      labels:
        dexorder.io/component: agent
        dexorder.io/user-id: "{{userId}}"
        dexorder.io/deployment: "{{deploymentName}}"
        dexorder.io/license-tier: pro
    spec:
      serviceAccountName: agent-lifecycle
      # Shared PID namespace lets the lifecycle sidecar observe the agent process.
      shareProcessNamespace: true
      securityContext:
        runAsNonRoot: true
        runAsUser: 1000
        fsGroup: 1000
        seccompProfile:
          type: RuntimeDefault
      containers:
        - name: agent
          image: "{{agentImage}}"
          imagePullPolicy: Always
          securityContext:
            allowPrivilegeEscalation: false
            runAsNonRoot: true
            runAsUser: 1000
            readOnlyRootFilesystem: true
            capabilities:
              drop:
                - ALL
          resources:
            requests:
              memory: "512Mi"
              cpu: "250m"
            limits:
              memory: "2Gi"
              cpu: "2000m"
          env:
            - name: USER_ID
              value: "{{userId}}"
            # Pro tier: shut the agent down after 60 idle minutes.
            - name: IDLE_TIMEOUT_MINUTES
              value: "60"
            - name: IDLE_CHECK_INTERVAL_SECONDS
              value: "60"
            - name: ENABLE_IDLE_SHUTDOWN
              value: "true"
            - name: MCP_SERVER_PORT
              value: "3000"
            - name: ZMQ_CONTROL_PORT
              value: "5555"
          ports:
            - name: mcp
              containerPort: 3000
              protocol: TCP
            - name: zmq-control
              containerPort: 5555
              protocol: TCP
          volumeMounts:
            - name: agent-data
              mountPath: /app/data
            - name: tmp
              mountPath: /tmp
            - name: shared-run
              mountPath: /var/run/agent
          livenessProbe:
            httpGet:
              path: /health
              port: mcp
            initialDelaySeconds: 10
            periodSeconds: 30
            timeoutSeconds: 5
          readinessProbe:
            httpGet:
              path: /ready
              port: mcp
            initialDelaySeconds: 5
            periodSeconds: 10
        - name: lifecycle-sidecar
          image: "{{sidecarImage}}"
          imagePullPolicy: Always
          securityContext:
            allowPrivilegeEscalation: false
            runAsNonRoot: true
            runAsUser: 1000
            readOnlyRootFilesystem: true
            capabilities:
              drop:
                - ALL
          resources:
            requests:
              memory: "32Mi"
              cpu: "10m"
            limits:
              memory: "64Mi"
              cpu: "50m"
          env:
            - name: NAMESPACE
              valueFrom:
                fieldRef:
                  fieldPath: metadata.namespace
            - name: DEPLOYMENT_NAME
              valueFrom:
                fieldRef:
                  fieldPath: metadata.labels['dexorder.io/deployment']
            - name: USER_TYPE
              value: "pro"
            # NOTE(review): with shareProcessNamespace enabled, PID 1 in the pod
            # is the pause (infra) container, not the agent process — confirm the
            # sidecar resolves the real agent PID rather than trusting this value.
            - name: MAIN_CONTAINER_PID
              value: "1"
          volumeMounts:
            - name: shared-run
              mountPath: /var/run/agent
              readOnly: true
      volumes:
        - name: agent-data
          persistentVolumeClaim:
            claimName: "{{pvcName}}"
        - name: tmp
          emptyDir:
            medium: Memory
            sizeLimit: 256Mi
        - name: shared-run
          emptyDir:
            medium: Memory
            sizeLimit: 1Mi
      restartPolicy: Always
      terminationGracePeriodSeconds: 30
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: "{{pvcName}}"
  namespace: dexorder-agents
  labels:
    dexorder.io/user-id: "{{userId}}"
    dexorder.io/license-tier: pro
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 10Gi
  storageClassName: "{{storageClass}}"
---
apiVersion: v1
kind: Service
metadata:
  name: "{{serviceName}}"
  namespace: dexorder-agents
  labels:
    dexorder.io/user-id: "{{userId}}"
    dexorder.io/license-tier: pro
spec:
  type: ClusterIP
  selector:
    dexorder.io/user-id: "{{userId}}"
  ports:
    - name: mcp
      port: 3000
      targetPort: mcp
      protocol: TCP
    - name: zmq-control
      port: 5555
      targetPort: zmq-control
      protocol: TCP

216
gateway/src/llm/provider.ts Normal file
View File

@@ -0,0 +1,216 @@
import type { BaseChatModel } from '@langchain/core/language_models/chat_models';
import { ChatAnthropic } from '@langchain/anthropic';
import { ChatOpenAI } from '@langchain/openai';
import { ChatGoogleGenerativeAI } from '@langchain/google-genai';
import { ChatOpenRouter } from '@langchain/openrouter';
import type { FastifyBaseLogger } from 'fastify';
/**
 * Supported LLM providers
 */
export enum LLMProvider {
  ANTHROPIC = 'anthropic',
  OPENAI = 'openai',
  GOOGLE = 'google',
  OPENROUTER = 'openrouter',
}
/**
 * Model configuration
 *
 * Identifies a provider-specific model plus optional sampling parameters.
 * The factory falls back to temperature 0.7 / maxTokens 4096 when omitted.
 */
export interface ModelConfig {
  provider: LLMProvider;
  model: string;
  temperature?: number;
  maxTokens?: number;
}
/**
 * Provider configuration with API keys
 *
 * Every key is optional; the factory throws at model-creation time when the
 * requested provider's key is missing.
 */
export interface ProviderConfig {
  anthropicApiKey?: string;
  openaiApiKey?: string;
  googleApiKey?: string;
  openrouterApiKey?: string;
}
/**
 * LLM Provider factory
 *
 * Builds chat-model instances behind a single createModel() entry point so
 * callers never touch provider-specific constructors directly.
 */
export class LLMProviderFactory {
  private config: ProviderConfig;
  private logger: FastifyBaseLogger;
  constructor(config: ProviderConfig, logger: FastifyBaseLogger) {
    this.config = config;
    this.logger = logger;
  }
  /**
   * Create a chat model instance for the requested provider.
   *
   * @throws when the provider is unknown or its API key is not configured.
   */
  createModel(modelConfig: ModelConfig): BaseChatModel {
    this.logger.debug(
      { provider: modelConfig.provider, model: modelConfig.model },
      'Creating LLM model'
    );
    if (modelConfig.provider === LLMProvider.ANTHROPIC) {
      return this.buildAnthropic(modelConfig);
    }
    if (modelConfig.provider === LLMProvider.OPENAI) {
      return this.buildOpenAI(modelConfig);
    }
    if (modelConfig.provider === LLMProvider.GOOGLE) {
      return this.buildGoogle(modelConfig);
    }
    if (modelConfig.provider === LLMProvider.OPENROUTER) {
      return this.buildOpenRouter(modelConfig);
    }
    throw new Error(`Unsupported provider: ${modelConfig.provider}`);
  }
  /** Instantiate an Anthropic Claude chat model. */
  private buildAnthropic(cfg: ModelConfig): ChatAnthropic {
    const key = this.config.anthropicApiKey;
    if (!key) {
      throw new Error('Anthropic API key not configured');
    }
    return new ChatAnthropic({
      model: cfg.model,
      temperature: cfg.temperature ?? 0.7,
      maxTokens: cfg.maxTokens ?? 4096,
      anthropicApiKey: key,
    });
  }
  /** Instantiate an OpenAI GPT chat model. */
  private buildOpenAI(cfg: ModelConfig): ChatOpenAI {
    const key = this.config.openaiApiKey;
    if (!key) {
      throw new Error('OpenAI API key not configured');
    }
    return new ChatOpenAI({
      model: cfg.model,
      temperature: cfg.temperature ?? 0.7,
      maxTokens: cfg.maxTokens ?? 4096,
      openAIApiKey: key,
    });
  }
  /** Instantiate a Google Gemini chat model. */
  private buildGoogle(cfg: ModelConfig): ChatGoogleGenerativeAI {
    const key = this.config.googleApiKey;
    if (!key) {
      throw new Error('Google API key not configured');
    }
    return new ChatGoogleGenerativeAI({
      model: cfg.model,
      temperature: cfg.temperature ?? 0.7,
      maxOutputTokens: cfg.maxTokens ?? 4096,
      apiKey: key,
    });
  }
  /** Instantiate an OpenRouter chat model (access to 300+ models). */
  private buildOpenRouter(cfg: ModelConfig): ChatOpenRouter {
    const key = this.config.openrouterApiKey;
    if (!key) {
      throw new Error('OpenRouter API key not configured');
    }
    return new ChatOpenRouter({
      model: cfg.model,
      temperature: cfg.temperature ?? 0.7,
      maxTokens: cfg.maxTokens ?? 4096,
      apiKey: key,
    });
  }
  /**
   * Pick a default model from whichever provider has a key configured,
   * in priority order: Anthropic, OpenAI, Google, OpenRouter.
   *
   * @throws when no provider API key is configured at all.
   */
  getDefaultModel(): ModelConfig {
    const candidates: Array<[string | undefined, ModelConfig]> = [
      [this.config.anthropicApiKey, { provider: LLMProvider.ANTHROPIC, model: 'claude-3-5-sonnet-20241022' }],
      [this.config.openaiApiKey, { provider: LLMProvider.OPENAI, model: 'gpt-4o' }],
      [this.config.googleApiKey, { provider: LLMProvider.GOOGLE, model: 'gemini-2.0-flash-exp' }],
      [this.config.openrouterApiKey, { provider: LLMProvider.OPENROUTER, model: 'anthropic/claude-3.5-sonnet' }],
    ];
    for (const [key, model] of candidates) {
      if (key) {
        return model;
      }
    }
    throw new Error('No LLM API keys configured');
  }
}
/**
 * Predefined model configurations
 *
 * Convenience constants for the most common provider/model pairs; callers can
 * also construct ModelConfig values ad hoc (e.g. for OpenRouter model slugs).
 */
export const MODELS = {
  // Anthropic
  CLAUDE_SONNET: {
    provider: LLMProvider.ANTHROPIC,
    model: 'claude-3-5-sonnet-20241022',
  },
  CLAUDE_HAIKU: {
    provider: LLMProvider.ANTHROPIC,
    model: 'claude-3-5-haiku-20241022',
  },
  CLAUDE_OPUS: {
    provider: LLMProvider.ANTHROPIC,
    model: 'claude-3-opus-20240229',
  },
  // OpenAI
  GPT4O: {
    provider: LLMProvider.OPENAI,
    model: 'gpt-4o',
  },
  GPT4O_MINI: {
    provider: LLMProvider.OPENAI,
    model: 'gpt-4o-mini',
  },
  // Google
  GEMINI_2_FLASH: {
    provider: LLMProvider.GOOGLE,
    model: 'gemini-2.0-flash-exp',
  },
  GEMINI_PRO: {
    provider: LLMProvider.GOOGLE,
    model: 'gemini-1.5-pro',
  },
} as const satisfies Record<string, ModelConfig>;

202
gateway/src/llm/router.ts Normal file
View File

@@ -0,0 +1,202 @@
import type { BaseChatModel } from '@langchain/core/language_models/chat_models';
import type { FastifyBaseLogger } from 'fastify';
import { LLMProviderFactory, type ModelConfig, LLMProvider } from './provider.js';
import type { UserLicense } from '../types/user.js';
/**
 * Model routing strategies
 *
 * Accepted by ModelRouter.route(); USER_PREFERENCE is its default.
 */
export enum RoutingStrategy {
  /** Use user's preferred model from license */
  USER_PREFERENCE = 'user_preference',
  /** Route based on query complexity */
  COMPLEXITY = 'complexity',
  /** Route based on license tier */
  LICENSE_TIER = 'license_tier',
  /** Use cheapest available model */
  COST_OPTIMIZED = 'cost_optimized',
}
/**
 * Model router
 * Intelligently selects which model to use based on various factors
 */
export class ModelRouter {
  private factory: LLMProviderFactory;
  private logger: FastifyBaseLogger;
  // Fallback configuration resolved once at construction from available API keys.
  private defaultModel: ModelConfig;
  constructor(factory: LLMProviderFactory, logger: FastifyBaseLogger) {
    this.factory = factory;
    this.logger = logger;
    this.defaultModel = factory.getDefaultModel();
  }
  /**
   * Route to appropriate model based on context
   *
   * Resolves a ModelConfig via the chosen strategy, logs the decision, and
   * returns an instantiated chat model from the factory.
   */
  async route(
    message: string,
    license: UserLicense,
    strategy: RoutingStrategy = RoutingStrategy.USER_PREFERENCE
  ): Promise<BaseChatModel> {
    let modelConfig: ModelConfig;
    switch (strategy) {
      case RoutingStrategy.USER_PREFERENCE:
        modelConfig = this.routeByUserPreference(license);
        break;
      case RoutingStrategy.COMPLEXITY:
        modelConfig = this.routeByComplexity(message, license);
        break;
      case RoutingStrategy.LICENSE_TIER:
        modelConfig = this.routeByLicenseTier(license);
        break;
      case RoutingStrategy.COST_OPTIMIZED:
        modelConfig = this.routeByCost(license);
        break;
      default:
        modelConfig = this.defaultModel;
    }
    this.logger.info(
      {
        userId: license.userId,
        strategy,
        provider: modelConfig.provider,
        model: modelConfig.model,
      },
      'Routing to model'
    );
    return this.factory.createModel(modelConfig);
  }
  /**
   * Route based on user's preferred model (if set in license)
   */
  private routeByUserPreference(license: UserLicense): ModelConfig {
    // Check if user has custom model preference
    // NOTE(review): UserLicense already declares an optional `preferredModel`
    // (see types/user.ts); the `as any` cast presumably bridges the
    // zod-inferred literal-union provider type to the LLMProvider enum —
    // consider a typed conversion instead of erasing the type.
    const preferredModel = (license as any).preferredModel as ModelConfig | undefined;
    if (preferredModel && this.isModelAllowed(preferredModel, license)) {
      return preferredModel;
    }
    // Fall back to license tier default
    return this.routeByLicenseTier(license);
  }
  /**
   * Route based on query complexity
   *
   * Higher license tiers get stronger models for complex queries; free tier
   * always gets the efficient default regardless of complexity.
   */
  private routeByComplexity(message: string, license: UserLicense): ModelConfig {
    const isComplex = this.isComplexQuery(message);
    if (license.licenseType === 'enterprise') {
      // Enterprise users get best models for complex queries
      return isComplex
        ? { provider: LLMProvider.ANTHROPIC, model: 'claude-3-opus-20240229' }
        : { provider: LLMProvider.ANTHROPIC, model: 'claude-3-5-sonnet-20241022' };
    }
    if (license.licenseType === 'pro') {
      // Pro users get good models
      return isComplex
        ? { provider: LLMProvider.ANTHROPIC, model: 'claude-3-5-sonnet-20241022' }
        : { provider: LLMProvider.OPENAI, model: 'gpt-4o-mini' };
    }
    // Free users get efficient models
    return { provider: LLMProvider.GOOGLE, model: 'gemini-2.0-flash-exp' };
  }
  /**
   * Route based on license tier
   */
  private routeByLicenseTier(license: UserLicense): ModelConfig {
    switch (license.licenseType) {
      case 'enterprise':
        return { provider: LLMProvider.ANTHROPIC, model: 'claude-3-5-sonnet-20241022' };
      case 'pro':
        return { provider: LLMProvider.OPENAI, model: 'gpt-4o' };
      case 'free':
        return { provider: LLMProvider.GOOGLE, model: 'gemini-2.0-flash-exp' };
      default:
        return this.defaultModel;
    }
  }
  /**
   * Route to cheapest available model
   */
  private routeByCost(license: UserLicense): ModelConfig {
    // Free tier: use cheapest
    if (license.licenseType === 'free') {
      return { provider: LLMProvider.GOOGLE, model: 'gemini-2.0-flash-exp' };
    }
    // Paid tiers: use GPT-4o-mini for cost efficiency
    return { provider: LLMProvider.OPENAI, model: 'gpt-4o-mini' };
  }
  /**
   * Check if model is allowed for user's license
   *
   * Compared by model identifier only; the provider field is not checked.
   */
  private isModelAllowed(model: ModelConfig, license: UserLicense): boolean {
    // Free tier: only cheap models
    if (license.licenseType === 'free') {
      const allowedModels = ['gemini-2.0-flash-exp', 'gpt-4o-mini', 'claude-3-5-haiku-20241022'];
      return allowedModels.includes(model.model);
    }
    // Pro: all except Opus
    if (license.licenseType === 'pro') {
      const blockedModels = ['claude-3-opus-20240229'];
      return !blockedModels.includes(model.model);
    }
    // Enterprise: all models allowed
    return true;
  }
  /**
   * Determine if query is complex
   *
   * Heuristic: case-insensitive substring markers, plus a precomputed boolean
   * entry for message length — the array deliberately mixes strings and
   * booleans and the `some` callback branches on the element type.
   */
  private isComplexQuery(message: string): boolean {
    const complexityIndicators = [
      // Multi-step analysis
      'backtest',
      'analyze',
      'compare',
      'optimize',
      // Code generation
      'write',
      'create',
      'implement',
      'build',
      // Deep reasoning
      'explain why',
      'what if',
      'how would',
      // Long messages (> 200 chars likely complex)
      message.length > 200,
    ];
    const messageLower = message.toLowerCase();
    return complexityIndicators.some((indicator) =>
      typeof indicator === 'string' ? messageLower.includes(indicator) : indicator
    );
  }
}

154
gateway/src/main.ts Normal file
View File

@@ -0,0 +1,154 @@
import Fastify from 'fastify';
import websocket from '@fastify/websocket';
import cors from '@fastify/cors';
import { UserService } from './db/user-service.js';
import { Authenticator } from './auth/authenticator.js';
import { WebSocketHandler } from './channels/websocket-handler.js';
import { TelegramHandler } from './channels/telegram-handler.js';
import { KubernetesClient } from './k8s/client.js';
import { ContainerManager } from './k8s/container-manager.js';
// Fastify instance with pretty-printed structured logging (pino-pretty).
const app = Fastify({
  logger: {
    level: process.env.LOG_LEVEL || 'info',
    transport: {
      target: 'pino-pretty',
      options: {
        colorize: true,
        translateTime: 'HH:MM:ss Z',
        ignore: 'pid,hostname',
      },
    },
  },
});
// Configuration from environment
const config = {
  port: parseInt(process.env.PORT || '3000'),
  host: process.env.HOST || '0.0.0.0',
  databaseUrl: process.env.DATABASE_URL || 'postgresql://localhost/dexorder',
  // LLM provider API keys
  providerConfig: {
    anthropicApiKey: process.env.ANTHROPIC_API_KEY,
    openaiApiKey: process.env.OPENAI_API_KEY,
    googleApiKey: process.env.GOOGLE_API_KEY,
    openrouterApiKey: process.env.OPENROUTER_API_KEY,
  },
  // NOTE(review): defaults to '' — confirm TelegramHandler tolerates an empty token.
  telegramBotToken: process.env.TELEGRAM_BOT_TOKEN || '',
  // Kubernetes configuration
  kubernetes: {
    namespace: process.env.KUBERNETES_NAMESPACE || 'dexorder-agents',
    inCluster: process.env.KUBERNETES_IN_CLUSTER === 'true',
    context: process.env.KUBERNETES_CONTEXT,
    agentImage: process.env.AGENT_IMAGE || 'ghcr.io/dexorder/agent:latest',
    sidecarImage: process.env.SIDECAR_IMAGE || 'ghcr.io/dexorder/lifecycle-sidecar:latest',
    storageClass: process.env.AGENT_STORAGE_CLASS || 'standard',
  },
};
// Validate at least one LLM provider is configured
const hasAnyProvider = Object.values(config.providerConfig).some(key => !!key);
if (!hasAnyProvider) {
  app.log.error('At least one LLM provider API key is required (ANTHROPIC_API_KEY, OPENAI_API_KEY, GOOGLE_API_KEY, or OPENROUTER_API_KEY)');
  process.exit(1);
}
// Register plugins (must happen before route registration below)
await app.register(cors, {
  origin: process.env.CORS_ORIGIN || '*',
});
await app.register(websocket, {
  options: {
    maxPayload: 1024 * 1024, // 1MB
  },
});
// Initialize services
const userService = new UserService(config.databaseUrl);
// Initialize Kubernetes client and container manager
const k8sClient = new KubernetesClient({
  namespace: config.kubernetes.namespace,
  inCluster: config.kubernetes.inCluster,
  context: config.kubernetes.context,
  logger: app.log,
});
const containerManager = new ContainerManager({
  k8sClient,
  agentImage: config.kubernetes.agentImage,
  sidecarImage: config.kubernetes.sidecarImage,
  storageClass: config.kubernetes.storageClass,
  namespace: config.kubernetes.namespace,
  logger: app.log,
});
// Authenticator resolves users and ensures their agent container on login.
const authenticator = new Authenticator({
  userService,
  containerManager,
  logger: app.log,
});
// Initialize channel handlers
const websocketHandler = new WebSocketHandler({
  authenticator,
  providerConfig: config.providerConfig,
});
const telegramHandler = new TelegramHandler({
  authenticator,
  providerConfig: config.providerConfig,
  telegramBotToken: config.telegramBotToken,
});
// Register routes
websocketHandler.register(app);
telegramHandler.register(app);
// Health check
app.get('/health', async () => {
  return {
    status: 'ok',
    timestamp: new Date().toISOString(),
  };
});
// Graceful shutdown: close the DB pool, then the HTTP server.
const shutdown = async () => {
  app.log.info('Shutting down gracefully...');
  try {
    await userService.close();
    await app.close();
    app.log.info('Shutdown complete');
    process.exit(0);
  } catch (error) {
    app.log.error({ error }, 'Error during shutdown');
    process.exit(1);
  }
};
process.on('SIGTERM', shutdown);
process.on('SIGINT', shutdown);
// Start server
try {
  await app.listen({
    port: config.port,
    host: config.host,
  });
  app.log.info(
    {
      port: config.port,
      host: config.host,
    },
    'Gateway server started'
  );
} catch (error) {
  app.log.error({ error }, 'Failed to start server');
  process.exit(1);
}
View File

@@ -0,0 +1,37 @@
import { z } from 'zod';
/**
 * Inbound user message from any channel
 */
export const InboundMessageSchema = z.object({
  messageId: z.string(),
  userId: z.string(),
  sessionId: z.string(),
  content: z.string(),
  // Optional media/links attached by the user's channel client.
  attachments: z.array(z.object({
    type: z.enum(['image', 'file', 'url']),
    url: z.string(),
    mimeType: z.string().optional(),
  })).optional(),
  timestamp: z.date(),
});
export type InboundMessage = z.infer<typeof InboundMessageSchema>;
/**
 * Outbound response to channel
 */
export const OutboundMessageSchema = z.object({
  messageId: z.string(),
  sessionId: z.string(),
  content: z.string(),
  // Note: outbound attachment types ('chart') differ from inbound ('url').
  attachments: z.array(z.object({
    type: z.enum(['image', 'chart', 'file']),
    url: z.string(),
    caption: z.string().optional(),
  })).optional(),
  // Free-form channel/provider metadata; not validated beyond being a record.
  metadata: z.record(z.unknown()).optional(),
  timestamp: z.date(),
});
export type OutboundMessage = z.infer<typeof OutboundMessageSchema>;

View File

@@ -0,0 +1,101 @@
import { z } from 'zod';
/**
 * MCP Resource types for user context
 */
/**
 * Base resource structure from MCP server
 */
export const MCPResourceSchema = z.object({
  uri: z.string(),
  mimeType: z.string().optional(),
  // Exactly one of text/blob is expected in practice; both are optional here.
  text: z.string().optional(),
  blob: z.string().optional(), // base64 encoded
});
export type MCPResource = z.infer<typeof MCPResourceSchema>;
/**
 * User profile context
 */
export const UserProfileContextSchema = z.object({
  tradingExperience: z.enum(['beginner', 'intermediate', 'advanced', 'professional']),
  preferredTimeframes: z.array(z.string()),
  riskTolerance: z.enum(['low', 'medium', 'high']),
  tradingStyle: z.string(),
  favoriteIndicators: z.array(z.string()).optional(),
  activeTradingPairs: z.array(z.string()).optional(),
  notes: z.string().optional(),
});
export type UserProfileContext = z.infer<typeof UserProfileContextSchema>;
/**
 * Workspace state (current chart, positions, etc.)
 */
export const WorkspaceStateSchema = z.object({
  currentChart: z.object({
    ticker: z.string(),
    timeframe: z.string(),
    indicators: z.array(z.string()).optional(),
  }).optional(),
  watchlist: z.array(z.string()),
  openPositions: z.array(z.object({
    ticker: z.string(),
    side: z.enum(['long', 'short']),
    size: z.number(),
    entryPrice: z.number(),
    currentPrice: z.number().optional(),
    unrealizedPnL: z.number().optional(),
  })),
  recentAlerts: z.array(z.object({
    type: z.string(),
    message: z.string(),
    // Presumably ISO-8601; schema only requires a string — confirm producers.
    timestamp: z.string(),
  })).optional(),
});
export type WorkspaceState = z.infer<typeof WorkspaceStateSchema>;
/**
 * Standard context resource URIs
 */
export const CONTEXT_URIS = {
  USER_PROFILE: 'context://user-profile',
  CONVERSATION_SUMMARY: 'context://conversation-summary',
  WORKSPACE_STATE: 'context://workspace-state',
  SYSTEM_PROMPT: 'context://system-prompt',
} as const;
/**
 * Resource content interface
 *
 * Structural twin of MCPResourceSchema for call sites that don't need zod.
 */
export interface ResourceContent {
  uri: string;
  mimeType?: string;
  text?: string;
  blob?: string;
}
/**
 * Helper to parse resource content
 *
 * JSON resources are decoded and validated against the given schema; plain
 * text resources are validated directly. Returns null when the resource has
 * no text body, or when decoding/validation fails.
 */
export function parseResource<T>(resource: ResourceContent, schema: z.ZodSchema<T>): T | null {
  if (!resource.text) {
    return null;
  }
  try {
    // JSON payloads: decode first, then validate against the schema.
    if (resource.mimeType?.includes('json')) {
      const data = JSON.parse(resource.text);
      return schema.parse(data);
    }
    // Plain-text payloads were previously returned via an unchecked `as T`
    // cast, which silently lied about the type for non-string schemas.
    // Validate through the schema so callers always receive a genuine T.
    const result = schema.safeParse(resource.text);
    return result.success ? result.data : null;
  } catch {
    // Malformed JSON or schema violation — treat as absent content.
    return null;
  }
}

66
gateway/src/types/user.ts Normal file
View File

@@ -0,0 +1,66 @@
import { z } from 'zod';
/**
 * Model preference configuration
 */
export const ModelPreferenceSchema = z.object({
  provider: z.enum(['anthropic', 'openai', 'google', 'openrouter']),
  model: z.string(),
  temperature: z.number().optional(),
});
export type ModelPreference = z.infer<typeof ModelPreferenceSchema>;
/**
 * User license and feature authorization
 */
export const UserLicenseSchema = z.object({
  userId: z.string(),
  email: z.string().email().optional(),
  licenseType: z.enum(['free', 'pro', 'enterprise']),
  // Feature gates per license tier.
  features: z.object({
    maxIndicators: z.number(),
    maxStrategies: z.number(),
    maxBacktestDays: z.number(),
    realtimeData: z.boolean(),
    customExecutors: z.boolean(),
    apiAccess: z.boolean(),
  }),
  // Quantitative quotas enforced by the gateway.
  resourceLimits: z.object({
    maxConcurrentSessions: z.number(),
    maxMessagesPerDay: z.number(),
    maxTokensPerMessage: z.number(),
    rateLimitPerMinute: z.number(),
  }),
  mcpServerUrl: z.string().url(),
  // Consumed by ModelRouter's USER_PREFERENCE strategy.
  preferredModel: ModelPreferenceSchema.optional(),
  // Absent means the license does not expire.
  expiresAt: z.date().optional(),
  createdAt: z.date(),
  updatedAt: z.date(),
});
export type UserLicense = z.infer<typeof UserLicenseSchema>;
/**
 * Channel types for multi-channel support
 */
export enum ChannelType {
  WEBSOCKET = 'websocket',
  TELEGRAM = 'telegram',
  SLACK = 'slack',
  DISCORD = 'discord',
}
/**
 * Authentication context per channel
 */
export const AuthContextSchema = z.object({
  userId: z.string(),
  channelType: z.nativeEnum(ChannelType),
  channelUserId: z.string(), // Platform-specific ID (telegram_id, discord_id, etc)
  sessionId: z.string(),
  license: UserLicenseSchema,
  authenticatedAt: z.date(),
});
export type AuthContext = z.infer<typeof AuthContextSchema>;

View File

@@ -0,0 +1,253 @@
# LangGraph Workflows for Trading
Complex, stateful workflows built with LangGraph for trading-specific tasks.
## Overview
LangGraph provides:
- **Stateful execution**: Workflow state persists across failures
- **Conditional branching**: Route based on market conditions, backtest results, etc.
- **Human-in-the-loop**: Pause for user approval before executing trades
- **Loops & retries**: Backtest with different parameters, retry failed operations
- **Multi-agent**: Different LLMs for different tasks (analysis, risk, execution)
## Workflows
### Strategy Analysis (`strategy-analysis.ts`)
Multi-step pipeline for analyzing trading strategies:
```typescript
import { buildStrategyAnalysisWorkflow } from './workflows/strategy-analysis.js';
const workflow = buildStrategyAnalysisWorkflow(model, logger, mcpBacktestFn);
const result = await workflow.invoke({
strategyCode: userStrategy,
ticker: 'BTC/USDT',
timeframe: '1h',
});
console.log(result.recommendation); // Go/no-go decision
```
**Steps:**
1. **Code Review** - LLM analyzes strategy code for bugs, logic errors
2. **Backtest** - Runs backtest via user's MCP server
3. **Risk Assessment** - LLM evaluates results (drawdown, Sharpe, etc.)
4. **Human Approval** - Pauses for user review
5. **Recommendation** - Final go/no-go decision
**Benefits:**
- Stateful: Can resume if server restarts
- Human-in-the-loop: User must approve before deployment
- Multi-step reasoning: Each step builds on previous
---
## Future Workflows
### Market Scanner
Scan multiple tickers for trading opportunities:
```typescript
const scanner = buildMarketScannerWorkflow(model, logger);
const result = await scanner.invoke({
tickers: ['BTC/USDT', 'ETH/USDT', 'SOL/USDT'],
strategies: ['momentum', 'mean_reversion'],
timeframe: '1h',
});
// Returns ranked opportunities
```
**Steps:**
1. **Fetch Data** - Get OHLC for all tickers
2. **Apply Strategies** - Run each strategy on each ticker (parallel)
3. **Rank Signals** - Score by confidence, risk/reward
4. **Filter** - Apply user's risk limits
5. **Return Top N** - Best opportunities
---
### Portfolio Optimization
Optimize position sizing across multiple strategies:
```typescript
const optimizer = buildPortfolioOptimizerWorkflow(model, logger);
const result = await optimizer.invoke({
strategies: [strategy1, strategy2, strategy3],
totalCapital: 100000,
maxRiskPerTrade: 0.02,
});
// Returns optimal allocation
```
**Steps:**
1. **Backtest All** - Run backtests for each strategy
2. **Correlation Analysis** - Check strategy correlation
3. **Monte Carlo** - Simulate portfolio performance
4. **Optimize** - Find optimal weights (Sharpe maximization)
5. **Risk Check** - Validate against user limits
---
### Trade Execution Monitor
Monitor trade execution and adapt to market conditions:
```typescript
const monitor = buildTradeExecutionWorkflow(model, logger, exchange);
const result = await monitor.invoke({
tradeId: 'xyz',
targetPrice: 45000,
maxSlippage: 0.001,
timeLimit: 60, // seconds
});
```
**Steps:**
1. **Place Order** - Submit order to exchange
2. **Monitor Fill** - Check fill status every second
3. **Adapt** - If not filling, adjust price (within slippage)
4. **Retry Logic** - If rejected, retry with backoff
5. **Timeout** - Cancel if time limit exceeded
6. **Report** - Final execution report
---
## Using Workflows in Gateway
### Simple Chat vs Complex Workflow
```typescript
// gateway/src/orchestrator.ts
export class MessageOrchestrator {
async handleMessage(msg: InboundMessage) {
// Route based on complexity
if (this.isSimpleQuery(msg)) {
// Use agent harness for streaming chat
return this.harness.streamMessage(msg);
}
if (this.isWorkflowRequest(msg)) {
// Use LangGraph for complex analysis
return this.executeWorkflow(msg);
}
}
async executeWorkflow(msg: InboundMessage) {
const { type, params } = this.parseWorkflowRequest(msg);
switch (type) {
case 'analyze_strategy':
const workflow = buildStrategyAnalysisWorkflow(...);
return await workflow.invoke(params);
case 'scan_market':
const scanner = buildMarketScannerWorkflow(...);
return await scanner.invoke(params);
// ... more workflows
}
}
}
```
---
## Benefits for Trading
### vs Simple LLM Calls
| Scenario | Simple LLM | LangGraph Workflow |
|----------|-----------|-------------------|
| "What's the RSI?" | ✅ Fast, streaming | ❌ Overkill |
| "Analyze this strategy" | ❌ Limited context | ✅ Multi-step analysis |
| "Backtest 10 param combos" | ❌ No loops | ✅ Conditional loops |
| "Execute if approved" | ❌ No state | ✅ Human-in-the-loop |
| Server crashes mid-analysis | ❌ Lost progress | ✅ Resume from checkpoint |
### When to Use Workflows
**Use LangGraph when:**
- Multi-step analysis (backtest → risk → approval)
- Conditional logic (if bullish → momentum, else → mean-reversion)
- Human approval required (pause workflow)
- Loops needed (try different parameters)
- Long-running (can survive restarts)
**Use Agent Harness when:**
- Simple Q&A ("What is RSI?")
- Fast response needed (streaming chat)
- Single tool call ("Get my watchlist")
- Real-time interaction (Telegram, WebSocket)
---
## Implementation Notes
### State Persistence
LangGraph can persist state to database:
```typescript
import { MemorySaver } from '@langchain/langgraph';
const checkpointer = new MemorySaver();
const workflow = graph.compile({ checkpointer });
// Resume from checkpoint
const result = await workflow.invoke(input, {
configurable: { thread_id: 'user-123-strategy-analysis' }
});
```
### Human-in-the-Loop
Pause workflow for user input:
```typescript
const workflow = graph
.addNode('human_approval', humanApprovalNode)
.interrupt('human_approval'); // Pauses here
// User reviews in UI
const approved = await getUserApproval(workflowId);
// Resume workflow
await workflow.resume(state, { approved });
```
### Multi-Agent
Use different models for different tasks:
```typescript
const analysisModel = new ChatAnthropic({ model: 'claude-3-opus' }); // Smart
const codeModel = new ChatOpenAI({ model: 'gpt-4o' }); // Good at code
const cheapModel = new ChatOpenAI({ model: 'gpt-4o-mini' }); // Fast
const workflow = graph
.addNode('analyze', (state) => analysisModel.invoke(...))
.addNode('code_review', (state) => codeModel.invoke(...))
.addNode('summarize', (state) => cheapModel.invoke(...));
```
---
## Next Steps
1. Implement remaining workflows (scanner, optimizer, execution)
2. Add state persistence (PostgreSQL checkpointer)
3. Integrate human-in-the-loop with WebSocket
4. Add workflow monitoring dashboard
5. Performance optimization (parallel execution)

View File

@@ -0,0 +1,162 @@
import { StateGraph, Annotation } from '@langchain/langgraph';
import type { BaseChatModel } from '@langchain/core/language_models/chat_models';
import { HumanMessage, SystemMessage } from '@langchain/core/messages';
import type { FastifyBaseLogger } from 'fastify';
/**
 * State for strategy analysis workflow.
 *
 * Channels without a `default` (the three inputs) must be supplied by the
 * caller on `invoke()`; every derived channel starts as `null`/`false` and is
 * filled in by the corresponding workflow node.
 */
const StrategyAnalysisState = Annotation.Root({
  // Inputs supplied by the caller
  strategyCode: Annotation<string>(),
  ticker: Annotation<string>(),
  timeframe: Annotation<string>(),
  // Analysis steps — each node writes exactly one of these
  codeReview: Annotation<string | null>({
    default: () => null,
  }),
  backtestResults: Annotation<Record<string, unknown> | null>({
    default: () => null,
  }),
  riskAssessment: Annotation<string | null>({
    default: () => null,
  }),
  // Gate flag set by the human-approval node; false routes the graph to __end__
  humanApproved: Annotation<boolean>({
    default: () => false,
  }),
  // Final output of the recommendation node
  recommendation: Annotation<string | null>({
    default: () => null,
  }),
});

// Convenience alias for the inferred state shape used by every node below.
type StrategyAnalysisStateType = typeof StrategyAnalysisState.State;
/**
 * Build strategy analysis workflow using LangGraph.
 *
 * Workflow steps:
 * 1. Code review (LLM analyzes strategy code)
 * 2. Backtest (calls user's MCP backtest tool)
 * 3. Risk assessment (LLM evaluates results)
 * 4. Human approval (pause for user review)
 * 5. Final recommendation
 *
 * @param model - chat model used by every LLM node
 * @param logger - structured logger; one info line per workflow step
 * @param mcpBacktestFn - bridge to the user's MCP backtest tool
 * @returns a compiled graph; call `.invoke({ strategyCode, ticker, timeframe })`
 */
export function buildStrategyAnalysisWorkflow(
  model: BaseChatModel,
  logger: FastifyBaseLogger,
  mcpBacktestFn: (strategy: string, ticker: string, timeframe: string) => Promise<Record<string, unknown>>
) {
  // Node: Code Review — LLM critique of the raw strategy source.
  const codeReviewNode = async (state: StrategyAnalysisStateType) => {
    logger.info('Strategy workflow: Code review');
    const systemPrompt = `You are an expert trading strategy analyst.
Review the following strategy code for potential issues, bugs, or improvements.
Focus on: logic errors, edge cases, performance, and trading best practices.`;
    const response = await model.invoke([
      new SystemMessage(systemPrompt),
      new HumanMessage(`Review this strategy:\n\n${state.strategyCode}`),
    ]);
    return {
      codeReview: response.content as string,
    };
  };

  // Node: Backtest — delegates to the user's MCP tool; no LLM involved.
  const backtestNode = async (state: StrategyAnalysisStateType) => {
    logger.info('Strategy workflow: Running backtest');
    const results = await mcpBacktestFn(state.strategyCode, state.ticker, state.timeframe);
    return {
      backtestResults: results,
    };
  };

  // Node: Risk Assessment — LLM evaluates review + backtest output together.
  const riskAssessmentNode = async (state: StrategyAnalysisStateType) => {
    logger.info('Strategy workflow: Risk assessment');
    const systemPrompt = `You are a risk management expert for trading strategies.
Analyze the backtest results and provide a risk assessment.
Focus on: drawdown, win rate, Sharpe ratio, position sizing, and risk of ruin.`;
    const response = await model.invoke([
      new SystemMessage(systemPrompt),
      new HumanMessage(
        `Code review: ${state.codeReview}\n\nBacktest results: ${JSON.stringify(state.backtestResults, null, 2)}\n\nProvide risk assessment:`
      ),
    ]);
    return {
      riskAssessment: response.content as string,
    };
  };

  // Node: Human Approval (placeholder - would integrate with UI).
  // FIX: parameter renamed to `_state` — it is intentionally unused here, and
  // the gateway tsconfig sets "noUnusedParameters": true, so a bare `state`
  // parameter fails compilation. The underscore prefix opts out of that check.
  const humanApprovalNode = async (_state: StrategyAnalysisStateType) => {
    logger.info('Strategy workflow: Awaiting human approval');
    // In real implementation, this would pause and wait for user input
    // For now, auto-approve
    return {
      humanApproved: true,
    };
  };

  // Node: Final Recommendation — synthesizes everything into a go/no-go.
  const recommendationNode = async (state: StrategyAnalysisStateType) => {
    logger.info('Strategy workflow: Generating recommendation');
    const systemPrompt = `Provide a final recommendation on whether to deploy this trading strategy.
Summarize the code review, backtest results, and risk assessment.
Give clear go/no-go decision with reasoning.`;
    const response = await model.invoke([
      new SystemMessage(systemPrompt),
      new HumanMessage(
        `Code review: ${state.codeReview}\n\nBacktest: ${JSON.stringify(state.backtestResults)}\n\nRisk: ${state.riskAssessment}\n\nApproved: ${state.humanApproved}\n\nYour recommendation:`
      ),
    ]);
    return {
      recommendation: response.content as string,
    };
  };

  // Linear pipeline with one conditional gate: a rejected approval ends the
  // run without a recommendation (callers see `recommendation: null`).
  const workflow = new StateGraph(StrategyAnalysisState)
    .addNode('code_review', codeReviewNode)
    .addNode('backtest', backtestNode)
    .addNode('risk_assessment', riskAssessmentNode)
    .addNode('human_approval', humanApprovalNode)
    .addNode('recommendation', recommendationNode)
    .addEdge('__start__', 'code_review')
    .addEdge('code_review', 'backtest')
    .addEdge('backtest', 'risk_assessment')
    .addEdge('risk_assessment', 'human_approval')
    .addConditionalEdges('human_approval', (state) => {
      return state.humanApproved ? 'recommendation' : '__end__';
    })
    .addEdge('recommendation', '__end__');

  return workflow.compile();
}
/**
* Example usage:
*
* const workflow = buildStrategyAnalysisWorkflow(model, logger, mcpBacktestFn);
*
* const result = await workflow.invoke({
* strategyCode: "strategy code here",
* ticker: "BTC/USDT",
* timeframe: "1h",
* });
*
* console.log(result.recommendation);
*/

26
gateway/tsconfig.json Normal file
View File

@@ -0,0 +1,26 @@
{
"compilerOptions": {
"target": "ES2022",
"module": "ESNext",
"lib": ["ES2022"],
"moduleResolution": "bundler",
"resolveJsonModule": true,
"allowJs": false,
"outDir": "./dist",
"rootDir": "./src",
"strict": true,
"esModuleInterop": true,
"skipLibCheck": true,
"forceConsistentCasingInFileNames": true,
"declaration": true,
"declarationMap": true,
"sourceMap": true,
"noUnusedLocals": true,
"noUnusedParameters": true,
"noImplicitReturns": true,
"noFallthroughCasesInSwitch": true,
"allowSyntheticDefaultImports": true
},
"include": ["src/**/*"],
"exclude": ["node_modules", "dist"]
}

15
lifecycle-sidecar/.gitignore vendored Normal file
View File

@@ -0,0 +1,15 @@
# Binaries
lifecycle-sidecar
*.exe
*.dll
*.so
*.dylib
# Test binary
*.test
# Go workspace file
go.work
# Build output
dist/

View File

@@ -0,0 +1,40 @@
# Build stage
FROM golang:1.22-alpine AS builder

WORKDIR /app

# Build-time tooling: git for module fetches, CA certs for HTTPS proxies
RUN apk add --no-cache git ca-certificates

# Copy go mod files first so module downloads cache independently of source edits
COPY go.mod go.sum ./
RUN go mod download

# Copy source
COPY main.go ./

# Build a fully static binary. FIX: GOARCH was hard-coded to amd64, which
# silently produced non-runnable images on arm64 hosts/clusters. BuildKit
# (`docker buildx`) injects TARGETOS/TARGETARCH for each platform; the
# defaults keep classic `docker build` behavior identical (linux/amd64).
ARG TARGETOS=linux
ARG TARGETARCH=amd64
RUN CGO_ENABLED=0 GOOS=${TARGETOS} GOARCH=${TARGETARCH} go build \
    -ldflags="-w -s" \
    -o lifecycle-sidecar \
    main.go

# Runtime stage
FROM alpine:3.19

# Install procps for process monitoring (pgrep, kill)
RUN apk add --no-cache procps ca-certificates

# Create non-root user
RUN addgroup -g 1000 sidecar && \
    adduser -D -u 1000 -G sidecar sidecar

WORKDIR /app

# Copy binary from builder
COPY --from=builder /app/lifecycle-sidecar /app/lifecycle-sidecar

# Run as non-root
USER sidecar

ENTRYPOINT ["/app/lifecycle-sidecar"]

View File

@@ -0,0 +1,94 @@
# Lifecycle Sidecar
A lightweight Kubernetes sidecar that monitors the main agent container and handles cleanup when the container exits with a specific exit code indicating idle shutdown.
## Purpose
User agent containers self-manage their lifecycle by:
1. Tracking their own activity (MCP calls, trigger status)
2. Exiting with code `42` when idle (no triggers + no recent activity)
3. Delegating deployment cleanup to this sidecar
The sidecar watches the main container and:
- On exit code `42`: Deletes the deployment (and optionally PVC)
- On any other exit code: Allows Kubernetes restart policy to handle it
## Architecture
```
┌─────────────────────────────────────────────────┐
│ Pod │
│ ┌────────────────┐ ┌──────────────────┐ │
│ │ Agent Container│ │ Lifecycle Sidecar│ │
│ │ │ │ │ │
│ │ - Track activity │ - Monitor agent │ │
│ │ - Track triggers │ - Watch exit code│ │
│ │ - Exit 42 if idle │ - Delete if 42 │ │
│ └────────────────┘ └──────────────────┘ │
│ │ │ │
│ │ writes exit_code │ │
│ └─────────►/var/run/agent/exit_code │
│ │ │
└───────────────────────────────────┼─────────────┘
▼ k8s API
┌──────────────────────┐
│ Delete Deployment │
│ (+ PVC if anonymous)│
└──────────────────────┘
```
## Environment Variables
| Variable | Required | Description |
|----------|----------|-------------|
| `NAMESPACE` | Yes | Kubernetes namespace (injected via downward API) |
| `DEPLOYMENT_NAME` | Yes | Name of the deployment to delete (from pod label) |
| `USER_TYPE` | No | User license tier: `anonymous`, `free`, `paid`, `enterprise` |
| `MAIN_CONTAINER_PID` | No | PID of main container, for precise monitoring (requires `shareProcessNamespace: true` on the pod so the sidecar can probe it) |
## Exit Code Contract
The agent container uses exit codes to signal intent:
| Exit Code | Meaning | Sidecar Action |
|-----------|---------|----------------|
| `42` | Clean idle shutdown | Delete deployment + optional PVC |
| Any other | Error or normal restart | Allow Kubernetes to restart |
## RBAC Requirements
The sidecar requires a ServiceAccount with permission to delete its own deployment:
```yaml
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
rules:
- apiGroups: ["apps"]
resources: ["deployments"]
verbs: ["get", "delete"]
- apiGroups: [""]
resources: ["persistentvolumeclaims"]
verbs: ["get", "delete"]
```
See `deploy/k8s/base/lifecycle-sidecar-rbac.yaml` for the full RBAC configuration.
## Building
```bash
docker build -t ghcr.io/dexorder/lifecycle-sidecar:latest .
docker push ghcr.io/dexorder/lifecycle-sidecar:latest
```
## Example Usage
See `deploy/k8s/base/agent-deployment-example.yaml` for a complete example of how to configure an agent deployment with the lifecycle sidecar.
## Security Considerations
1. **Self-delete only**: The sidecar can only delete the deployment it's part of (enforced by label matching in admission policy)
2. **Non-privileged**: Runs as non-root user (UID 1000)
3. **Minimal permissions**: Only has `get` and `delete` on deployments/PVCs in the agents namespace
4. **No cross-namespace access**: Scoped to `dexorder-agents` namespace only
5. **Crash-safe**: Only triggers cleanup on exit code 42, never on crashes

16
lifecycle-sidecar/go.mod Normal file
View File

@@ -0,0 +1,16 @@
module github.com/dexorder/lifecycle-sidecar
go 1.22
require (
github.com/rs/zerolog v1.32.0
k8s.io/api v0.29.2
k8s.io/apimachinery v0.29.2
k8s.io/client-go v0.29.2
)
require (
github.com/mattn/go-colorable v0.1.13 // indirect
github.com/mattn/go-isatty v0.0.19 // indirect
golang.org/x/sys v0.17.0 // indirect
)

234
lifecycle-sidecar/main.go Normal file
View File

@@ -0,0 +1,234 @@
package main
import (
	"context"
	"fmt"
	"os"
	"os/exec"
	"os/signal"
	"strconv"
	"strings"
	"syscall"
	"time"

	"github.com/rs/zerolog"
	"github.com/rs/zerolog/log"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/rest"
)
const (
	// ExitCodeIdleShutdown is the agent's contractual "idle, clean me up"
	// signal; any other exit code defers to the Kubernetes restart policy.
	ExitCodeIdleShutdown = 42

	// PollInterval is how often the sidecar probes the agent process
	// for liveness while waiting for it to exit.
	PollInterval = 5 * time.Second
)
// main wires up logging and the in-cluster Kubernetes client, blocks until
// the agent container exits, and then either tears down the deployment
// (exit code 42) or propagates the agent's exit code so Kubernetes restarts it.
func main() {
	// Console logging with unix timestamps.
	zerolog.TimeFieldFormat = zerolog.TimeFormatUnix
	log.Logger = log.Output(zerolog.ConsoleWriter{Out: os.Stderr})
	log.Info().Msg("Lifecycle sidecar starting")

	// Required/optional configuration from the pod spec.
	ns := os.Getenv("NAMESPACE")
	deployment := os.Getenv("DEPLOYMENT_NAME")
	tier := os.Getenv("USER_TYPE")
	agentPID := os.Getenv("MAIN_CONTAINER_PID")
	if ns == "" || deployment == "" {
		log.Fatal().Msg("NAMESPACE and DEPLOYMENT_NAME environment variables are required")
	}
	log.Info().
		Str("namespace", ns).
		Str("deployment", deployment).
		Str("userType", tier).
		Str("mainPID", agentPID).
		Msg("Configuration loaded")

	// Build the API client up front so credential problems surface at startup,
	// not hours later when cleanup is needed.
	cfg, err := rest.InClusterConfig()
	if err != nil {
		log.Fatal().Err(err).Msg("Failed to get in-cluster config")
	}
	client, err := kubernetes.NewForConfig(cfg)
	if err != nil {
		log.Fatal().Err(err).Msg("Failed to create Kubernetes client")
	}

	// Block until the agent process disappears.
	code := waitForMainContainer()
	log.Info().Int("exitCode", code).Msg("Main container exited")

	// Anything other than the idle-shutdown code is Kubernetes' problem.
	if code != ExitCodeIdleShutdown {
		log.Info().
			Int("exitCode", code).
			Msg("Non-idle exit code - allowing Kubernetes to handle restart")
		os.Exit(code)
	}

	log.Info().Msg("Detected idle shutdown (exit code 42) - cleaning up deployment")
	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
	defer cancel()

	// Anonymous/temporary users get their storage reclaimed too.
	removePVC := tier == "anonymous" || tier == "temporary"
	if err := cleanupDeployment(ctx, client, ns, deployment, removePVC); err != nil {
		log.Error().Err(err).Msg("Failed to cleanup deployment")
		os.Exit(1)
	}
	log.Info().Msg("Cleanup complete - sidecar exiting")
	os.Exit(0)
}
// waitForMainContainer blocks until the main container's process is gone and
// returns the exit code recovered via getContainerExitCode.
func waitForMainContainer() int {
	// Prefer the explicit PID when the pod spec provides one
	// (requires a shared PID namespace).
	if pid := os.Getenv("MAIN_CONTAINER_PID"); pid != "" {
		return pollProcessExit(pid)
	}
	// Fallback: search for the agent process by executable name.
	log.Info().Msg("MAIN_CONTAINER_PID not set, polling for 'agent' process")
	return pollProcessByName("agent")
}
// pollProcessExit waits until the process with the given PID disappears,
// then returns the exit code recovered from the shared exit-code file.
func pollProcessExit(pidStr string) int {
	log.Info().Str("pid", pidStr).Msg("Monitoring main container process")

	ticker := time.NewTicker(PollInterval)
	defer ticker.Stop()
	for {
		// `kill -0` probes for existence without delivering a signal;
		// a non-nil error means the PID is no longer present.
		if err := exec.Command("kill", "-0", pidStr).Run(); err != nil {
			log.Info().Msg("Main container process exited")
			// The container runtime's exit code isn't visible from here,
			// so rely on the code the agent wrote to the shared volume.
			return getContainerExitCode()
		}
		<-ticker.C
	}
}
// pollProcessByName waits until no process with the given executable name
// remains, then returns the exit code recovered from the shared file.
func pollProcessByName(name string) int {
	log.Info().Str("name", name).Msg("Monitoring main container by name")

	ticker := time.NewTicker(PollInterval)
	defer ticker.Stop()
	for {
		// pgrep -x exits non-zero when no exact-name match exists.
		if err := exec.Command("pgrep", "-x", name).Run(); err != nil {
			log.Info().Msg("Main container process exited")
			return getContainerExitCode()
		}
		<-ticker.C
	}
}
// getContainerExitCode attempts to retrieve the exit code of the main container
// This is challenging in Kubernetes without direct access to container runtime
// We use a fallback approach: check a shared file or default to 0
func getContainerExitCode() int {
// Check if main container wrote exit code to shared volume
exitCodeFile := "/var/run/agent/exit_code"
data, err := os.ReadFile(exitCodeFile)
if err == nil {
var exitCode int
_, err := fmt.Sscanf(string(data), "%d", &exitCode)
if err == nil {
log.Info().Int("exitCode", exitCode).Msg("Read exit code from shared file")
return exitCode
}
}
// Default to 0 if we can't determine exit code
// This is safe because non-42 codes allow restart
log.Warn().Msg("Could not determine exit code, defaulting to 0")
return 0
}
// cleanupDeployment deletes the named deployment and, when deletePVC is set,
// the first PersistentVolumeClaim referenced by its pod template. A PVC
// deletion failure is logged but never fails the overall cleanup.
func cleanupDeployment(ctx context.Context, clientset *kubernetes.Clientset, namespace, deploymentName string, deletePVC bool) error {
	log.Info().
		Str("namespace", namespace).
		Str("deployment", deploymentName).
		Bool("deletePVC", deletePVC).
		Msg("Cleaning up deployment")

	// Resolve the PVC name from the deployment's volumes BEFORE deleting it;
	// afterwards the spec is gone. Lookup failure is non-fatal.
	pvcName := ""
	if deletePVC {
		dep, err := clientset.AppsV1().Deployments(namespace).Get(ctx, deploymentName, metav1.GetOptions{})
		if err != nil {
			log.Warn().Err(err).Msg("Could not get deployment for PVC lookup")
		} else {
			for _, vol := range dep.Spec.Template.Spec.Volumes {
				if vol.PersistentVolumeClaim != nil {
					pvcName = vol.PersistentVolumeClaim.ClaimName
					break
				}
			}
		}
	}

	// Foreground propagation: dependents (ReplicaSets, Pods) are removed
	// before the deployment object itself disappears.
	propagation := metav1.DeletePropagationForeground
	log.Info().Str("deployment", deploymentName).Msg("Deleting deployment")
	if err := clientset.AppsV1().Deployments(namespace).Delete(ctx, deploymentName, metav1.DeleteOptions{
		PropagationPolicy: &propagation,
	}); err != nil {
		return fmt.Errorf("failed to delete deployment: %w", err)
	}
	log.Info().Msg("Deployment deleted successfully")

	// Best-effort PVC removal.
	if deletePVC && pvcName != "" {
		log.Info().Str("pvc", pvcName).Msg("Deleting PVC")
		if err := clientset.CoreV1().PersistentVolumeClaims(namespace).Delete(ctx, pvcName, metav1.DeleteOptions{}); err != nil {
			log.Warn().Err(err).Str("pvc", pvcName).Msg("Failed to delete PVC (non-fatal)")
		} else {
			log.Info().Msg("PVC deleted successfully")
		}
	}

	return nil
}
func init() {
// Register signal handler for graceful shutdown
// If sidecar receives SIGTERM, just exit cleanly
// Don't trigger deployment deletion on sidecar termination
go func() {
sigChan := make(chan os.Signal, 1)
syscall.Signal(syscall.SIGTERM)
<-sigChan
log.Info().Msg("Received SIGTERM - sidecar exiting without cleanup")
os.Exit(0)
}()
}